Commit 00ddc741 authored by Tags's avatar Tags

add analysis:

1) Find empty tags.
2) Find very frequent words
3) Make word frequency distribution
4) Find uncommon words
parent 3984df1c
import numpy as np
from pprint import pprint
from news_classify_tag import Classifier
import re
import csv
from pprint import pprint
import numpy as np
def load():
......@@ -91,14 +91,74 @@ def save():
return True
def graph_results(in_data, title=''):
    """Display a 50-bin histogram of *in_data* in a new figure window.

    Parameters
    ----------
    in_data : array-like of numbers
        Values to histogram (e.g. per-word frequencies).
    title : str, optional
        Figure title; previously accepted but ignored — now shown.
    """
    # Local import: plotting backend is only needed when this is called.
    # matplotlib.pyplot replaces the deprecated `pylab` interface.
    from matplotlib import pyplot as plt
    data = np.asarray(in_data)
    plt.figure()
    plt.hist(data, bins=50)
    plt.title(title)  # fix: the title argument was never used before
    plt.show()
def text_clear(word):
    """Normalise *word*: lower-case it, then reduce it to its Porter2 stem."""
    from stemming.porter2 import stem
    return stem(word.lower())
if __name__ == '__main__':
    rows, columns, data_dictionary, tags, matrix = load()
    if data_dictionary is None:
        print("Something wrong with data files.")
        exit(1)

    np_matrix = np.asarray(matrix)

    # 1) Find "empty" tags: rows of the tag/word matrix that are all zeros.
    # Iterate indices in DESCENDING order and include index 0
    # (the old `range(columns-1, 0, -1)` never checked tag 0), so that
    # `del tags[i]` below does not shift indices still pending deletion.
    print("Empty taggers are :")
    tags_empty = [tag for tag in range(columns - 1, -1, -1)
                  if sum(np_matrix[tag]) == 0]
    print(len(tags_empty), "are empty of ", len(tags))

    # erase "empty" tags and their rows in the matrix
    for i in tags_empty:
        del tags[i]
        print(i, end=",")
    np_matrix = np.delete(np_matrix, tags_empty, axis=0)
    print('\n')
    print('Now tags are', len(tags))
    print(np_matrix.shape)

    # 2) Word frequency: transpose to words x tags, binarise counts so each
    # cell records only presence/absence of the word under a tag.
    np_transpose = np_matrix.transpose()
    print(np_transpose.shape)
    np_transpose[np_transpose != 0] = 1  # vectorised replacement of the element-wise loop

    # Number of distinct tags each dictionary word appears under.
    words_frequency = {}
    for row in range(np_transpose.shape[0]):
        words_frequency[data_dictionary[row]] = int(sum(np_transpose[row]))

    # 3) Frequency distribution graph.
    frequency = list(words_frequency.values())
    graph_results(frequency, "The Words Frequencies")

    # There are some words with great frequency (appear under > 20 tags).
    very_frequent_words = {key: value
                           for key, value in words_frequency.items()
                           if value > 20}
    print('There are ', len(very_frequent_words), ' very frequent words: ')
    print(very_frequent_words)

    # 4) Uncommon words: compare the extracted dictionary against a
    # reference word list ('large.txt', one word per line, stemmed the
    # same way as the news words).
    with open('large.txt', 'r') as reference:
        dict_standart = set(text_clear(word)
                            for word in reference.read().split('\n'))
    diff_dict_sets = set(data_dictionary) - dict_standart
    print('The difference of extracted dictionary and common dictionary is',
          len(diff_dict_sets))
    print(diff_dict_sets)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment