Commit 00ddc741 authored by Tags's avatar Tags

add analysis:

1) Find empty tags.
2) Find very frequent words
3) Make word frequency distribution
4) Find uncommon words
parent 3984df1c
import numpy as np
from pprint import pprint
from news_classify_tag import Classifier
import re
import csv
from pprint import pprint
import numpy as np
def load():
......@@ -91,14 +91,74 @@ def save():
return True
def graph_results(in_data, title=''):
    """Display a 50-bin histogram of *in_data* in a new figure window.

    Parameters
    ----------
    in_data : array-like of numbers
        Values to histogram (e.g. per-word frequencies).
    title : str, optional
        Figure title; previously accepted but ignored — now shown.
    """
    # Local import: plotting backend is only needed when this is called.
    # matplotlib.pyplot replaces the deprecated `pylab` interface.
    from matplotlib import pyplot as plt
    data = np.asarray(in_data)
    plt.figure()
    plt.hist(data, bins=50)
    plt.title(title)  # fix: the title argument was never used before
    plt.show()
def text_clear(word):
    """Normalise *word*: lower-case it, then reduce it to its Porter2 stem."""
    from stemming.porter2 import stem
    return stem(word.lower())
if __name__ == '__main__':
    rows, columns, data_dictionary, tags, matrix = load()
    if data_dictionary is None:
        print("Something wrong with data files.")
        exit(1)

    np_matrix = np.asarray(matrix)

    # 1) Find "empty" tags: rows of the tag/word matrix that are all zeros.
    # Iterate indices in DESCENDING order and include index 0
    # (the old `range(columns-1, 0, -1)` never checked tag 0), so that
    # `del tags[i]` below does not shift indices still pending deletion.
    print("Empty taggers are :")
    tags_empty = [tag for tag in range(columns - 1, -1, -1)
                  if sum(np_matrix[tag]) == 0]
    print(len(tags_empty), "are empty of ", len(tags))

    # erase "empty" tags and their rows in the matrix
    for i in tags_empty:
        del tags[i]
        print(i, end=",")
    np_matrix = np.delete(np_matrix, tags_empty, axis=0)
    print('\n')
    print('Now tags are', len(tags))
    print(np_matrix.shape)

    # 2) Word frequency: transpose to words x tags, binarise counts so each
    # cell records only presence/absence of the word under a tag.
    np_transpose = np_matrix.transpose()
    print(np_transpose.shape)
    np_transpose[np_transpose != 0] = 1  # vectorised replacement of the element-wise loop

    # Number of distinct tags each dictionary word appears under.
    words_frequency = {}
    for row in range(np_transpose.shape[0]):
        words_frequency[data_dictionary[row]] = int(sum(np_transpose[row]))

    # 3) Frequency distribution graph.
    frequency = list(words_frequency.values())
    graph_results(frequency, "The Words Frequencies")

    # There are some words with great frequency (appear under > 20 tags).
    very_frequent_words = {key: value
                           for key, value in words_frequency.items()
                           if value > 20}
    print('There are ', len(very_frequent_words), ' very frequent words: ')
    print(very_frequent_words)

    # 4) Uncommon words: compare the extracted dictionary against a
    # reference word list ('large.txt', one word per line, stemmed the
    # same way as the news words).
    with open('large.txt', 'r') as reference:
        dict_standart = set(text_clear(word)
                            for word in reference.read().split('\n'))
    diff_dict_sets = set(data_dictionary) - dict_standart
    print('The difference of extracted dictionary and common dictionary is',
          len(diff_dict_sets))
    print(diff_dict_sets)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment