Commit 07129d22 authored by Tags's avatar Tags

Add External dictionary to analysis model

parent 5a945d1a
def text_clear(word):
    """Normalize *word*: lowercase it and reduce it to its Porter2 stem.

    Returns the stemmed form as a string.
    """
    # Imported lazily so the module loads even without the `stemming` package.
    from stemming.porter2 import stem
    return stem(word.lower())
# import nltk
from nltk.stem.lancaster import LancasterStemmer
if __name__ == "__main__":
    # Compare the Porter2 stemmer (text_clear) against NLTK's Lancaster
    # stemmer on a small sample vocabulary (including a few misspellings).
    lancaster = LancasterStemmer()
    samples = ['little', 'long', 'longer', 'longest', 'family', 'trendline', 'unable', 'able', 'understable', "TomSoyr", 'largest', 'aumobiles' ]
    for word in samples:
        print(word, ':', text_clear(word), ":", lancaster.stem(word))
\ No newline at end of file
......@@ -128,15 +128,16 @@ if __name__ == '__main__':
np_matrix = np.delete(np_matrix, tags_empty, axis=0)
print('\n')
print('Now tags are', len(tags))
print(tags)
print(np_matrix.shape)
np_transpose = np_matrix.transpose()
print(np_transpose.shape)
# print(np_transpose.shape)
# print(np_transpose[2][25])
for i in range(np_transpose.shape[0]):
for j in range(np_transpose.shape[1]):
if np_transpose[i][j] != 0:
np_transpose[i][j] = 1
# print(i,j)
# print(i,j)
# words frequency measurement
words_frequency={}
......@@ -144,21 +145,32 @@ if __name__ == '__main__':
# print(row, sum(np_transpose[row]), data_dictionary[row])
words_frequency[data_dictionary[row]]= (int(sum(np_transpose[row])))
# print(words_frequency)
frequency = list(words_frequency.values())
# !!!!graph results
graph_results(frequency, "The Words Frequencies")
frequency = list(words_frequency.values())
# graph_results(frequency, "The Words Frequencies")
# there are some words with a great frequency
very_frequent_words = dict([(key, value) for key, value in words_frequency.items() if value > 20])
print('There are ', len(very_frequent_words), ' very frequent words: ')
print(very_frequent_words)
for item in sorted(very_frequent_words, key=very_frequent_words.get, reverse=True):
print(item, ":", very_frequent_words[item], end=", ")
# compare words set from news to words of DICTIONARY
dict_standart = set([text_clear(word) for word in open('large.txt', 'r').read().split('\n')])
dict_standart1 = set([word for word in open('large.txt', 'r').read().split('\n')])
# print(len(dict_standart), dict_standart)
diff_dict_sets = set(data_dictionary) - dict_standart
print('The difference of extracted dictionary and common dictionary is', len(diff_dict_sets))
print('\nThe difference of extracted dictionary and common dictionary is', len(diff_dict_sets))
print(diff_dict_sets)
diff_dict_sets1 = set(data_dictionary) - dict_standart1
print ('\n2\nThe difference of extracted dictionary and common dictionary is', len (diff_dict_sets1))
print (diff_dict_sets1)
print("And the diff of sets are: ", len(diff_dict_sets1 ^ diff_dict_sets))
print(diff_dict_sets1 ^ diff_dict_sets)
diff_diff_dictionary= set(data_dictionary) - dict_standart - dict_standart1
print("If double diffs to do we get: ", len(diff_diff_dictionary))
print(diff_diff_dictionary)
\ No newline at end of file
......@@ -26,6 +26,13 @@ class Classifier:
self.train_data = corpus
return
# load the external dictionary of correct words
file_name_dictionary = 'large.txt'
file_dict = open(file_name_dictionary, "r")
self.correct_words = set()
for word in file_dict:
self.correct_words.add(word[:-1])
# Extract data form DataBase
self.db = _mysql.connect(host=host, port=port, user=user, passwd=password, db=db)
......@@ -75,7 +82,7 @@ class Classifier:
texts = re.sub(r'\d', " ", texts)
texts = texts.split(" ")
texts = [word.lower() for word in texts]
texts = [word for word in texts if len(word) > 3 and word not in stopwords]
texts = [word for word in texts if (len(word) > 1) and (word not in stopwords) and (word in self.correct_words)]
# texts = [word for word in texts if word not in stopwords]
texts = [stem(word) for word in texts]
texts = " ".join(texts)
......@@ -284,7 +291,6 @@ if __name__ == "__main__":
# tag = int(tag)
# # print(tag)
# print(str(tag) + " " + tags_classif.tags[tag - 1][1].decode('ascii', 'ignore'))
#
# exit(0)
tags_classif.db.query("SELECT title, description, text FROM wp_esi_news_accept ORDER BY RAND() LIMIT 25")
result = tags_classif.db.store_result()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment