Commit 6c4536c4 authored by Tags

Method: cross-validation

parent 7a5d8e24
@@ -92,7 +92,7 @@ def save():
 def graph_results(in_data, title=''):
     # import numpy as np
-    from pylab import figure, hist, savefig
+    from pylab import figure, hist, savefig, show
     data = np.asarray(in_data)
     figure()
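The amended import pulls in pylab's show(), which suggests the histogram is now displayed interactively as well as written to disk. A minimal sketch of such a helper, with the function name, bin count and output file name chosen purely for illustration:

import numpy as np
from pylab import figure, hist, savefig, show

def plot_histogram(in_data, title='results'):
    # Illustrative stand-in for graph_results(): histogram the data,
    # save the figure, then open an interactive window.
    data = np.asarray(in_data)
    figure()
    hist(data, bins=20)        # bin count is arbitrary for this sketch
    savefig(title + '.png')    # persist the plot to disk
    show()                     # the newly imported show() displays it

plot_histogram([0.2, 0.4, 0.4, 0.7, 0.9], title='accuracy_scores')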
@@ -152,11 +152,16 @@ if __name__ == '__main__':
     # there are some words with great frequency
-    very_frequent_words = dict([(key, value) for key, value in words_frequency.items() if value > 20])
+    very_frequent_words = dict([(key, value) for key, value in words_frequency.items() if value > 100])
     print('There are ', len(very_frequent_words), ' very frequent words: ')
     for item in sorted(very_frequent_words, key=very_frequent_words.get, reverse=True):
         print(item, ":", very_frequent_words[item], end=", ")
+    un_frequent_words = dict([(key, value) for key, value in words_frequency.items() if value < 10])
+    print('\nThere are ', len(un_frequent_words), ' very infrequent words: ')
+    for item in sorted(un_frequent_words, key=un_frequent_words.get, reverse=True):
+        print(item, ":", un_frequent_words[item], end=", ")
     # compare the set of words from the news against the reference DICTIONARY
     dict_standart = set([text_clear(word) for word in open('large.txt', 'r').read().split('\n')])
+    dict_standart1 = set([word for word in open('large.txt', 'r').read().split('\n')])
......
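The added block splits the frequency counter at the new thresholds and loads a reference word list (large.txt) so the news vocabulary can be compared against dictionary words. A standalone sketch of that comparison, assuming one word per line in the file and a words_frequency counter built from the news texts; a simple lower() stands in for the script's text_clear() helper:

from collections import Counter

# Stand-in corpus; in the script words_frequency is built from the news articles.
words_frequency = Counter("the cat sat on the mat while the dog slept".split())

very_frequent_words = {w: c for w, c in words_frequency.items() if c > 100}
un_frequent_words = {w: c for w, c in words_frequency.items() if c < 10}

# Compare the news vocabulary against the reference dictionary.
with open('large.txt', 'r') as f:
    dict_standart = set(word.strip().lower() for word in f.read().split('\n'))

out_of_dictionary = set(words_frequency) - dict_standart
print(len(out_of_dictionary), 'corpus words are missing from the reference dictionary')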
@@ -6,8 +6,6 @@ import numpy as np
 import _mysql
-# test submodule commit
-# noinspection PyUnresolvedReferences
 class Classifier:
     def __init__(self, corpus=None, host='176.58.117.151', user='esi', password='esi12345', db='esi', port=3306):
@@ -120,7 +118,7 @@ class Classifier:
         texts = [word.lower() for word in texts]
         texts = [word for word in texts if word not in stopwords]
         # Optionally also filter out incorrect (out-of-dictionary) words:
-        # and word in self.correct_words]
+        # texts = [word for word in texts if word not in stopwords and word in self.correct_words]
         texts = [stem(word) for word in texts]
         texts = " ".join(texts)
         return texts
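Taken together, the method lowercases the tokens, drops stopwords, stems what is left and re-joins the words into one string. A self-contained sketch of the same normalisation, using NLTK's stopword list and Porter stemmer as stand-ins for the module-level stopwords and stem used in the class:

from nltk.corpus import stopwords as nltk_stopwords   # requires nltk.download('stopwords') once
from nltk.stem import PorterStemmer

_stopwords = set(nltk_stopwords.words('english'))
_stemmer = PorterStemmer()

def normalise(tokens):
    # Lowercase, drop stopwords, stem, and join back into a single string.
    tokens = [word.lower() for word in tokens]
    tokens = [word for word in tokens if word not in _stopwords]
    tokens = [_stemmer.stem(word) for word in tokens]
    return " ".join(tokens)

print(normalise(["The", "markets", "were", "falling", "sharply"]))  # -> "market fall sharpli"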
@@ -130,11 +128,12 @@ class Classifier:
         from sklearn.pipeline import Pipeline
         from sklearn.naive_bayes import MultinomialNB
         from sklearn.neighbors import KNeighborsClassifier
+        from sklearn.neural_network import MLPClassifier
         self.vectorizer = CountVectorizer(min_df=2, stop_words='english', ngram_range=(1, 1))
         self.tfidf = TfidfTransformer()
         # self.classifier = MultinomialNB()
-        self.classifier = KNeighborsClassifier()
+        # self.classifier = KNeighborsClassifier()
+        self.classifier = MLPClassifier()
         # self.classifier = Pipeline([
         #     ('vect', TfidfVectorizer(stop_words='english')),
         #     ('clf', MultinomialNB()),
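The vectoriser, the TF-IDF transform and the classifier are kept as separate attributes here; the commented-out block hints at the equivalent scikit-learn Pipeline. For illustration only, the same chain with the newly chosen MLPClassifier on a toy corpus (the texts, labels, min_df=1, max_iter and random_state below are made up so the tiny example runs; the class itself uses min_df=2):

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline

model = Pipeline([
    ('vect', CountVectorizer(min_df=1, stop_words='english', ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer()),
    ('clf', MLPClassifier(max_iter=500, random_state=0)),  # extra iterations so the toy data converges
])

texts = ["stocks rally on strong earnings",
         "team wins the championship game",
         "markets fall as interest rates rise",
         "player scores twice in the final match"]
labels = ["finance", "sport", "finance", "sport"]

model.fit(texts, labels)
print(model.predict(["rates and stocks move the markets"]))  # expected: ['finance']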
@@ -332,8 +331,8 @@ def cross_validation():
         if len(rez_accordance & tags_in_article) != 0:
             total_score += 1
         else:
-            print("\n#", str(i))
-            print("\nNews title: " + title.decode('ascii', 'ignore'))
+            print("\n\n#", str(i))
+            print("News title: " + title.decode('ascii', 'ignore'))
             print("Model calculated Accordance :", end=" ")
             for item_tag in tags_classif.tags:
                 if int(item_tag[0]) in rez_accordance:
@@ -345,7 +344,7 @@ def cross_validation():
            print(item_tag[0], item_tag[1].decode('ascii', 'ignore'), end='; ')
     print("\n\nThe Model was tested on ", len(news_results), " news items. ")
-    print("The total result is:", total_score / len(news_results))
+    print("The total accuracy is:", total_score / len(news_results))
     # exit(0)
......
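The renamed print matches what the loop actually computes: the share of test articles whose predicted tag set overlaps the tags attached to the article. A stripped-down sketch of that scoring rule, on hypothetical predictions and ground-truth tag id sets:

# Hypothetical model output: predicted tag ids per article, paired with the true tag ids.
predictions = [{1, 4}, {2}, {7}, {3, 5}]
ground_truth = [{4}, {2, 9}, {1}, {6}]

total_score = sum(1 for predicted, actual in zip(predictions, ground_truth)
                  if len(predicted & actual) != 0)   # a hit if the sets intersect at all
print("The total accuracy is:", total_score / len(ground_truth))  # -> 0.5 for this data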