Method Cross validation

5dd6300d · Tags · f9f1cbfb · 5dd6300d
Commit 5dd6300d authored Aug 22, 2017 by Tags
Hide whitespace changes
Inline Side-by-side

Showing with 42 additions and 36 deletions

news_classify_tag.py news_classify_tag.py +42 -36

No files found.
--- a/news_classify_tag.py
+++ b/news_classify_tag.py
@@ -105,9 +105,9 @@ class Classifier:
                self.entity_tags[entity] = "" + item_tag
            else:
                self.entity_tags[entity] += " " + item_tag
-        # print(self.entity_tags)
-        # print(len(self.entity_tags))
-        # exit(0)
+                # print(self.entity_tags)
+                # print(len(self.entity_tags))
+                # exit(0)

    def text_clear(self, texts):
        import re
@@ -132,7 +132,7 @@ class Classifier:
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.neighbors import KNeighborsClassifier

-        self.vectorizer = CountVectorizer(min_df=2, stop_words='english', ngram_range=(1, 2))
+        self.vectorizer = CountVectorizer(min_df=2, stop_words='english', ngram_range=(1, 1))
        self.tfidf = TfidfTransformer()
        # self.classifier = MultinomialNB()
        self.classifier = KNeighborsClassifier()
@@ -310,6 +310,41 @@ def log_data(i=None, text_to_analise=None, tags=None):
    out_file.close()


+def cross_validation():
+    # Classify each row of the news/tag join (shuffled by ORDER BY rand(), no LIMIT) and print the fraction whose predicted tags overlap the user-assigned tags.
+    sql1 = '''SELECT rez.news_id , rez.title, rez.description
+                FROM(
+                        SELECT wp_esi_tag_news.news_id, wp_esi_news.title, wp_esi_news.description
+                           FROM wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id
+                              ORDER BY wp_esi_tag_news.tag_id ) AS rez
+    		                  ORDER BY rand()'''
+    # tags_classif is not defined inside this function; presumably it is the classifier object created in the __main__ block -- confirm it exists before calling.
+    tags_classif.db.query(sql1)
+    news_results = tags_classif.db.store_result().fetch_row(maxrows=0)
+    total_score = 0
+    for i, (id_news, title, description) in enumerate(news_results):
+        text_for_analis = title.decode('ascii', 'ignore') + '\n' + description.decode('ascii', 'ignore')
+        tags_classif.classify(text_for_analis)
+        rez_accordance = {item[0] for item in tags_classif.tag_accordance}

+        sql = " SELECT tag_id FROM wp_esi_tag_news WHERE news_id =" + str(id_news)
+        results = tags_classif.db.query(sql)
+        tags_in_article = {tag for (tag, ) in tags_classif.db.store_result().fetch_row(maxrows=0)}
+        print(rez_accordance, tags_in_article,len(rez_accordance & tags_in_article))
+        if len(rez_accordance & tags_in_article) != 0:

+            total_score += 1
+        else:
+            print("\n#", str(i))
+            print("\nNews title: " + title.decode('ascii', 'ignore'))
+            print("Model calculated Accordance  :", end=" ")
+            print(set(rez_accordance))
+            print("User classified tags for news:", set(tags_in_article))
+    print("\n\nThe Model was tasted on ", len(news_results), " news. ")
+    print(" The total result is:", total_score / len(news_results))
+    # NOTE(review): the ratio above divides by len(news_results), so it raises ZeroDivisionError when the first query returns no rows.
+    # exit(0)
+
 if __name__ == "__main__":
    try:
        # if localhost database is not available then use server
@@ -324,41 +359,12 @@ if __name__ == "__main__":
    tags_classif.save()
    # exit(0)

-    ## _______ test from indentifyed news ______________
-    sql1 = '''SELECT rez.news_id , rez.title, rez.description
-            FROM(
-                    SELECT wp_esi_tag_news.news_id, wp_esi_news.title, wp_esi_news.description
-                       FROM wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id
-                          ORDER BY wp_esi_tag_news.tag_id ) AS rez
-		                  ORDER BY rand() LIMIT 30'''
-
-    tags_classif.db.query(sql1)
-    news_results = tags_classif.db.store_result()
-    news_results = news_results.fetch_row(maxrows=0)
-    data = list()
-    for i, (id_news, title, description) in enumerate(news_results):
-        text_for_analis = title.decode('ascii', 'ignore') + '\n' + description.decode('ascii', 'ignore')
-        # print("\n#", str(i))
-        print("\nNews title: " + title.decode('ascii', 'ignore'))
-        tags_classif.classify(text_for_analis)
-        print("Model calculated Accordance(#tag, %-accordance, tag_description): ", end=" ")
-        print(tags_classif.tag_accordance)
-        sql = " SELECT tag_id FROM wp_esi_tag_news WHERE news_id =" + str(id_news)
-        # print(sql)
-        results = tags_classif.db.query(sql)
-        tags = tags_classif.db.store_result()
-        tags = tags.fetch_row(maxrows=0)
-        # print(tags)
-        # print(tags_classif.tags)
-        print("User classified tags for present news:")
-        for (tag,) in tags:
-            tag = int(tag)
-            # print(tag)
-            print(str(tag) + " " + tags_classif.tags[tag - 1][1].decode('ascii', 'ignore'))
+    cross_validation()
    exit(0)
    # print("\n\n")

-    tags_classif.db.query("SELECT title, description, text, company_id FROM wp_esi_news_accept ORDER BY RAND() LIMIT 50")
+    tags_classif.db.query(
+        "SELECT title, description, text, company_id FROM wp_esi_news_accept ORDER BY RAND() LIMIT 50")
    result = tags_classif.db.store_result()

    # news analysis witt title + description + text + company_id