Add

1) Read entities from the DB. 2) Read tag-entity links into the entity_tags dict. 3) Changed the classifier to naive Bayes. 4) Added an integer 'entity_id' parameter to the classify method to increase tag likelihood.

Add
1) Read entities from the DB. 2) Read tag-entity links into the entity_tags dict. 3) Changed the classifier to naive Bayes. 4) Added an integer 'entity_id' parameter to the classify method to increase tag likelihood.
1ca27792 · Tags · a1b6dbe9 · 1ca27792
Commit 1ca27792 authored Aug 21, 2017 by Tags
Hide whitespace changes
Inline Side-by-side

Showing with 85 additions and 42 deletions

news_classify_tag.py news_classify_tag.py +85 -42

No files found.
--- a/news_classify_tag.py
+++ b/news_classify_tag.py
@@ -73,14 +73,42 @@ class Classifier:
            if description is not None:
                texts += description.decode('ascii', 'ignore')
            if text is not None:
-                # if type(text) is bytes:
                texts += text.decode('ascii', 'ignore')
-                # else:
-                #     texts
            if texts != '':
                self.X_text_data.append(self.text_clear(texts))
                self.y_data.append(int(tag_id))

+        # _____________ entity develop ______________
+        sql_entity = "SELECT id, name FROM wp_esi_entity"
+        self.db.query(sql_entity)
+        result_entity = self.db.store_result()
+        self.entity = list(
+            (id_entity, name.decode('ascii', "ignore")) for id_entity, name in result_entity.fetch_row(maxrows=0))
+        del result_entity
+        # print("total entitys # is ", len(self.entity))
+        sql_entity_news = """SELECT wp_esi_news_entity.entity_id, concat(wp_esi_news.title, wp_esi_news.description)
+		    FROM wp_esi_news_entity, wp_esi_news WHERE wp_esi_news.id = wp_esi_news_entity.news_id ORDER BY wp_esi_news_entity.entity_id """
+        self.db.query(sql_entity_news)
+        result_entity_news = self.db.store_result()
+        self.entity_news = list(
+            (entity_id, news_text) for entity_id, news_text in result_entity_news.fetch_row(maxrows=0))
+        # print("total # of news linked with entity is", len(self.entity_news))
+        sql_entity_news = """ SELECT company_id, concat(title, text) FROM wp_esi_news_accept ORDER BY company_id """
+
+        # ________Entitys -- tags _________________
+        sql_entity = "SELECT entity_id, tag_id FROM wp_esi_tag_entity"
+        self.db.query(sql_entity)
+        rez = self.db.store_result()
+        self.entity_tags = dict()
+        for entity, item_tag in rez.fetch_row(maxrows=0):
+            if entity not in self.entity_tags.keys():
+                self.entity_tags[entity] = "" + item_tag
+            else:
+                self.entity_tags[entity] += " " + item_tag
+        # print(self.entity_tags)
+        # print(len(self.entity_tags))
+        # exit(0)
+
    def text_clear(self, texts):
        import re
        from stemming.porter2 import stem
@@ -103,7 +131,8 @@ class Classifier:
        from sklearn.pipeline import Pipeline
        from sklearn.naive_bayes import MultinomialNB

-        self.vectorizer = CountVectorizer(min_df=2, max_df=15, stop_words='english')
+        self.vectorizer = CountVectorizer(min_df=2, stop_words='english', ngram_range=(1, 1))
+        self.tfidf = TfidfTransformer()
        self.classifier = MultinomialNB()
        # self.classifier = Pipeline([
        #     ('vect', TfidfVectorizer(stop_words='english')),
@@ -115,6 +144,7 @@ class Classifier:

        # _________ ML learning posses ___________
        X_data = self.vectorizer.fit_transform(self.X_text_data)
+        # X_data = self.tfidf.fit_transform(X_data)
        self.classifier.fit(X_data, self.y_data)
        self.links_tags = set(self.y_data)

@@ -152,7 +182,7 @@ class Classifier:

        return vector_accordance

-    def classify(self, text, test_corpus=None):
+    def classify(self, text, test_corpus=None, entity_id=0):
        """

        :type text: is text for analysis
@@ -174,7 +204,17 @@ class Classifier:
        rez = self.classifier.predict_proba(matrix_test_data)
        # exit(0)
        self.likelihood_list = rez[0]
-        # print (self.likelihood_list)
+        print("Entitys tags:", end="")
+        if entity_id != 0:
+            if entity_id in self.entity_tags.keys():
+                rez = str(self.entity_tags[entity_id])
+                tags_list_from_entity = list(int(item) for item in rez.split(" "))
+                print(tags_list_from_entity)
+
+                for item_num, tag_id, in enumerate(self.links_tags):
+                    if tag_id in tags_list_from_entity and self.likelihood_list[item_num] < 0.1:
+                        self.likelihood_list[item_num] += 0.1
+                        # print (self.likelihood_list)

    def graph_results(self):
        # import numpy as np
@@ -275,43 +315,45 @@ if __name__ == "__main__":
    # tags_classif.save()
    # exit(0)

-    sql1 = '''SELECT rez.news_id , rez.title, rez.description
-            FROM(
-                    SELECT wp_esi_tag_news.news_id, wp_esi_news.title, wp_esi_news.description
-                       FROM wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id
-                          ORDER BY wp_esi_tag_news.tag_id ) AS rez
-		                  ORDER BY rand() LIMIT 30'''
-
-    tags_classif.db.query(sql1)
-    news_results = tags_classif.db.store_result()
-    news_results = news_results.fetch_row(maxrows=0)
-    data = list()
-    for i, (id_news, title, description) in enumerate(news_results):
-        text_for_analis = title.decode('ascii', 'ignore') + '\n' + description.decode('ascii', 'ignore')
-        # print("\n#", str(i))
-        print("\nNews title: " + title.decode('ascii', 'ignore'))
-        tags_classif.classify(text_for_analis)
-        print("Model calculated Accordance(#tag, %-accordance, tag_description): ", end=" ")
-        print(tags_classif.tag_accordance)
-        sql = " SELECT tag_id FROM wp_esi_tag_news WHERE news_id =" + str(id_news)
-        # print(sql)
-        results = tags_classif.db.query(sql)
-        tags = tags_classif.db.store_result()
-        tags = tags.fetch_row(maxrows=0)
-        # print(tags)
-        # print(tags_classif.tags)
-        print("User classified tags for present news:")
-        for (tag,) in tags:
-            tag = int(tag)
-            # print(tag)
-            print(str(tag) + " " + tags_classif.tags[tag - 1][1].decode('ascii', 'ignore'))
-    # exit(0)
-    print("\n\n")
-    tags_classif.db.query("SELECT title, description, text FROM wp_esi_news_accept ORDER BY RAND() LIMIT 50")
+    # ## _______ test from indentifyed news ______________
+    # sql1 = '''SELECT rez.news_id , rez.title, rez.description
+    #         FROM(
+    #                 SELECT wp_esi_tag_news.news_id, wp_esi_news.title, wp_esi_news.description
+    #                    FROM wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id
+    #                       ORDER BY wp_esi_tag_news.tag_id ) AS rez
+		#                   ORDER BY rand() LIMIT 30'''
+    #
+    # tags_classif.db.query(sql1)
+    # news_results = tags_classif.db.store_result()
+    # news_results = news_results.fetch_row(maxrows=0)
+    # data = list()
+    # for i, (id_news, title, description) in enumerate(news_results):
+    #     text_for_analis = title.decode('ascii', 'ignore') + '\n' + description.decode('ascii', 'ignore')
+    #     # print("\n#", str(i))
+    #     print("\nNews title: " + title.decode('ascii', 'ignore'))
+    #     tags_classif.classify(text_for_analis)
+    #     print("Model calculated Accordance(#tag, %-accordance, tag_description): ", end=" ")
+    #     print(tags_classif.tag_accordance)
+    #     sql = " SELECT tag_id FROM wp_esi_tag_news WHERE news_id =" + str(id_news)
+    #     # print(sql)
+    #     results = tags_classif.db.query(sql)
+    #     tags = tags_classif.db.store_result()
+    #     tags = tags.fetch_row(maxrows=0)
+    #     # print(tags)
+    #     # print(tags_classif.tags)
+    #     print("User classified tags for present news:")
+    #     for (tag,) in tags:
+    #         tag = int(tag)
+    #         # print(tag)
+    #         print(str(tag) + " " + tags_classif.tags[tag - 1][1].decode('ascii', 'ignore'))
+    # # exit(0)
+    # print("\n\n")
+
+    tags_classif.db.query("SELECT title, description, text, company_id FROM wp_esi_news_accept ORDER BY RAND() LIMIT 50")
    result = tags_classif.db.store_result()

-    # news analysis witt title + description + text
-    for i, (title, description, text) in enumerate(result.fetch_row(maxrows=0)):
+    # news analysis witt title + description + text + company_id
+    for i, (title, description, text, entity_id) in enumerate(result.fetch_row(maxrows=0)):
        text_for_analys = ''
        if title is not None:
            text_for_analys += title.decode("ascii", 'ignore') + '\n'
@@ -322,7 +364,8 @@ if __name__ == "__main__":
        print("\n#", str(i))
        print('Title: ', title.decode("ascii", 'ignore'))
        print("Descr.:", text.decode("ascii", 'ignore')[:80])
-        tags_results = tags_classif.classify(text_for_analys)
+        print('entity :', entity_id)
+        tags_results = tags_classif.classify(text_for_analys, entity_id=entity_id)

        # accordance %
        print("Accordance(#tag, %-accordance, tag_description): ")