Make changes according to the requirements.

1b3a12f2 · Tags · d531aef4 · 1b3a12f2
Commit 1b3a12f2 authored Jul 27, 2017 by Tags
Hide whitespace changes
Inline Side-by-side

Showing with 77 additions and 42 deletions

news_classify_tag.py news_classify_tag.py +77 -42

No files found.
--- a/news_classify_tag.py
+++ b/news_classify_tag.py
-import sys
-from _mysql import OperationalError
-from pprint import pprint
+# import sys
+# from _mysql import OperationalError
+# from pprint import pprint
 import numpy as np
+import _mysql


 # noinspection PyUnresolvedReferences
@@ -11,7 +12,6 @@ class Classifier:
        try:
            import nltk
            import operator
-            from MySQLdb import connect
            from stemming.porter2 import stem
        except ImportError:
            print ('You have import flowing packages: sklearn & nltk & re.')
@@ -24,27 +24,29 @@ class Classifier:
            return

        # Extract data form DataBase
-        conn = connect (host=host, port=port, user=user, password=password, db=db)
-        self.cursor = conn.cursor ()
+        self.db = _mysql.connect (host=host, port=port, user=user, passwd=password, db=db)

-        #geting tags
-        result = self.cursor.execute ("select id, name from wp_esi_tag")
+        # geting tags
+        self.db.query("SELECT id, name FROM wp_esi_tag")
+        rez = self.db.store_result()
+        # result =
        tags = list ()
-        for id, description in self.cursor.fetchall ():
-            tags.append((id, description))
+        for id, description in rez.fetch_row(maxrows=0):
+            tags.append ((id, description))
        self.tags = tags
-        # print (tags)
+        # print (len(tags), tags)
        del tags

        train_data = []
        # text_id = []
-        sql1 = '''select wp_esi_tag_news.tag_id, wp_esi_news.title, wp_esi_news.description
-                       from wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id
-                          order by wp_esi_tag_news.tag_id LIMIT 1000 '''
-        result = self.cursor.execute (sql1)
-        data = list()
-        for tag_id, title, description in self.cursor.fetchall():
-            data.append((tag_id, title, description))
+        sql1 = '''SELECT wp_esi_tag_news.tag_id, wp_esi_news.title, wp_esi_news.description
+                       FROM wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id
+                          ORDER BY wp_esi_tag_news.tag_id LIMIT 1000 '''
+        self.db.query (sql1)
+        result = self.db.store_result()
+        data = list ()
+        for tag_id, title, description in result.fetch_row(maxrows=0):
+            data.append ((tag_id, title, description))

        for tag_id, _ in self.tags:
            texts = ''
@@ -95,13 +97,13 @@ class Classifier:

        # normalise rowsfrequency matrix
        for j in range (columns):
-            suma=0
+            suma = 0
            suma = frequency_matrix[j].sum ()
            if suma == 0:
                continue
            for i in range (rows):
                frequency_matrix[j, i] /= suma
-        self.tag_frequency_matrix= frequency_matrix
+        self.tag_frequency_matrix = frequency_matrix
        return frequency_matrix

    def teach_model(self, data_text=''):
@@ -122,12 +124,12 @@ class Classifier:
        :return: tuple of % af tag accordance
        """
        vector_accordance = []
-        for row in range(self.tag_frequency_matrix.shape[0]):
+        for row in range (self.tag_frequency_matrix.shape[0]):
            temp_matrix = self.tag_frequency_matrix[row]
-            rez_summ = temp_matrix[self.matrix_test_data.toarray()[0] > 0].sum()
+            rez_summ = temp_matrix[self.matrix_test_data.toarray ()[0] > 0].sum ()
            if rez_summ > (persantage / 100):
-                vector_accordance.append((row, int(rez_summ*1000)/10, self.tags[row][1]))
-            vector_accordance.sort(key=lambda tup: tup[1], reverse=True)
+                vector_accordance.append ((row, int (rez_summ * 1000) / 10, self.tags[row][1]))
+            vector_accordance.sort (key=lambda tup: tup[1], reverse=True)

        return vector_accordance

@@ -168,15 +170,16 @@ class Classifier:

    def save(self):
        import os
-        file_tag_frequency= 'data_tag_frequency.csv'
-        if os.path.isfile(file_tag_frequency):
-            os.remove(file_tag_frequency)
-        file_data= open(file_tag_frequency,"w")
+        file_tag_frequency = 'data_tag_frequency.csv'
+        if os.path.isfile (file_tag_frequency):
+            os.remove (file_tag_frequency)
+        file_data = open (file_tag_frequency, "w")
        if file_data == None:
-            print("Can't create data storage file")
+            print ("Can't create data storage file")
            return False
-        np.savetxt(file_tag_frequency, self.tag_frequency_matrix, delimiter=';', fmt='%1.4f')
-        print("Data is saved into file: "+ file_tag_frequency+" " + str(int(os.stat(file_tag_frequency).st_size/1024)) + 'kB')
+        np.savetxt (file_tag_frequency, self.tag_frequency_matrix, delimiter=';', fmt='%1.4f')
+        print ("Data is saved into file: " + file_tag_frequency + " " + str (
+            int (os.stat (file_tag_frequency).st_size / 1024)) + 'kB')
        return True


@@ -206,28 +209,60 @@ if __name__ == "__main__":
    try:
        # if localhost database is not available then use server
        tags_classif = Classifier ()
-        print('\nI use SERVER DataBase.\n')
-    except OperationalError:
+        print ('\nI use SERVER DataBase.\n')
+    except _mysql.OperationalError:
        # use server DataBase
-        print('\nI use local DataBase.\n')
+        print ('\nI use local DataBase.\n')
        tags_classif = Classifier (host='localhost', port=8080, user='root', password='password', db='news')
    # the method is not implicated
    tags_classif.teach_model ()
-    tags_classif.save ()
+    # tags_classif.save ()
    # exit(0)

-    result = tags_classif.cursor.execute ("select title, description from wp_esi_news ORDER BY RAND() limit 25")
-
-    for i, (title, description) in enumerate (tags_classif.cursor.fetchall ()):
+    sql1 = '''SELECT rez.news_id , rez.title, rez.description
+            FROM (
+                    SELECT wp_esi_tag_news.news_id, wp_esi_news.title, wp_esi_news.description
+                       FROM wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id
+                          ORDER BY wp_esi_tag_news.tag_id ) as rez 
+			ORDER BY rand() LIMIT 15'''
+    tags_classif.db.query(sql1)
+    news_results = tags_classif.db.store_result()
+    news_results = news_results.fetch_row(maxrows=0)
+    data = list ()
+    for i, (id_news, title, description) in enumerate (news_results):
+        text_for_analis = title.decode('ascii', 'ignore') + '\n' + description.decode('ascii', 'ignore')
+        # print ("\n#", str (i))
+        print ("\nNews title: " + title.decode('ascii', 'ignore'))
+        tags_classif.classify (text_for_analis)
+        print ("Model calculated Accordance (#tag, %-accordance, tag_description): ", end=" ")
+        print (tags_classif.teg_accordance)
+        sql = " select tag_id from wp_esi_tag_news where news_id =" + str(id_news)
+        # print(sql)
+        results = tags_classif.db.query(sql)
+        tags = tags_classif.db.store_result()
+        tags = tags.fetch_row(maxrows=0)
+        # print(tags)
+        # print(tags_classif.tags)
+        print ("User classified tags for present news:")
+        for (tag, ) in tags:
+            tag = int(tag)
+            # print(tag)
+            print (str(tag - 1) + " " + tags_classif.tags[tag - 1][1].decode('ascii', 'ignore'))
+
+    exit (0)
+    tags_classif.db.query("SELECT title, description FROM wp_esi_news ORDER BY RAND() LIMIT 25")
+    result = tags_classif.db.store_result()
+
+    for i, (title, description) in enumerate (result.fetch_row (maxrows=0)):
        text_for_analis = title + '\n' + description
-        print ("\n\n#", str (i))
-        pprint (text_for_analis)
+        print ("\n#", str (i))
+        print (text_for_analis)
        tags_results = tags_classif.classify (text_for_analis)
        # pprint (tags_results[:5])

        # accordance %
-        print("Accordance (#tag, %-accordance, tag_description): ")
-        print(tags_classif.teg_accordance)
+        print ("Accordance (#tag, %-accordance, tag_description): ")
+        print (tags_classif.teg_accordance)
        # log_data (i=i, text_to_analise=text_for_analis, tags=tags_results)

        # if i > 10: break