Merge branch 'master' of https://gitlab.com/taraslut78/esi_news_classification

d71c4654 · Andrii Marynets · 9b6fec7b · f505b55e · d71c4654
Commit d71c4654 authored Nov 04, 2017 by Andrii Marynets
Hide whitespace changes
Inline Side-by-side

Showing with 52 additions and 142 deletions

news_classify_tag.py news_classify_tag.py +52 -142

No files found.
--- a/news_classify_tag.py
+++ b/news_classify_tag.py
@@ -8,7 +8,7 @@ import _mysql

 # noinspection PyUnresolvedReferences
 class Classifier:
-    def __init__(self, corpus=None, host='176.58.117.151', user='esi', password='esi12345', db='esi', port=3306):
+    def __init__(self, corpus=None, host="", user="", password="", db="", port=0):
        try:
            # import nltk
            import operator
@@ -16,12 +16,17 @@ class Classifier:
        except ImportError:
            print('You have import flowing packages: sklearn & nltk & re.')

-        stopwords = set(nltk.corpus.stopwords.words('english'))
-        stopwords.update(['from:', 'subject:', 'writes:', 'writes', 'click', 'here', 'page', 'origin'])

-        if corpus is not None:
-            self.train_data = corpus
-            return
+        # Stopwords used by the text_clear method
+        self.stopwords = set(nltk.corpus.stopwords.words('english'))
+        self.stopwords.update(['from:', 'subject:', 'writes:', 'writes', 'click', 'here', 'page', 'origin'])
+
+        # if corpus is not None:
+        #     self.train_data = corpus
+        #     return
+
+        # Connect to the database the data is extracted from
+        self.db = _mysql.connect(host=host, port=port, user=user, passwd=password, db=db)

        # load external dictionary
        try:
@@ -36,8 +41,11 @@ class Classifier:
        for word in file_dict:
            self.correct_words.add(word[:-1])

-        # Extract data form DataBase
-        self.db = _mysql.connect(host=host, port=port, user=user, passwd=password, db=db)
+        # Stemmer used by text_clear
+        # from stemming.porter2 import stem
+        from nltk.stem import PorterStemmer
+        self.stem = PorterStemmer().stem
+        # self.stem.

        # getting tags
        self.db.query("SELECT id, name FROM wp_esi_tag")
@@ -46,31 +54,37 @@ class Classifier:
        for id, description in rez.fetch_row(maxrows=0):
            tags.append((int(id), description))
        self.tags_dict = dict(tags)
-
        self.tags = tags
-        # print('tags array = ', len(self.tags), self.tags)
-        # print(self.tags_dict)
-        # print(self.tags_dict[355])
-        # exit(0)
        del tags

-        # train_data = []
-        # text_id = []
-        # sql1 = '''SELECT wp_esi_tag_news.tag_id, wp_esi_news.title, wp_esi_news.description
-        #                FROM wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id
-        #                   ORDER BY wp_esi_tag_news.tag_id LIMIT 1000 '''
-        # self.db.query(sql1)
-        # result = self.db.store_result()
-        # data = list()
-        # for tag_id, title, description in result.fetch_row(maxrows=0):
-        #     data.append((tag_id, title, description))
+    def text_clear(self, texts):
+        import re
+        from nltk.tokenize import word_tokenize
+        texts = re.sub(r'\d', "", texts)
+        texts = texts.translate(str.maketrans("?!,.+-:;\/", 10*" "))
+        texts = word_tokenize(texts) #texts.split(" ")
+        texts = [word.lower() for word in texts if word not in self.stopwords]
+        # Addition incorrect words
+        # texts = [word for word in texts if word not in stopwords and word in self.correct_words]
+        try:
+            texts = [self.stem(word) for word in texts if len(word) > 0]
+        except:
+            pass
+        texts = " ".join(texts)
+        return texts
+
+    def teach_model(self, data_text=''):
+        from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
+        from sklearn.pipeline import Pipeline
+        from sklearn.naive_bayes import MultinomialNB
+        from sklearn.neighbors import KNeighborsClassifier
+        # from sklearn.neural_network import MLPClassifier

        sql1 = '''SELECT wp_esi_ml_data.tag_id, wp_esi_news_accept.title, wp_esi_news_accept.description, wp_esi_news_accept.text
-                               FROM wp_esi_news_accept, wp_esi_ml_data WHERE wp_esi_ml_data.news_id=wp_esi_news_accept.id
-                                  ORDER BY wp_esi_ml_data.tag_id'''
+                                       FROM wp_esi_news_accept, wp_esi_ml_data WHERE wp_esi_ml_data.news_id=wp_esi_news_accept.id
+                                          ORDER BY wp_esi_ml_data.tag_id'''
        self.db.query(sql1)
        result = self.db.store_result()
-
        self.X_text_data = list()
        self.y_data = list()
        for tag_id, title, description, text in result.fetch_row(maxrows=0):
@@ -84,6 +98,8 @@ class Classifier:
            if texts != '':
                self.X_text_data.append(self.text_clear(texts))
                self.y_data.append(int(tag_id))
+        # print(self.X_text_data)
+        # print(self.y_data)

        # _____________ entity develop ______________
        sql_entity = "SELECT id, name FROM wp_esi_entity"
@@ -94,13 +110,12 @@ class Classifier:
        del result_entity
        # print("total entitys # is ", len(self.entity))
        sql_entity_news = """SELECT wp_esi_news_entity.entity_id, concat(wp_esi_news.title, wp_esi_news.description)
-		    FROM wp_esi_news_entity, wp_esi_news WHERE wp_esi_news.id = wp_esi_news_entity.news_id ORDER BY wp_esi_news_entity.entity_id """
+        		    FROM wp_esi_news_entity, wp_esi_news WHERE wp_esi_news.id = wp_esi_news_entity.news_id ORDER BY wp_esi_news_entity.entity_id """
        self.db.query(sql_entity_news)
        result_entity_news = self.db.store_result()
        self.entity_news = list(
            (entity_id, news_text) for entity_id, news_text in result_entity_news.fetch_row(maxrows=0))
        # print("total # of news linked with entity is", len(self.entity_news))
-        sql_entity_news = """ SELECT company_id, concat(title, text) FROM wp_esi_news_accept ORDER BY company_id """

        # ________Entitys -- tags _________________
        sql_entity = "SELECT entity_id, tag_id FROM wp_esi_tag_entity"
@@ -116,33 +131,14 @@ class Classifier:
                # print(len(self.entity_tags))
                # exit(0)

-    def text_clear(self, texts):
-        import re
-        from stemming.porter2 import stem
-
-        stopwords = set(nltk.corpus.stopwords.words('english'))
-        stopwords.update(['from:', 'subject:', 'writes:', 'writes', 'click', 'here', 'page', 'origin'])

-        texts = re.sub(r'\d', " ", texts)
-        texts = texts.split(" ")
-        texts = [word.lower() for word in texts]
-        texts = [word for word in texts if word not in stopwords]
-        # Addition incorrect words
-        # texts = [word for word in texts if word not in stopwords and word in self.correct_words]
-        texts = [stem(word) for word in texts]
-        texts = " ".join(texts)
-        return texts
-
-    def teach_model(self, data_text=''):
-        from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
-        from sklearn.pipeline import Pipeline
-        from sklearn.naive_bayes import MultinomialNB
-        from sklearn.neighbors import KNeighborsClassifier
-        from sklearn.neural_network import MLPClassifier
-
-        self.vectorizer = CountVectorizer(min_df=2, stop_words='english', ngram_range=(1, 1))
+        self.vectorizer = CountVectorizer(min_df=1, stop_words='english', ngram_range=(1, 1))
        self.tfidf = TfidfTransformer()
-        self.classifier = KNeighborsClassifier()
+        # k-nearest neighbors (k-NN) model
+        # self.classifier = KNeighborsClassifier()
+        # The Naive Bayes
+        self.classifier = MultinomialNB(alpha=2)
+        # Multilayer Perceptron
        # self.classifier = MLPClassifier()
        # self.classifier = Pipeline([
        #     ('vect', TfidfVectorizer(stop_words='english')),
@@ -173,7 +169,7 @@ class Classifier:
        # exit(0)

    @property
-    def tag_accordance(self, persantage=50):
+    def tag_accordance(self, persantage=20):
        """
        Class method for computing % af tag accordance 
        :return: tuple of % af tag accordance
@@ -218,6 +214,7 @@ class Classifier:
        if type(text) is bytes:
            text.decode('ascii', 'ignore')
        matrix_test_data = self.vectorizer.transform([self.text_clear(text_test)])
+        matrix_test_data = self.tfidf.fit_transform(matrix_test_data)
        rez = self.classifier.predict_proba(matrix_test_data)
        # exit(0)
        self.likelihood_list = rez[0]
@@ -231,7 +228,7 @@ class Classifier:
                for item_num, tag_id, in enumerate(self.links_tags):
                    if tag_id in tags_list_from_entity and self.likelihood_list[item_num] < 0.1:
                        self.likelihood_list[item_num] += 0.1
-                        # print (self.likelihood_list)
+        # print (self.likelihood_list)

    def graph_results(self):
        # import numpy as np
@@ -321,90 +318,3 @@ def log_data(i=None, text_to_analise=None, tags=None):
    out_file.write('\n')
    out_file.close()

-
-def cross_validation():
-    ## _______ test from indentifyed news ______________
-    sql1 = '''SELECT rez.news_id , rez.title, rez.description
-                FROM(
-                        SELECT wp_esi_tag_news.news_id, wp_esi_news.title, wp_esi_news.description
-                           FROM wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id
-                              ORDER BY wp_esi_tag_news.tag_id ) AS rez
-    		                  ORDER BY rand()'''
-
-    tags_classif.db.query(sql1)
-    news_results = tags_classif.db.store_result().fetch_row(maxrows=0)
-    total_score = 0
-    for i, (id_news, title, description) in enumerate(news_results):
-        text_for_analis = title.decode('ascii', 'ignore') + '\n' + description.decode('ascii', 'ignore')
-        tags_classif.classify(text_for_analis)
-        rez_accordance = {item[0] for item in tags_classif.tag_accordance}
-
-        sql = " SELECT tag_id FROM wp_esi_tag_news WHERE news_id =" + str(id_news)
-        results = tags_classif.db.query(sql)
-        tags_in_article = {int(tag) for (tag, ) in tags_classif.db.store_result().fetch_row(maxrows=0)}
-        if len(rez_accordance & tags_in_article) != 0:
-            total_score += 1
-        else:
-            print("\n\n#", str(i))
-            print("News title: " + title.decode('ascii', 'ignore'))
-            print("Model calculated Accordance  :", end=" ")
-            for item_tag in tags_classif.tags:
-                if int(item_tag[0]) in rez_accordance:
-                    print(item_tag[0], item_tag[1].decode('ascii', 'ignore'), end='; ')
-            print("\nUser classified tags for news: ", end="")
-            # print(set(rez_accordance))
-            for item_tag in tags_classif.tags:
-                if int(item_tag[0]) in tags_in_article:
-                    print(item_tag[0], item_tag[1].decode('ascii', 'ignore'), end='; ')
-
-    print("\n\nThe Model was tasted on ", len(news_results), " news. ")
-    print("The total accuracy is:", total_score / len(news_results))
-
-    # exit(0)
-
-if __name__ == "__main__":
-    try:
-        # if localhost database is not available then use server
-        tags_classif = Classifier()
-        print('\nI use SERVER DataBase.\n')
-    except _mysql.OperationalError:
-        # use server DataBase
-        print('\nI use local DataBase.\n')
-        tags_classif = Classifier(host='localhost', port=8080, user='root', password='password', db='news')
-    # the method is not implicated
-    tags_classif.teach_model()
-    # tags_classif.save()
-    # exit(0)
-
-    cross_validation()
-    exit(0)
-    # print("\n\n")
-
-    tags_classif.db.query(
-        "SELECT title, description, text, company_id FROM wp_esi_news_accept ORDER BY RAND() LIMIT 50")
-    result = tags_classif.db.store_result()
-
-    # news analysis witt title + description + text + company_id
-    for i, (title, description, text, entity_id) in enumerate(result.fetch_row(maxrows=0)):
-        text_for_analys = ''
-        if title is not None:
-            text_for_analys += title.decode("ascii", 'ignore') + '\n'
-        if description is not None:
-            text_for_analys += description.decode('ascii', 'ignore') + " "
-        if text is not None:
-            text_for_analys += text.decode("ascii", 'ignore')
-        print("\n#", str(i))
-        print('Title: ', title.decode("ascii", 'ignore'))
-        print("Descr.:", text.decode("ascii", 'ignore')[:80])
-        print('entity :', entity_id)
-        tags_results = tags_classif.classify(text_for_analys, entity_id=entity_id)
-
-        # accordance %
-        print("Accordance(#tag, %-accordance, tag_description): ")
-        print(tags_classif.tag_accordance)
-        # log_data(i=i, text_to_analise=text_for_analis, tags=tags_results)
-
-        # if i > 10: break
-
-        # Graph presentation results
-        # tags_classif.graph_results()