Fix Init issues clear: DB info.

Clear main, cross-validation Methods.

Fix Init issues clear: DB info.
Clear main, cross-validation Methods.
ca1fa8a0 · Tags · 9e95fb2f · ca1fa8a0 · ca1fa8a0
Commit ca1fa8a0 authored Sep 18, 2017 by Tags
Hide whitespace changes
Inline Side-by-side

Showing with 18 additions and 99 deletions

teach_1.py Spacy/teach_1.py +8 -0

news_classify_tag.py news_classify_tag.py +10 -99

No files found.
--- a/Spacy/teach_1.py
+++ b/Spacy/teach_1.py
+import spacy
+# Load the spaCy model once; the sample text below is parsed with it.
+nlp = spacy.load('en_core_web_md')
+# Parse a short multi-sentence sample so sentence splitting can be inspected.
+doc = nlp(u'This is the most interesting story about USA and Nokia. All is Ok. Samsung is bad company.')
+# A Doc exposes its sentences via `sents`; it has no `sentence` attribute.
+for item in doc.sents:
+    print(item)
--- a/news_classify_tag.py
+++ b/news_classify_tag.py
@@ -8,7 +8,7 @@ import _mysql

 # noinspection PyUnresolvedReferences
 class Classifier:
-    def __init__(self, corpus=None, host='176.58.117.151', user='esi', password='esi12345', db='esi', port=3306):
+    def __init__(self, corpus=None, host="", user="", password="", db="", port=0):
        try:
            # import nltk
            import operator
@@ -23,6 +23,9 @@ class Classifier:
        #     self.train_data = corpus
        #     return

+        # Extract data from the database
+        self.db = _mysql.connect(host=host, port=port, user=user, passwd=password, db=db)
+
        # load external dictionary
        try:
            directory = os.path.dirname(os.path.abspath(__file__))
@@ -36,9 +39,6 @@ class Classifier:
        for word in file_dict:
            self.correct_words.add(word[:-1])

-        # Extract data form DataBase
-        self.db = _mysql.connect(host=host, port=port, user=user, passwd=password, db=db)
-
        # getting tags
        self.db.query("SELECT id, name FROM wp_esi_tag")
        rez = self.db.store_result()
@@ -46,7 +46,6 @@ class Classifier:
        for id, description in rez.fetch_row(maxrows=0):
            tags.append((int(id), description))
        self.tags_dict = dict(tags)
-
        self.tags = tags
        del tags

@@ -61,12 +60,10 @@ class Classifier:
        texts = re.sub(r'\s', " ", texts)
        texts = texts.split(" ")
        texts = [word.lower() for word in texts if word not in stopwords]
-        # texts = [word for word in texts ]
        # Addition incorrect words
        # texts = [word for word in texts if word not in stopwords and word in self.correct_words]
        texts = [stem(word) for word in texts]
        texts = " ".join(texts)
-        # print(texts)
        return texts

    def teach_model(self, data_text=''):
@@ -130,7 +127,11 @@ class Classifier:

        self.vectorizer = CountVectorizer(min_df=2, stop_words='english', ngram_range=(1, 1))
        self.tfidf = TfidfTransformer()
-        self.classifier = KNeighborsClassifier()
+        # k-nearest neighbors model
+        # self.classifier = KNeighborsClassifier()
+        # The Naive Bayes
+        self.classifier = MultinomialNB(alpha=2)
+        # Multilayer Perceptron
        # self.classifier = MLPClassifier()
        # self.classifier = Pipeline([
        #     ('vect', TfidfVectorizer(stop_words='english')),
@@ -219,7 +220,7 @@ class Classifier:
                for item_num, tag_id, in enumerate(self.links_tags):
                    if tag_id in tags_list_from_entity and self.likelihood_list[item_num] < 0.1:
                        self.likelihood_list[item_num] += 0.1
-                        # print (self.likelihood_list)
+        # print (self.likelihood_list)

    def graph_results(self):
        # import numpy as np
@@ -309,93 +310,3 @@ def log_data(i=None, text_to_analise=None, tags=None):
    out_file.write('\n')
    out_file.close()

-
-def cross_validation():
-    ## _______ test from indentifyed news ______________
-    sql1 = '''SELECT rez.news_id , rez.title, rez.description
-                FROM(
-                        SELECT wp_esi_tag_news.news_id, wp_esi_news.title, wp_esi_news.description
-                           FROM wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id
-                              ORDER BY wp_esi_tag_news.tag_id ) AS rez
-    		                  ORDER BY rand() LIMIT 20'''
-
-    tags_classif.db.query(sql1)
-    news_results = tags_classif.db.store_result().fetch_row(maxrows=0)
-    total_score = 0
-    for i, (id_news, title, description) in enumerate(news_results):
-        text_for_analis = title.decode('ascii', 'ignore') + '\n' + description.decode('ascii', 'ignore')
-        tags_classif.classify(text_for_analis)
-        rez_accordance = {item[0] for item in tags_classif.tag_accordance}
-
-        sql = " SELECT tag_id FROM wp_esi_tag_news WHERE news_id =" + str(id_news)
-        results = tags_classif.db.query(sql)
-        tags_in_article = {int(tag) for (tag, ) in tags_classif.db.store_result().fetch_row(maxrows=0)}
-        if len(rez_accordance & tags_in_article) != 0:
-            total_score += 1
-        else:
-            print("\n\n#", str(i))
-            print("News title: " + title.decode('ascii', 'ignore'))
-            print("Model calculated Accordance  :", end=" ")
-            for item_tag in tags_classif.tags:
-                if int(item_tag[0]) in rez_accordance:
-                    print(item_tag[0], item_tag[1].decode('ascii', 'ignore'), end='; ')
-            print("\nUser classified tags for news: ", end="")
-            # print(set(rez_accordance))
-            for item_tag in tags_classif.tags:
-                if int(item_tag[0]) in tags_in_article:
-                    print(item_tag[0], item_tag[1].decode('ascii', 'ignore'), end='; ')
-
-    print("\n\nThe Model was tasted on ", len(news_results), " news. ")
-    print("The total accuracy is:", total_score / len(news_results))
-
-    # exit(0)
-
-if __name__ == "__main__":
-    try:
-        # if localhost database is not available then use server
-        tags_classif = Classifier()
-        print('\nI use SERVER DataBase.\n')
-    except _mysql.OperationalError:
-        # use server DataBase
-        print('\nI use local DataBase.\n')
-        tags_classif = Classifier(host='localhost', port=8080, user='root', password='password', db='news')
-    # the method is not implicated
-    from time import time
-    now = time()
-    tags_classif.teach_model()
-    print("_"*40, "\n TRAINING TIME IS ", (time()-now), 's\n', "_"*40)
-    # tags_classif.save()
-    # exit(0)
-
-    cross_validation()
-    exit(0)
-    # print("\n\n")
-
-    tags_classif.db.query(
-        "SELECT title, description, text, company_id FROM wp_esi_news_accept ORDER BY RAND() LIMIT 20")
-    result = tags_classif.db.store_result()
-
-    # news analysis witt title + description + text + company_id
-    for i, (title, description, text, entity_id) in enumerate(result.fetch_row(maxrows=0)):
-        text_for_analys = ''
-        if title is not None:
-            text_for_analys += title.decode("ascii", 'ignore') + '\n'
-        if description is not None:
-            text_for_analys += description.decode('ascii', 'ignore') + " "
-        if text is not None:
-            text_for_analys += text.decode("ascii", 'ignore')
-        print("\n#", str(i))
-        print('Title: ', title.decode("ascii", 'ignore'))
-        print("Descr.:", text.decode("ascii", 'ignore')[:80])
-        print('entity :', entity_id)
-        tags_results = tags_classif.classify(text_for_analys, entity_id=entity_id)
-
-        # accordance %
-        print("Accordance(#tag, %-accordance, tag_description): ")
-        print(tags_classif.tag_accordance)
-        # log_data(i=i, text_to_analise=text_for_analis, tags=tags_results)
-
-        # if i > 10: break
-
-        # Graph presentation results
-        # tags_classif.graph_results()