Improve the save method.

f9f1cbfb · Tags · 27909e71 · f9f1cbfb · f9f1cbfb
Commit f9f1cbfb authored Aug 21, 2017 by Tags
Hide whitespace changes
Inline Side-by-side

Showing with 51 additions and 44 deletions

model_data_analysis.py model_data_analysis.py +1 -1

news_classify_tag.py news_classify_tag.py +50 -43

No files found.
--- a/model_data_analysis.py
+++ b/model_data_analysis.py
@@ -97,8 +97,8 @@ def graph_results(in_data, title=''):
    figure()

    hist(data, bins=50)
+    show()
    savefig("fig.png")
-    # show()


 def text_clear(word):

--- a/news_classify_tag.py
+++ b/news_classify_tag.py
@@ -132,7 +132,7 @@ class Classifier:
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.neighbors import KNeighborsClassifier

-        self.vectorizer = CountVectorizer(min_df=2, stop_words='english', ngram_range=(1, 1))
+        self.vectorizer = CountVectorizer(min_df=2, stop_words='english', ngram_range=(1, 2))
        self.tfidf = TfidfTransformer()
        # self.classifier = MultinomialNB()
        self.classifier = KNeighborsClassifier()
@@ -146,9 +146,10 @@ class Classifier:

        # _________ ML learning posses ___________
        X_data = self.vectorizer.fit_transform(self.X_text_data)
-        # X_data = self.tfidf.fit_transform(X_data)
+        X_data = self.tfidf.fit_transform(X_data)
        self.classifier.fit(X_data, self.y_data)
        self.links_tags = set(self.y_data)
+        self.X_data = X_data

        # ___________________result usage __________________________
        # X_test = self.vectorarizer.transform(["Hello In the world from android programmers. My apps are so cool.",])
@@ -186,8 +187,10 @@ class Classifier:

    def classify(self, text, test_corpus=None, entity_id=0):
        """
+        Method which tests the given text for tag relevancy.

-        :type text: is text for analysis
+        :param text: the text to analyze.
+        :param entity_id: the entity id in the DB.
        """

        if text is None:
@@ -235,36 +238,40 @@ class Classifier:
        return True

    def save(self):
-        return
+        # return
        import os

        # save info data
-        file_info = open("info_model.txt", 'w')
+
+        directory = os.path.dirname(os.path.abspath(__file__))
+        file_name_dictionary = directory + '/info_model.txt'
+        file_info = open(file_name_dictionary, 'w')
        if file_info == None:
            return False
        file_info.write("number of features = " + str(len(self.vectorizer.get_feature_names())) + '\n')
        file_info.write("Quantity of tags = " + str(len(self.tags)) + '\n')
        file_info.write("\ndata_tag_frequency.csv  ---- Model data matrix --- " + \
-                        str(self.trained_model.shape) + "\n")
+                        str(self.X_data.shape) + "\n")
        file_info.write("\nDictionary.txt -- Total number words in  is---" + str(
            len(self.vectorizer.get_feature_names())) + '\n')
        file_info.write("\ntags.txt --- Total number is ---" + str(len(self.tags)) + '\n')
        file_info.close()

        # save model matrix
-        file_tag_frequency = 'data_tag_frequency.csv'
+        file_tag_frequency = directory + '/data_tag_frequency.csv'
        if os.path.isfile(file_tag_frequency):
            os.remove(file_tag_frequency)
        file_data = open(file_tag_frequency, "w")
        if file_data == None:
            print("Can't create data storage file")
            return False
-        np.savetxt(file_tag_frequency, self.tag_frequency_matrix, delimiter=';', fmt='%1.4f')
+        np.savetxt(file_tag_frequency, self.X_data.toarray(), delimiter=';', fmt='%1.4f')
        print("Data is saved into file: " + file_tag_frequency + " " + str(
            int(os.stat(file_tag_frequency).st_size / 1024)) + 'kB')

        # save dictionary
-        file_info_dictionaries = open("Dictionary.txt", "w")
+
+        file_info_dictionaries = open(directory + "/Dictionary.txt", "w")
        if file_info_dictionaries == None:
            return False
        for line in self.vectorizer.get_feature_names():
@@ -272,7 +279,7 @@ class Classifier:
        file_info_dictionaries.close()

        # save tags
-        file_info_tags = open("tags.txt", "w")
+        file_info_tags = open(directory + "/tags.txt", "w")
        if file_info_tags == None:
            return False
        for text1, text2 in self.tags:
@@ -314,41 +321,41 @@ if __name__ == "__main__":
        tags_classif = Classifier(host='localhost', port=8080, user='root', password='password', db='news')
    # the method is not implicated
    tags_classif.teach_model()
-    # tags_classif.save()
+    tags_classif.save()
    # exit(0)

-    # ## _______ test from indentifyed news ______________
-    # sql1 = '''SELECT rez.news_id , rez.title, rez.description
-    #         FROM(
-    #                 SELECT wp_esi_tag_news.news_id, wp_esi_news.title, wp_esi_news.description
-    #                    FROM wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id
-    #                       ORDER BY wp_esi_tag_news.tag_id ) AS rez
-		#                   ORDER BY rand() LIMIT 30'''
-    #
-    # tags_classif.db.query(sql1)
-    # news_results = tags_classif.db.store_result()
-    # news_results = news_results.fetch_row(maxrows=0)
-    # data = list()
-    # for i, (id_news, title, description) in enumerate(news_results):
-    #     text_for_analis = title.decode('ascii', 'ignore') + '\n' + description.decode('ascii', 'ignore')
-    #     # print("\n#", str(i))
-    #     print("\nNews title: " + title.decode('ascii', 'ignore'))
-    #     tags_classif.classify(text_for_analis)
-    #     print("Model calculated Accordance(#tag, %-accordance, tag_description): ", end=" ")
-    #     print(tags_classif.tag_accordance)
-    #     sql = " SELECT tag_id FROM wp_esi_tag_news WHERE news_id =" + str(id_news)
-    #     # print(sql)
-    #     results = tags_classif.db.query(sql)
-    #     tags = tags_classif.db.store_result()
-    #     tags = tags.fetch_row(maxrows=0)
-    #     # print(tags)
-    #     # print(tags_classif.tags)
-    #     print("User classified tags for present news:")
-    #     for (tag,) in tags:
-    #         tag = int(tag)
-    #         # print(tag)
-    #         print(str(tag) + " " + tags_classif.tags[tag - 1][1].decode('ascii', 'ignore'))
-    # # exit(0)
+    ## _______ test from identified news ______________
+    sql1 = '''SELECT rez.news_id , rez.title, rez.description
+            FROM(
+                    SELECT wp_esi_tag_news.news_id, wp_esi_news.title, wp_esi_news.description
+                       FROM wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id
+                          ORDER BY wp_esi_tag_news.tag_id ) AS rez
+		                  ORDER BY rand() LIMIT 30'''
+
+    tags_classif.db.query(sql1)
+    news_results = tags_classif.db.store_result()
+    news_results = news_results.fetch_row(maxrows=0)
+    data = list()
+    for i, (id_news, title, description) in enumerate(news_results):
+        text_for_analis = title.decode('ascii', 'ignore') + '\n' + description.decode('ascii', 'ignore')
+        # print("\n#", str(i))
+        print("\nNews title: " + title.decode('ascii', 'ignore'))
+        tags_classif.classify(text_for_analis)
+        print("Model calculated Accordance(#tag, %-accordance, tag_description): ", end=" ")
+        print(tags_classif.tag_accordance)
+        sql = " SELECT tag_id FROM wp_esi_tag_news WHERE news_id =" + str(id_news)
+        # print(sql)
+        results = tags_classif.db.query(sql)
+        tags = tags_classif.db.store_result()
+        tags = tags.fetch_row(maxrows=0)
+        # print(tags)
+        # print(tags_classif.tags)
+        print("User classified tags for present news:")
+        for (tag,) in tags:
+            tag = int(tag)
+            # print(tag)
+            print(str(tag) + " " + tags_classif.tags[tag - 1][1].decode('ascii', 'ignore'))
+    exit(0)
    # print("\n\n")

    tags_classif.db.query("SELECT title, description, text, company_id FROM wp_esi_news_accept ORDER BY RAND() LIMIT 50")