Commit f9f1cbfb authored by Tags

Improve the Classifier.save method.

parent 27909e71
......@@ -97,8 +97,8 @@ def graph_results(in_data, title=''):
hist(data, bins=50)
# show()
def text_clear(word):
......@@ -132,7 +132,7 @@ class Classifier:
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
self.vectorizer = CountVectorizer(min_df=2, stop_words='english', ngram_range=(1, 1))
self.vectorizer = CountVectorizer(min_df=2, stop_words='english', ngram_range=(1, 2))
self.tfidf = TfidfTransformer()
# self.classifier = MultinomialNB()
self.classifier = KNeighborsClassifier()
......@@ -146,9 +146,10 @@ class Classifier:
# _________ ML learning process ___________
X_data = self.vectorizer.fit_transform(self.X_text_data)
# X_data = self.tfidf.fit_transform(X_data)
X_data = self.tfidf.fit_transform(X_data), self.y_data)
self.links_tags = set(self.y_data)
self.X_data = X_data
# ___________________result usage __________________________
# X_test = self.vectorarizer.transform(["Hello In the world from android programmers. My apps are so cool.",])
......@@ -186,8 +187,10 @@ class Classifier:
def classify(self, text, test_corpus=None, entity_id=0):
Method which tests the :type:text for tag relevancy
:type text: is text for analysis
:type text: is text for analysis.
:type entity_id: is the entity id in the DB.
if text is None:
......@@ -235,36 +238,40 @@ class Classifier:
return True
def save(self):
# return
import os
# save info data
file_info = open("info_model.txt", 'w')
directory = os.path.dirname(os.path.abspath(__file__))
file_name_dictionary = directory + '/info_model.txt'
file_info = open(file_name_dictionary, 'w')
if file_info == None:
return False
file_info.write("number of features = " + str(len(self.vectorizer.get_feature_names())) + '\n')
file_info.write("Quantity of tags = " + str(len(self.tags)) + '\n')
file_info.write("\ndata_tag_frequency.csv ---- Model data matrix --- " + \
str(self.trained_model.shape) + "\n")
str(self.X_data.shape) + "\n")
file_info.write("\nDictionary.txt -- Total number words in is---" + str(
len(self.vectorizer.get_feature_names())) + '\n')
file_info.write("\ntags.txt --- Total number is ---" + str(len(self.tags)) + '\n')
# save model matrix
file_tag_frequency = 'data_tag_frequency.csv'
file_tag_frequency = directory + '/data_tag_frequency.csv'
if os.path.isfile(file_tag_frequency):
file_data = open(file_tag_frequency, "w")
if file_data == None:
print("Can't create data storage file")
return False
np.savetxt(file_tag_frequency, self.tag_frequency_matrix, delimiter=';', fmt='%1.4f')
np.savetxt(file_tag_frequency, self.X_data.toarray(), delimiter=';', fmt='%1.4f')
print("Data is saved into file: " + file_tag_frequency + " " + str(
int(os.stat(file_tag_frequency).st_size / 1024)) + 'kB')
# save dictionary
file_info_dictionaries = open("Dictionary.txt", "w")
file_info_dictionaries = open(directory + "/Dictionary.txt", "w")
if file_info_dictionaries == None:
return False
for line in self.vectorizer.get_feature_names():
......@@ -272,7 +279,7 @@ class Classifier:
# save tags
file_info_tags = open("tags.txt", "w")
file_info_tags = open(directory + "/tags.txt", "w")
if file_info_tags == None:
return False
for text1, text2 in self.tags:
......@@ -314,41 +321,41 @@ if __name__ == "__main__":
tags_classif = Classifier(host='localhost', port=8080, user='root', password='password', db='news')
# the method is not implicated
# exit(0)
# ## _______ test from identified news ______________
# sql1 = '''SELECT rez.news_id , rez.title, rez.description
# SELECT wp_esi_tag_news.news_id, wp_esi_news.title, wp_esi_news.description
# FROM wp_esi_news, wp_esi_tag_news WHERE
# ORDER BY wp_esi_tag_news.tag_id ) AS rez
# ORDER BY rand() LIMIT 30'''
# tags_classif.db.query(sql1)
# news_results = tags_classif.db.store_result()
# news_results = news_results.fetch_row(maxrows=0)
# data = list()
# for i, (id_news, title, description) in enumerate(news_results):
# text_for_analis = title.decode('ascii', 'ignore') + '\n' + description.decode('ascii', 'ignore')
# # print("\n#", str(i))
# print("\nNews title: " + title.decode('ascii', 'ignore'))
# tags_classif.classify(text_for_analis)
# print("Model calculated Accordance(#tag, %-accordance, tag_description): ", end=" ")
# print(tags_classif.tag_accordance)
# sql = " SELECT tag_id FROM wp_esi_tag_news WHERE news_id =" + str(id_news)
# # print(sql)
# results = tags_classif.db.query(sql)
# tags = tags_classif.db.store_result()
# tags = tags.fetch_row(maxrows=0)
# # print(tags)
# # print(tags_classif.tags)
# print("User classified tags for present news:")
# for (tag,) in tags:
# tag = int(tag)
# # print(tag)
# print(str(tag) + " " + tags_classif.tags[tag - 1][1].decode('ascii', 'ignore'))
# # exit(0)
## _______ test from identified news ______________
sql1 = '''SELECT rez.news_id , rez.title, rez.description
SELECT wp_esi_tag_news.news_id, wp_esi_news.title, wp_esi_news.description
FROM wp_esi_news, wp_esi_tag_news WHERE
ORDER BY wp_esi_tag_news.tag_id ) AS rez
ORDER BY rand() LIMIT 30'''
news_results = tags_classif.db.store_result()
news_results = news_results.fetch_row(maxrows=0)
data = list()
for i, (id_news, title, description) in enumerate(news_results):
text_for_analis = title.decode('ascii', 'ignore') + '\n' + description.decode('ascii', 'ignore')
# print("\n#", str(i))
print("\nNews title: " + title.decode('ascii', 'ignore'))
print("Model calculated Accordance(#tag, %-accordance, tag_description): ", end=" ")
sql = " SELECT tag_id FROM wp_esi_tag_news WHERE news_id =" + str(id_news)
# print(sql)
results = tags_classif.db.query(sql)
tags = tags_classif.db.store_result()
tags = tags.fetch_row(maxrows=0)
# print(tags)
# print(tags_classif.tags)
print("User classified tags for present news:")
for (tag,) in tags:
tag = int(tag)
# print(tag)
print(str(tag) + " " + tags_classif.tags[tag - 1][1].decode('ascii', 'ignore'))
# print("\n\n")
tags_classif.db.query("SELECT title, description, text, company_id FROM wp_esi_news_accept ORDER BY RAND() LIMIT 50")
