Commit b66f9757 authored by Tags

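Add a save method for data analysis. It writes the following files:

- info_model.txt -- summary of the model (number of features, number of tags)
- data_tag_frequency.csv -- model matrix
- Dictionary.txt -- the vectorizer's feature names, one per line
- tags.txt -- the tags, one per line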
parent a50a79e6
......@@ -4,6 +4,7 @@
import numpy as np
import _mysql
# test submodule commit
# noinspection PyUnresolvedReferences
......@@ -17,7 +18,7 @@ class Classifier:
except ImportError:
print ('You have import flowing packages: sklearn & nltk & re.')
stopwords = set (nltk.corpus.stopwords.words('english'))
stopwords = set (nltk.corpus.stopwords.words ('english'))
stopwords.update (['from:', 'subject:', 'writes:', 'writes', 'click', 'here', 'page', 'origin'])
if corpus is not None:
......@@ -113,8 +114,8 @@ class Classifier:
if data_text == '':
data_text = self.tag_train_text
self.trained_model = self.vectorizer.fit_transform (data_text)
print ("number of feaches =", len (self.vectorizer.get_feature_names ()))
print ("Quontety of tags = ", len (self.tags))
print ("number of features =", len (self.vectorizer.get_feature_names ()))
print ("Quantity of tags = ", len (self.tags))
self.tag_frequency_matrix_compute ()
return self.trained_model
......@@ -172,6 +173,21 @@ class Classifier:
def save(self):
import os
# save info data
file_info = open ("info_model.txt", 'w')
if file_info == None:
return False
file_info.write ("number of features = " + str (len (self.vectorizer.get_feature_names ())) + '\n')
file_info.write ("Quantity of tags = " + str (len (self.tags)) + '\n')
file_info.write ("\ndata_tag_frequency.csv ---- Model data matrix --- " + \
str (self.trained_model.shape) + "\n")
file_info.write ("\nDictionary.txt -- Total number words in is---" + str (
len (self.vectorizer.get_feature_names ())) + '\n')
file_info.write ("\ntags.txt --- Total number is ---" + str (len (self.tags)) + '\n')
file_info.close ()
# save model matrix
file_tag_frequency = 'data_tag_frequency.csv'
if os.path.isfile (file_tag_frequency):
os.remove (file_tag_frequency)
......@@ -182,6 +198,22 @@ class Classifier:
np.savetxt (file_tag_frequency, self.tag_frequency_matrix, delimiter=';', fmt='%1.4f')
print ("Data is saved into file: " + file_tag_frequency + " " + str (
int (os.stat (file_tag_frequency).st_size / 1024)) + 'kB')
# save dictionary
file_info_dictionaries = open ("Dictionary.txt", "w")
if file_info_dictionaries == None:
return False
for line in self.vectorizer.get_feature_names ():
file_info_dictionaries.write (line + "\n")
file_info_dictionaries.close ()
# save tags
file_info_tags = open ("tags.txt", "w")
if file_info_tags == None:
return False
for text1, text2 in self.tags:
file_info_tags.write (text1 + " " + text2.decode ('ascii', "ignore") + "\n")
file_info_tags.close ()
return True
......@@ -218,7 +250,7 @@ if __name__ == "__main__":
tags_classif = Classifier (host='localhost', port=8080, user='root', password='password', db='news')
# the method is not implicated
tags_classif.teach_model ()
# tags_classif.save ()
tags_classif.save ()
# exit(0)
sql1 = '''SELECT rez.news_id , rez.title, rez.description
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment