Commit 6c4536c4 authored by Tags

Method: cross-validation

parent 7a5d8e24
@@ -92,7 +92,7 @@ def save():
 def graph_results(in_data, title=''):
     # import numpy as np
-    from pylab import figure, hist, savefig
+    from pylab import figure, hist, savefig, show
     data = np.asarray(in_data)
     figure()
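The amended import pulls in pylab's show(), which suggests the histogram is now displayed interactively as well as written to disk. A minimal sketch of such a helper, with the function name, bin count and output file name chosen purely for illustration:

import numpy as np
from pylab import figure, hist, savefig, show

def plot_histogram(in_data, title='results'):
    # Illustrative stand-in for graph_results(): histogram the data,
    # save the figure, then open an interactive window.
    data = np.asarray(in_data)
    figure()
    hist(data, bins=20)        # bin count is arbitrary for this sketch
    savefig(title + '.png')    # persist the plot to disk
    show()                     # the newly imported show() displays it

plot_histogram([0.2, 0.4, 0.4, 0.7, 0.9], title='accuracy_scores')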
@@ -152,11 +152,16 @@ if __name__ == '__main__':
     # there are some words with great frequency
-    very_frequent_words = dict([(key, value) for key, value in words_frequency.items() if value > 20])
+    very_frequent_words = dict([(key, value) for key, value in words_frequency.items() if value > 100])
     print('There are ', len(very_frequent_words), ' very frequent words: ')
     for item in sorted(very_frequent_words, key=very_frequent_words.get, reverse=True):
         print(item, ":", very_frequent_words[item], end=", ")
+    un_frequent_words = dict([(key, value) for key, value in words_frequency.items() if value < 10])
+    print('\nThere are ', len(un_frequent_words), ' very infrequent words: ')
+    for item in sorted(un_frequent_words, key=un_frequent_words.get, reverse=True):
+        print(item, ":", un_frequent_words[item], end=", ")
     # compare the set of words from the news against the reference DICTIONARY
     dict_standart = set([text_clear(word) for word in open('large.txt', 'r').read().split('\n')])
+    dict_standart1 = set([word for word in open('large.txt', 'r').read().split('\n')])
......
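The added block splits the frequency counter at the new thresholds and loads a reference word list (large.txt) so the news vocabulary can be compared against dictionary words. A standalone sketch of that comparison, assuming one word per line in the file and a words_frequency counter built from the news texts; a simple lower() stands in for the script's text_clear() helper:

from collections import Counter

# Stand-in corpus; in the script words_frequency is built from the news articles.
words_frequency = Counter("the cat sat on the mat while the dog slept".split())

very_frequent_words = {w: c for w, c in words_frequency.items() if c > 100}
un_frequent_words = {w: c for w, c in words_frequency.items() if c < 10}

# Compare the news vocabulary against the reference dictionary.
with open('large.txt', 'r') as f:
    dict_standart = set(word.strip().lower() for word in f.read().split('\n'))

out_of_dictionary = set(words_frequency) - dict_standart
print(len(out_of_dictionary), 'corpus words are missing from the reference dictionary')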
@@ -6,8 +6,6 @@ import numpy as np
 import _mysql
-# test submodule commit
-# noinspection PyUnresolvedReferences
 class Classifier:
     def __init__(self, corpus=None, host='176.58.117.151', user='esi', password='esi12345', db='esi', port=3306):
@@ -120,7 +118,7 @@ class Classifier:
         texts = [word.lower() for word in texts]
         texts = [word for word in texts if word not in stopwords]
         # Optionally also filter out incorrect (out-of-dictionary) words:
-        # and word in self.correct_words]
+        # texts = [word for word in texts if word not in stopwords and word in self.correct_words]
         texts = [stem(word) for word in texts]
         texts = " ".join(texts)
         return texts
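Taken together, the method lowercases the tokens, drops stopwords, stems what is left and re-joins the words into one string. A self-contained sketch of the same normalisation, using NLTK's stopword list and Porter stemmer as stand-ins for the module-level stopwords and stem used in the class:

from nltk.corpus import stopwords as nltk_stopwords   # requires nltk.download('stopwords') once
from nltk.stem import PorterStemmer

_stopwords = set(nltk_stopwords.words('english'))
_stemmer = PorterStemmer()

def normalise(tokens):
    # Lowercase, drop stopwords, stem, and join back into a single string.
    tokens = [word.lower() for word in tokens]
    tokens = [word for word in tokens if word not in _stopwords]
    tokens = [_stemmer.stem(word) for word in tokens]
    return " ".join(tokens)

print(normalise(["The", "markets", "were", "falling", "sharply"]))  # -> "market fall sharpli"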
@@ -130,11 +128,12 @@ class Classifier:
         from sklearn.pipeline import Pipeline
         from sklearn.naive_bayes import MultinomialNB
         from sklearn.neighbors import KNeighborsClassifier
+        from sklearn.neural_network import MLPClassifier
         self.vectorizer = CountVectorizer(min_df=2, stop_words='english', ngram_range=(1, 1))
         self.tfidf = TfidfTransformer()
         # self.classifier = MultinomialNB()
-        self.classifier = KNeighborsClassifier()
+        # self.classifier = KNeighborsClassifier()
+        self.classifier = MLPClassifier()
         # self.classifier = Pipeline([
         #     ('vect', TfidfVectorizer(stop_words='english')),
         #     ('clf', MultinomialNB()),
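The vectoriser, the TF-IDF transform and the classifier are kept as separate attributes here; the commented-out block hints at the equivalent scikit-learn Pipeline. For illustration only, the same chain with the newly chosen MLPClassifier on a toy corpus (the texts, labels, min_df=1, max_iter and random_state below are made up so the tiny example runs; the class itself uses min_df=2):

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline

model = Pipeline([
    ('vect', CountVectorizer(min_df=1, stop_words='english', ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer()),
    ('clf', MLPClassifier(max_iter=500, random_state=0)),  # extra iterations so the toy data converges
])

texts = ["stocks rally on strong earnings",
         "team wins the championship game",
         "markets fall as interest rates rise",
         "player scores twice in the final match"]
labels = ["finance", "sport", "finance", "sport"]

model.fit(texts, labels)
print(model.predict(["rates and stocks move the markets"]))  # expected: ['finance']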
@@ -332,8 +331,8 @@ def cross_validation():
         if len(rez_accordance & tags_in_article) != 0:
             total_score += 1
         else:
-            print("\n#", str(i))
-            print("\nNews title: " + title.decode('ascii', 'ignore'))
+            print("\n\n#", str(i))
+            print("News title: " + title.decode('ascii', 'ignore'))
             print("Model calculated Accordance :", end=" ")
             for item_tag in tags_classif.tags:
                 if int(item_tag[0]) in rez_accordance:
@@ -345,7 +344,7 @@ def cross_validation():
            print(item_tag[0], item_tag[1].decode('ascii', 'ignore'), end='; ')
     print("\n\nThe Model was tested on ", len(news_results), " news items. ")
-    print("The total result is:", total_score / len(news_results))
+    print("The total accuracy is:", total_score / len(news_results))
     # exit(0)
......
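The renamed print matches what the loop actually computes: the share of test articles whose predicted tag set overlaps the tags attached to the article. A stripped-down sketch of that scoring rule, on hypothetical predictions and ground-truth tag id sets:

# Hypothetical model output: predicted tag ids per article, paired with the true tag ids.
predictions = [{1, 4}, {2}, {7}, {3, 5}]
ground_truth = [{4}, {2, 9}, {1}, {6}]

total_score = sum(1 for predicted, actual in zip(predictions, ground_truth)
                  if len(predicted & actual) != 0)   # a hit if the sets intersect at all
print("The total accuracy is:", total_score / len(ground_truth))  # -> 0.5 for this data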