Commit ca1fa8a0 authored by Tags's avatar Tags

Fix init issues: clear DB info.

Clean up main and cross-validation methods.
parent 9e95fb2f
# Demo: sentence segmentation with spaCy.
import spacy

nlp = spacy.load('en_core_web_md')
doc = nlp(u'This is the most interesting story about USA and Nokia. All is Ok. Samsung is bad company.')
# A spaCy Doc exposes sentence boundaries via the `sents` iterator;
# there is no `sentence` attribute (the original raised AttributeError).
for sentence in doc.sents:
    print(sentence)
......@@ -8,7 +8,7 @@ import _mysql
# noinspection PyUnresolvedReferences
class Classifier:
def __init__(self, corpus=None, host='176.58.117.151', user='esi', password='esi12345', db='esi', port=3306):
def __init__(self, corpus=None, host="", user="", password="", db="", port=0):
try:
# import nltk
import operator
......@@ -23,6 +23,9 @@ class Classifier:
# self.train_data = corpus
# return
# Extract data form DataBase
self.db = _mysql.connect(host=host, port=port, user=user, passwd=password, db=db)
# load external dictionary
try:
directory = os.path.dirname(os.path.abspath(__file__))
......@@ -36,9 +39,6 @@ class Classifier:
for word in file_dict:
self.correct_words.add(word[:-1])
# Extract data form DataBase
self.db = _mysql.connect(host=host, port=port, user=user, passwd=password, db=db)
# getting tags
self.db.query("SELECT id, name FROM wp_esi_tag")
rez = self.db.store_result()
......@@ -46,7 +46,6 @@ class Classifier:
for id, description in rez.fetch_row(maxrows=0):
tags.append((int(id), description))
self.tags_dict = dict(tags)
self.tags = tags
del tags
......@@ -61,12 +60,10 @@ class Classifier:
texts = re.sub(r'\s', " ", texts)
texts = texts.split(" ")
texts = [word.lower() for word in texts if word not in stopwords]
# texts = [word for word in texts ]
# Addition incorrect words
# texts = [word for word in texts if word not in stopwords and word in self.correct_words]
texts = [stem(word) for word in texts]
texts = " ".join(texts)
# print(texts)
return texts
def teach_model(self, data_text=''):
......@@ -130,7 +127,11 @@ class Classifier:
self.vectorizer = CountVectorizer(min_df=2, stop_words='english', ngram_range=(1, 1))
self.tfidf = TfidfTransformer()
self.classifier = KNeighborsClassifier()
# k-mean model
# self.classifier = KNeighborsClassifier()
# The Naive Bayes
self.classifier = MultinomialNB(alpha=2)
# Multilyer Perseptron
# self.classifier = MLPClassifier()
# self.classifier = Pipeline([
# ('vect', TfidfVectorizer(stop_words='english')),
......@@ -219,7 +220,7 @@ class Classifier:
for item_num, tag_id, in enumerate(self.links_tags):
if tag_id in tags_list_from_entity and self.likelihood_list[item_num] < 0.1:
self.likelihood_list[item_num] += 0.1
# print (self.likelihood_list)
# print (self.likelihood_list)
def graph_results(self):
# import numpy as np
......@@ -309,93 +310,3 @@ def log_data(i=None, text_to_analise=None, tags=None):
out_file.write('\n')
out_file.close()
def cross_validation():
    """Estimate tagging accuracy of the model on user-tagged news.

    Fetches 20 random news items that already carry user-assigned tags
    (ground truth), classifies each with the module-level ``tags_classif``
    instance, and scores a hit whenever the predicted tag set intersects
    the user's tag set.  Each miss is printed with both tag sets for
    inspection; finally the overall accuracy (hits / sample size) is
    printed.

    Side effects: issues queries on ``tags_classif.db`` and writes to
    stdout.  Must be called after ``tags_classif`` is constructed and
    trained (see the ``__main__`` section).
    """
    # Random sample of news that users have already tagged (ground truth).
    sql1 = '''SELECT rez.news_id , rez.title, rez.description
FROM(
SELECT wp_esi_tag_news.news_id, wp_esi_news.title, wp_esi_news.description
FROM wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id
ORDER BY wp_esi_tag_news.tag_id ) AS rez
ORDER BY rand() LIMIT 20'''
    tags_classif.db.query(sql1)
    news_results = tags_classif.db.store_result().fetch_row(maxrows=0)
    total_score = 0
    for i, (id_news, title, description) in enumerate(news_results):
        text_for_analis = title.decode('ascii', 'ignore') + '\n' + description.decode('ascii', 'ignore')
        tags_classif.classify(text_for_analis)
        # Tag ids the model predicted for this item.
        rez_accordance = {item[0] for item in tags_classif.tag_accordance}
        # Tag ids the users assigned.  NOTE(review): SQL built by string
        # concatenation — id_news comes from our own DB here, but this
        # should use a parameterized query if the source ever changes.
        sql = " SELECT tag_id FROM wp_esi_tag_news WHERE news_id =" + str(id_news)
        tags_classif.db.query(sql)
        tags_in_article = {int(tag) for (tag, ) in tags_classif.db.store_result().fetch_row(maxrows=0)}
        if rez_accordance & tags_in_article:
            # At least one predicted tag matches a user tag: count a hit.
            total_score += 1
        else:
            # Miss: show model prediction vs. user ground truth.
            print("\n\n#", str(i))
            print("News title: " + title.decode('ascii', 'ignore'))
            print("Model calculated Accordance :", end=" ")
            for item_tag in tags_classif.tags:
                if int(item_tag[0]) in rez_accordance:
                    print(item_tag[0], item_tag[1].decode('ascii', 'ignore'), end='; ')
            print("\nUser classified tags for news: ", end="")
            for item_tag in tags_classif.tags:
                if int(item_tag[0]) in tags_in_article:
                    print(item_tag[0], item_tag[1].decode('ascii', 'ignore'), end='; ')
    print("\n\nThe Model was tested on ", len(news_results), " news. ")
    print("The total accuracy is:", total_score / len(news_results))
# exit(0)
if __name__ == "__main__":
    try:
        # Try the default (server) database first.
        tags_classif = Classifier()
        print('\nI use SERVER DataBase.\n')
    except _mysql.OperationalError:
        # Server unreachable: fall back to the local database.
        print('\nI use local DataBase.\n')
        tags_classif = Classifier(host='localhost', port=8080, user='root', password='password', db='news')

    # Train the model and report wall-clock training time.
    from time import time
    now = time()
    tags_classif.teach_model()
    print("_"*40, "\n TRAINING TIME IS ", (time()-now), 's\n', "_"*40)
    # tags_classif.save()
    cross_validation()
    exit(0)

    # NOTE(review): everything below is unreachable because of the
    # exit(0) above — kept as a manual smoke-test; delete the exit(0)
    # to run it.
    tags_classif.db.query(
        "SELECT title, description, text, company_id FROM wp_esi_news_accept ORDER BY RAND() LIMIT 20")
    result = tags_classif.db.store_result()
    # News analysis with title + description + text + company_id.
    for i, (title, description, text, entity_id) in enumerate(result.fetch_row(maxrows=0)):
        text_for_analys = ''
        # Any of the three text columns may be NULL; concatenate only
        # the ones that are present.
        if title is not None:
            text_for_analys += title.decode("ascii", 'ignore') + '\n'
        if description is not None:
            text_for_analys += description.decode('ascii', 'ignore') + " "
        if text is not None:
            text_for_analys += text.decode("ascii", 'ignore')
        print("\n#", str(i))
        # Guard the NULL columns here too (the original decoded them
        # unconditionally and printed `text` under the "Descr." label).
        if title is not None:
            print('Title: ', title.decode("ascii", 'ignore'))
        if description is not None:
            print("Descr.:", description.decode("ascii", 'ignore')[:80])
        print('entity :', entity_id)
        tags_results = tags_classif.classify(text_for_analys, entity_id=entity_id)
        # Accordance: list of (tag id, % accordance, tag description).
        print("Accordance(#tag, %-accordance, tag_description): ")
        print(tags_classif.tag_accordance)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment