Commit d71c4654 authored by Andrii Marynets's avatar Andrii Marynets
parents 9b6fec7b f505b55e
......@@ -8,7 +8,7 @@ import _mysql
# noinspection PyUnresolvedReferences
class Classifier:
def __init__(self, corpus=None, host='176.58.117.151', user='esi', password='esi12345', db='esi', port=3306):
def __init__(self, corpus=None, host="", user="", password="", db="", port=0):
try:
# import nltk
import operator
......@@ -16,12 +16,17 @@ class Classifier:
except ImportError:
print('You have import flowing packages: sklearn & nltk & re.')
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords.update(['from:', 'subject:', 'writes:', 'writes', 'click', 'here', 'page', 'origin'])
if corpus is not None:
self.train_data = corpus
return
# stopwords for text_clearner Method
self.stopwords = set(nltk.corpus.stopwords.words('english'))
self.stopwords.update(['from:', 'subject:', 'writes:', 'writes', 'click', 'here', 'page', 'origin'])
# if corpus is not None:
# self.train_data = corpus
# return
# Extract data form DataBase
self.db = _mysql.connect(host=host, port=port, user=user, passwd=password, db=db)
# load external dictionary
try:
......@@ -36,8 +41,11 @@ class Classifier:
for word in file_dict:
self.correct_words.add(word[:-1])
# Extract data form DataBase
self.db = _mysql.connect(host=host, port=port, user=user, passwd=password, db=db)
#
# from stemming.porter2 import stem
from nltk.stem import PorterStemmer
self.stem = PorterStemmer().stem
# self.stem.
# getting tags
self.db.query("SELECT id, name FROM wp_esi_tag")
......@@ -46,31 +54,37 @@ class Classifier:
for id, description in rez.fetch_row(maxrows=0):
tags.append((int(id), description))
self.tags_dict = dict(tags)
self.tags = tags
# print('tags array = ', len(self.tags), self.tags)
# print(self.tags_dict)
# print(self.tags_dict[355])
# exit(0)
del tags
# train_data = []
# text_id = []
# sql1 = '''SELECT wp_esi_tag_news.tag_id, wp_esi_news.title, wp_esi_news.description
# FROM wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id
# ORDER BY wp_esi_tag_news.tag_id LIMIT 1000 '''
# self.db.query(sql1)
# result = self.db.store_result()
# data = list()
# for tag_id, title, description in result.fetch_row(maxrows=0):
# data.append((tag_id, title, description))
def text_clear(self, texts):
    """
    Normalise raw text into a space-separated string of stemmed tokens.

    Digits are removed, a fixed set of punctuation characters is replaced
    by spaces, the text is tokenised with NLTK's ``word_tokenize``, tokens
    are lower-cased, words found in ``self.stopwords`` are dropped and the
    remaining tokens are passed through ``self.stem``.

    :param texts: text to clean
    :return: the cleaned tokens joined by single spaces
    """
    import re
    from nltk.tokenize import word_tokenize
    texts = re.sub(r'\d', "", texts)
    # The backslash is spelled "\\" explicitly: the former "\/" relied on an
    # invalid escape sequence. The table still maps 10 characters to spaces.
    texts = texts.translate(str.maketrans("?!,.+-:;\\/", 10 * " "))
    tokens = [word.lower() for word in word_tokenize(texts)]
    # Filter AFTER lower-casing so that capitalised stop words ("The",
    # "Here", ...) are removed as well; the stop-word entries are lower-case.
    tokens = [word for word in tokens if word and word not in self.stopwords]
    try:
        tokens = [self.stem(word) for word in tokens]
    except Exception:
        # Best effort: if the stemmer fails, keep the unstemmed tokens.
        pass
    return " ".join(tokens)
def teach_model(self, data_text=''):
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.neural_network import MLPClassifier
sql1 = '''SELECT wp_esi_ml_data.tag_id, wp_esi_news_accept.title, wp_esi_news_accept.description, wp_esi_news_accept.text
FROM wp_esi_news_accept, wp_esi_ml_data WHERE wp_esi_ml_data.news_id=wp_esi_news_accept.id
ORDER BY wp_esi_ml_data.tag_id'''
FROM wp_esi_news_accept, wp_esi_ml_data WHERE wp_esi_ml_data.news_id=wp_esi_news_accept.id
ORDER BY wp_esi_ml_data.tag_id'''
self.db.query(sql1)
result = self.db.store_result()
self.X_text_data = list()
self.y_data = list()
for tag_id, title, description, text in result.fetch_row(maxrows=0):
......@@ -84,6 +98,8 @@ class Classifier:
if texts != '':
self.X_text_data.append(self.text_clear(texts))
self.y_data.append(int(tag_id))
# print(self.X_text_data)
# print(self.y_data)
# _____________ entity develop ______________
sql_entity = "SELECT id, name FROM wp_esi_entity"
......@@ -94,13 +110,12 @@ class Classifier:
del result_entity
# print("total entitys # is ", len(self.entity))
sql_entity_news = """SELECT wp_esi_news_entity.entity_id, concat(wp_esi_news.title, wp_esi_news.description)
FROM wp_esi_news_entity, wp_esi_news WHERE wp_esi_news.id = wp_esi_news_entity.news_id ORDER BY wp_esi_news_entity.entity_id """
FROM wp_esi_news_entity, wp_esi_news WHERE wp_esi_news.id = wp_esi_news_entity.news_id ORDER BY wp_esi_news_entity.entity_id """
self.db.query(sql_entity_news)
result_entity_news = self.db.store_result()
self.entity_news = list(
(entity_id, news_text) for entity_id, news_text in result_entity_news.fetch_row(maxrows=0))
# print("total # of news linked with entity is", len(self.entity_news))
sql_entity_news = """ SELECT company_id, concat(title, text) FROM wp_esi_news_accept ORDER BY company_id """
# ________Entitys -- tags _________________
sql_entity = "SELECT entity_id, tag_id FROM wp_esi_tag_entity"
......@@ -116,33 +131,14 @@ class Classifier:
# print(len(self.entity_tags))
# exit(0)
def text_clear(self, texts):
    """
    Turn raw text into a space-separated string of stemmed words.

    Every digit is replaced by a space, the text is split on single
    spaces, each piece is lower-cased, English stop words (plus a few
    mail/web boilerplate words) are dropped and the rest is stemmed with
    the porter2 stemmer.

    :param texts: text to clean
    :return: the stemmed words joined by single spaces
    """
    import re
    from stemming.porter2 import stem
    ignored = set(nltk.corpus.stopwords.words('english'))
    ignored.update(['from:', 'subject:', 'writes:', 'writes', 'click', 'here', 'page', 'origin'])
    without_digits = re.sub(r'\d', " ", texts)
    stemmed = []
    for piece in without_digits.split(" "):
        lowered = piece.lower()
        if lowered in ignored:
            continue
        stemmed.append(stem(lowered))
    return " ".join(stemmed)
def teach_model(self, data_text=''):
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
self.vectorizer = CountVectorizer(min_df=2, stop_words='english', ngram_range=(1, 1))
self.vectorizer = CountVectorizer(min_df=1, stop_words='english', ngram_range=(1, 1))
self.tfidf = TfidfTransformer()
self.classifier = KNeighborsClassifier()
# k-mean model
# self.classifier = KNeighborsClassifier()
# The Naive Bayes
self.classifier = MultinomialNB(alpha=2)
# Multilyer Perseptron
# self.classifier = MLPClassifier()
# self.classifier = Pipeline([
# ('vect', TfidfVectorizer(stop_words='english')),
......@@ -173,7 +169,7 @@ class Classifier:
# exit(0)
@property
def tag_accordance(self, persantage=50):
def tag_accordance(self, persantage=20):
"""
Class method for computing % af tag accordance
:return: tuple of % af tag accordance
......@@ -218,6 +214,7 @@ class Classifier:
if type(text) is bytes:
text.decode('ascii', 'ignore')
matrix_test_data = self.vectorizer.transform([self.text_clear(text_test)])
matrix_test_data = self.tfidf.fit_transform(matrix_test_data)
rez = self.classifier.predict_proba(matrix_test_data)
# exit(0)
self.likelihood_list = rez[0]
......@@ -231,7 +228,7 @@ class Classifier:
for item_num, tag_id, in enumerate(self.links_tags):
if tag_id in tags_list_from_entity and self.likelihood_list[item_num] < 0.1:
self.likelihood_list[item_num] += 0.1
# print (self.likelihood_list)
# print (self.likelihood_list)
def graph_results(self):
# import numpy as np
......@@ -321,90 +318,3 @@ def log_data(i=None, text_to_analise=None, tags=None):
out_file.write('\n')
out_file.close()
def cross_validation():
    """
    Score the trained model against news items that already carry tags.

    Uses the module-level ``tags_classif`` classifier: every tagged news
    item is fetched in random order, its title and description are
    classified, and the item counts as a hit when at least one tag from
    ``tag_accordance`` is among the tags stored for it in
    ``wp_esi_tag_news``. Misses are printed in detail, followed by the
    number of items tested and the overall accuracy.
    """
    # _______ test from identified news ______________
    sql1 = '''SELECT rez.news_id , rez.title, rez.description
FROM(
SELECT wp_esi_tag_news.news_id, wp_esi_news.title, wp_esi_news.description
FROM wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id
ORDER BY wp_esi_tag_news.tag_id ) AS rez
ORDER BY rand()'''
    tags_classif.db.query(sql1)
    news_results = tags_classif.db.store_result().fetch_row(maxrows=0)
    total_score = 0
    for i, (id_news, title, description) in enumerate(news_results):
        text_for_analis = title.decode('ascii', 'ignore') + '\n' + description.decode('ascii', 'ignore')
        tags_classif.classify(text_for_analis)
        rez_accordance = {item[0] for item in tags_classif.tag_accordance}
        # Coerce the id to int before building the query: the other columns
        # of this row are handled as bytes, and str() of a bytes id would
        # embed its b'...' repr in the SQL. int() also guarantees that only
        # a number is ever concatenated into the statement.
        sql = " SELECT tag_id FROM wp_esi_tag_news WHERE news_id =" + str(int(id_news))
        tags_classif.db.query(sql)
        tags_in_article = {int(tag) for (tag, ) in tags_classif.db.store_result().fetch_row(maxrows=0)}
        if rez_accordance & tags_in_article:
            total_score += 1
            continue
        # No overlap: report what the model proposed and what the user chose.
        print("\n\n#", str(i))
        print("News title: " + title.decode('ascii', 'ignore'))
        print("Model calculated Accordance :", end=" ")
        for item_tag in tags_classif.tags:
            if int(item_tag[0]) in rez_accordance:
                print(item_tag[0], item_tag[1].decode('ascii', 'ignore'), end='; ')
        print("\nUser classified tags for news: ", end="")
        for item_tag in tags_classif.tags:
            if int(item_tag[0]) in tags_in_article:
                print(item_tag[0], item_tag[1].decode('ascii', 'ignore'), end='; ')
    print("\n\nThe Model was tasted on ", len(news_results), " news. ")
    # An empty result set used to raise ZeroDivisionError; report 0.0 instead.
    accuracy = total_score / len(news_results) if news_results else 0.0
    print("The total accuracy is:", accuracy)
if __name__ == "__main__":
    # Script entry point: connect, train, run the cross-validation report.
    try:
        # First attempt: the Classifier's default connection settings.
        tags_classif = Classifier()
        print('\nI use SERVER DataBase.\n')
    except _mysql.OperationalError:
        # The default connection failed: fall back to a local database.
        print('\nI use local DataBase.\n')
        tags_classif = Classifier(host='localhost', port=8080, user='root', password='password', db='news')
    tags_classif.teach_model()
    # tags_classif.save()
    cross_validation()
    # NOTE(review): this exit stops the script here, so the sampling loop
    # below never runs; remove it to re-enable the per-article report.
    exit(0)
    tags_classif.db.query(
        "SELECT title, description, text, company_id FROM wp_esi_news_accept ORDER BY RAND() LIMIT 50")
    result = tags_classif.db.store_result()
    # news analysis with title + description + text + company_id
    for i, (title, description, text, entity_id) in enumerate(result.fetch_row(maxrows=0)):
        # Any column may be NULL. Decode each one once, mapping NULL to '',
        # so the prints below cannot fail on None the way the former
        # unguarded title.decode()/text.decode() calls could.
        title_str = title.decode("ascii", 'ignore') if title is not None else ''
        description_str = description.decode('ascii', 'ignore') if description is not None else ''
        text_str = text.decode("ascii", 'ignore') if text is not None else ''
        text_for_analys = ''
        if title is not None:
            text_for_analys += title_str + '\n'
        if description is not None:
            text_for_analys += description_str + " "
        if text is not None:
            text_for_analys += text_str
        print("\n#", str(i))
        print('Title: ', title_str)
        print("Descr.:", text_str[:80])
        print('entity :', entity_id)
        tags_results = tags_classif.classify(text_for_analys, entity_id=entity_id)
        # accordance %
        print("Accordance(#tag, %-accordance, tag_description): ")
        print(tags_classif.tag_accordance)
        # log_data(i=i, text_to_analise=text_for_analys, tags=tags_results)
        # if i > 10: break
    # Graph presentation results
    # tags_classif.graph_results()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment