Commit 1ca27792 authored by Tags's avatar Tags

Add

1) read entities from Db
2) read tag-entity links into the entity_tags dict
3) Changed the classifier to naive Bayes
4) In the classify method, add an 'entity_id' int parameter to increase tag likelihood.
parent a1b6dbe9
......@@ -73,14 +73,42 @@ class Classifier:
if description is not None:
texts += description.decode('ascii', 'ignore')
if text is not None:
# if type(text) is bytes:
texts += text.decode('ascii', 'ignore')
# else:
# texts
if texts != '':
self.X_text_data.append(self.text_clear(texts))
self.y_data.append(int(tag_id))
# _____________ entity develop ______________
sql_entity = "SELECT id, name FROM wp_esi_entity"
self.db.query(sql_entity)
result_entity = self.db.store_result()
self.entity = list(
(id_entity, name.decode('ascii', "ignore")) for id_entity, name in result_entity.fetch_row(maxrows=0))
del result_entity
# print("total entitys # is ", len(self.entity))
sql_entity_news = """SELECT wp_esi_news_entity.entity_id, concat(wp_esi_news.title, wp_esi_news.description)
FROM wp_esi_news_entity, wp_esi_news WHERE wp_esi_news.id = wp_esi_news_entity.news_id ORDER BY wp_esi_news_entity.entity_id """
self.db.query(sql_entity_news)
result_entity_news = self.db.store_result()
self.entity_news = list(
(entity_id, news_text) for entity_id, news_text in result_entity_news.fetch_row(maxrows=0))
# print("total # of news linked with entity is", len(self.entity_news))
sql_entity_news = """ SELECT company_id, concat(title, text) FROM wp_esi_news_accept ORDER BY company_id """
# ________Entitys -- tags _________________
sql_entity = "SELECT entity_id, tag_id FROM wp_esi_tag_entity"
self.db.query(sql_entity)
rez = self.db.store_result()
self.entity_tags = dict()
for entity, item_tag in rez.fetch_row(maxrows=0):
if entity not in self.entity_tags.keys():
self.entity_tags[entity] = "" + item_tag
else:
self.entity_tags[entity] += " " + item_tag
# print(self.entity_tags)
# print(len(self.entity_tags))
# exit(0)
def text_clear(self, texts):
import re
from stemming.porter2 import stem
......@@ -103,7 +131,8 @@ class Classifier:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
self.vectorizer = CountVectorizer(min_df=2, max_df=15, stop_words='english')
self.vectorizer = CountVectorizer(min_df=2, stop_words='english', ngram_range=(1, 1))
self.tfidf = TfidfTransformer()
self.classifier = MultinomialNB()
# self.classifier = Pipeline([
# ('vect', TfidfVectorizer(stop_words='english')),
......@@ -115,6 +144,7 @@ class Classifier:
# _________ ML learning posses ___________
X_data = self.vectorizer.fit_transform(self.X_text_data)
# X_data = self.tfidf.fit_transform(X_data)
self.classifier.fit(X_data, self.y_data)
self.links_tags = set(self.y_data)
......@@ -152,7 +182,7 @@ class Classifier:
return vector_accordance
def classify(self, text, test_corpus=None):
def classify(self, text, test_corpus=None, entity_id=0):
"""
:type text: is text for analysis
......@@ -174,7 +204,17 @@ class Classifier:
rez = self.classifier.predict_proba(matrix_test_data)
# exit(0)
self.likelihood_list = rez[0]
# print (self.likelihood_list)
print("Entitys tags:", end="")
if entity_id != 0:
if entity_id in self.entity_tags.keys():
rez = str(self.entity_tags[entity_id])
tags_list_from_entity = list(int(item) for item in rez.split(" "))
print(tags_list_from_entity)
for item_num, tag_id, in enumerate(self.links_tags):
if tag_id in tags_list_from_entity and self.likelihood_list[item_num] < 0.1:
self.likelihood_list[item_num] += 0.1
# print (self.likelihood_list)
def graph_results(self):
# import numpy as np
......@@ -275,43 +315,45 @@ if __name__ == "__main__":
# tags_classif.save()
# exit(0)
sql1 = '''SELECT rez.news_id , rez.title, rez.description
FROM(
SELECT wp_esi_tag_news.news_id, wp_esi_news.title, wp_esi_news.description
FROM wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id
ORDER BY wp_esi_tag_news.tag_id ) AS rez
ORDER BY rand() LIMIT 30'''
tags_classif.db.query(sql1)
news_results = tags_classif.db.store_result()
news_results = news_results.fetch_row(maxrows=0)
data = list()
for i, (id_news, title, description) in enumerate(news_results):
text_for_analis = title.decode('ascii', 'ignore') + '\n' + description.decode('ascii', 'ignore')
# print("\n#", str(i))
print("\nNews title: " + title.decode('ascii', 'ignore'))
tags_classif.classify(text_for_analis)
print("Model calculated Accordance(#tag, %-accordance, tag_description): ", end=" ")
print(tags_classif.tag_accordance)
sql = " SELECT tag_id FROM wp_esi_tag_news WHERE news_id =" + str(id_news)
# print(sql)
results = tags_classif.db.query(sql)
tags = tags_classif.db.store_result()
tags = tags.fetch_row(maxrows=0)
# print(tags)
# print(tags_classif.tags)
print("User classified tags for present news:")
for (tag,) in tags:
tag = int(tag)
# print(tag)
print(str(tag) + " " + tags_classif.tags[tag - 1][1].decode('ascii', 'ignore'))
# exit(0)
print("\n\n")
tags_classif.db.query("SELECT title, description, text FROM wp_esi_news_accept ORDER BY RAND() LIMIT 50")
# ## _______ test from indentifyed news ______________
# sql1 = '''SELECT rez.news_id , rez.title, rez.description
# FROM(
# SELECT wp_esi_tag_news.news_id, wp_esi_news.title, wp_esi_news.description
# FROM wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id
# ORDER BY wp_esi_tag_news.tag_id ) AS rez
# ORDER BY rand() LIMIT 30'''
#
# tags_classif.db.query(sql1)
# news_results = tags_classif.db.store_result()
# news_results = news_results.fetch_row(maxrows=0)
# data = list()
# for i, (id_news, title, description) in enumerate(news_results):
# text_for_analis = title.decode('ascii', 'ignore') + '\n' + description.decode('ascii', 'ignore')
# # print("\n#", str(i))
# print("\nNews title: " + title.decode('ascii', 'ignore'))
# tags_classif.classify(text_for_analis)
# print("Model calculated Accordance(#tag, %-accordance, tag_description): ", end=" ")
# print(tags_classif.tag_accordance)
# sql = " SELECT tag_id FROM wp_esi_tag_news WHERE news_id =" + str(id_news)
# # print(sql)
# results = tags_classif.db.query(sql)
# tags = tags_classif.db.store_result()
# tags = tags.fetch_row(maxrows=0)
# # print(tags)
# # print(tags_classif.tags)
# print("User classified tags for present news:")
# for (tag,) in tags:
# tag = int(tag)
# # print(tag)
# print(str(tag) + " " + tags_classif.tags[tag - 1][1].decode('ascii', 'ignore'))
# # exit(0)
# print("\n\n")
tags_classif.db.query("SELECT title, description, text, company_id FROM wp_esi_news_accept ORDER BY RAND() LIMIT 50")
result = tags_classif.db.store_result()
# news analysis witt title + description + text
for i, (title, description, text) in enumerate(result.fetch_row(maxrows=0)):
# news analysis witt title + description + text + company_id
for i, (title, description, text, entity_id) in enumerate(result.fetch_row(maxrows=0)):
text_for_analys = ''
if title is not None:
text_for_analys += title.decode("ascii", 'ignore') + '\n'
......@@ -322,7 +364,8 @@ if __name__ == "__main__":
print("\n#", str(i))
print('Title: ', title.decode("ascii", 'ignore'))
print("Descr.:", text.decode("ascii", 'ignore')[:80])
tags_results = tags_classif.classify(text_for_analys)
print('entity :', entity_id)
tags_results = tags_classif.classify(text_for_analys, entity_id=entity_id)
# accordance %
print("Accordance(#tag, %-accordance, tag_description): ")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment