Commit 566e05d9 authored by Tags

Fix Init issues clear: DB info.

Clear main, cross-validation Methods.
parent ca1fa8a0
......@@ -16,8 +16,10 @@ class Classifier:
except ImportError:
print('You have import flowing packages: sklearn & nltk & re.')
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords.update(['from:', 'subject:', 'writes:', 'writes', 'click', 'here', 'page', 'origin'])
# stopwords used by the text_clear method
self.stopwords = set(nltk.corpus.stopwords.words('english'))
self.stopwords.update(['from:', 'subject:', 'writes:', 'writes', 'click', 'here', 'page', 'origin'])
# if corpus is not None:
# self.train_data = corpus
......@@ -39,6 +41,12 @@ class Classifier:
for word in file_dict:
# from stemming.porter2 import stem
from nltk.stem import PorterStemmer
self.stem = PorterStemmer().stem
# self.stem.
# getting tags
self.db.query("SELECT id, name FROM wp_esi_tag")
rez = self.db.store_result()
......@@ -51,18 +59,17 @@ class Classifier:
def text_clear(self, texts):
import re
from stemming.porter2 import stem
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords.update(['from:', 'subject:', 'writes:', 'writes', 'click', 'here', 'page', 'origin'])
from nltk.tokenize import word_tokenize
texts = re.sub(r'\d', "", texts)
texts = re.sub(r'\s', " ", texts)
texts = texts.split(" ")
texts = [word.lower() for word in texts if word not in stopwords]
texts = texts.translate(str.maketrans("?!,.+-:;\/", 10*" "))
texts = word_tokenize(texts) #texts.split(" ")
texts = [word.lower() for word in texts if word not in self.stopwords]
# Optional extra filter (disabled): also drop words that are not in self.correct_words
# texts = [word for word in texts if word not in stopwords and word in self.correct_words]
texts = [stem(word) for word in texts]
texts = [self.stem(word) for word in texts if len(word) > 0]
texts = " ".join(texts)
return texts
......@@ -71,14 +78,13 @@ class Classifier:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
# from sklearn.neural_network import MLPClassifier
sql1 = '''SELECT wp_esi_ml_data.tag_id, wp_esi_news_accept.title, wp_esi_news_accept.description, wp_esi_news_accept.text
FROM wp_esi_news_accept, wp_esi_ml_data WHERE
ORDER BY wp_esi_ml_data.tag_id'''
result = self.db.store_result()
self.X_text_data = list()
self.y_data = list()
for tag_id, title, description, text in result.fetch_row(maxrows=0):
......@@ -92,6 +98,8 @@ class Classifier:
if texts != '':
# print(self.X_text_data)
# print(self.y_data)
# _____________ entity develop ______________
sql_entity = "SELECT id, name FROM wp_esi_entity"
......@@ -108,7 +116,6 @@ class Classifier:
self.entity_news = list(
(entity_id, news_text) for entity_id, news_text in result_entity_news.fetch_row(maxrows=0))
# print("total # of news linked with entity is", len(self.entity_news))
sql_entity_news = """ SELECT company_id, concat(title, text) FROM wp_esi_news_accept ORDER BY company_id """
# ________Entities -- tags _________________
sql_entity = "SELECT entity_id, tag_id FROM wp_esi_tag_entity"
......@@ -125,7 +132,7 @@ class Classifier:
# exit(0)
self.vectorizer = CountVectorizer(min_df=2, stop_words='english', ngram_range=(1, 1))
self.vectorizer = CountVectorizer(min_df=1, stop_words='english', ngram_range=(1, 1))
self.tfidf = TfidfTransformer()
# k-mean model
# self.classifier = KNeighborsClassifier()
......@@ -162,7 +169,7 @@ class Classifier:
# exit(0)
def tag_accordance(self, persantage=50):
def tag_accordance(self, persantage=20):
Class method for computing the % of tag accordance
:return: tuple of % of tag accordance
......@@ -207,6 +214,7 @@ class Classifier:
if type(text) is bytes:
text.decode('ascii', 'ignore')
matrix_test_data = self.vectorizer.transform([self.text_clear(text_test)])
matrix_test_data = self.tfidf.fit_transform(matrix_test_data)
rez = self.classifier.predict_proba(matrix_test_data)
# exit(0)
self.likelihood_list = rez[0]
