Commit a459780e authored by Tags's avatar Tags

Resolve issues

parent a0879988
# import sys
# from _mysql import OperationalError
# from pprint import pprint
import nltk
import numpy as np
import _mysql
......@@ -12,7 +13,7 @@ class Classifier:
def __init__(self, corpus=None, host='176.58.117.151', \
user='esi', password='esi12345', db='esi', port=3306):
try:
import nltk
# import nltk
import operator
from stemming.porter2 import stem
except ImportError:
......@@ -65,7 +66,7 @@ class Classifier:
def text_clear(self, texts):
import re
import nltk
# import nltk
from stemming.porter2 import stem
stopwords = set (nltk.corpus.stopwords.words ('english'))
......@@ -156,7 +157,7 @@ class Classifier:
self.matrix_test_data = matrix_test_data
def graph_results(self):
import numpy as np
# import numpy as np
from pylab import figure, show, hist
print (self.total_results[3])
data = []
......
DateTime==4.2
mysqlclient==1.3.10
nltk==3.2.4
numpy==1.13.1
pkg-resources==0.0.0
pytz==2017.2
scikit-learn==0.18.2
scipy==0.19.1
six==1.10.0
sklearn==0.0
stemming==1.0.1
zope.interface==4.4.2
import sys
import nltk
from stemming.porter2 import stem
# Words dropped by Classifier.text_clear: NLTK's English stop-word list
# extended with a few extra tokens listed explicitly below.
stopwords = set(nltk.corpus.stopwords.words('english')) | {
    'from:', 'subject:', 'writes:', 'writes', 'click', 'here', 'page', 'origin',
}
class Classifier():
    """Bag-of-words tag classifier built from news texts.

    Training documents come either from a caller-supplied corpus or from the
    ``wp_esi_tag`` / ``wp_esi_news`` / ``wp_esi_tag_news`` MySQL tables.  When
    loaded from the database, one training document is built per tag, so
    entry ``i`` of ``train_data_text`` (and row ``i`` of the fitted matrix)
    belongs to ``tags[i]``.

    Attributes set by ``__init__``:
        train_data_text: list of training documents passed to the vectorizer.
        train_data: the raw corpus (only when ``corpus`` was supplied).
        tags: tuple of ``(id, name)`` pairs; empty when ``corpus`` was supplied.
        cursor: open database cursor, or ``None`` when ``corpus`` was supplied.
    """

    def __init__(self, corpus=None, host='localhost', port=8000, user='root', password='password', db='news'):
        """Load the training documents.

        Args:
            corpus: optional iterable of ready-to-use training documents.
                When given, no database connection is made.
            host, port, user, password, db: MySQL connection parameters,
                used only when ``corpus`` is ``None``.

        Raises:
            ImportError: if ``corpus`` is ``None`` and MySQLdb is missing.
        """
        self.cursor = None
        self.tags = ()

        if corpus is not None:
            self.train_data = corpus
            # clasif_fit() reads train_data_text, so expose the corpus there too.
            self.train_data_text = corpus
            return

        try:
            from MySQLdb import connect
        except ImportError as err:
            raise ImportError(
                'MySQLdb (mysqlclient) is required to load training data '
                'from the database.') from err

        # Extract data from the database.
        conn = connect(host=host, port=port, user=user, password=password, db=db)
        # Keep the connection referenced for as long as self.cursor is in use.
        self._connection = conn
        cur = conn.cursor()
        self.cursor = cur

        cur.execute("select id, name from wp_esi_tag")
        self.tags = tuple(
            (row[0], row[1]) for row in cur.fetchall() if row is not None)

        news_sql = (
            "select wp_esi_news.title, wp_esi_news.description "
            "from wp_esi_news, wp_esi_tag_news "
            "WHERE wp_esi_tag_news.news_id=wp_esi_news.id "
            "AND wp_esi_tag_news.tag_id=%s")

        train_data = []
        for tag_id, _tag_name in self.tags:
            # Let the driver bind the id instead of concatenating it into SQL.
            cur.execute(news_sql, (tag_id,))
            cleaned = [
                self.text_clear(str(title) + " " + str(description))
                for title, description in cur.fetchall()
            ]
            # One document per tag keeps training rows aligned with self.tags.
            train_data.append(" ".join(cleaned))
        self.train_data_text = train_data

    @staticmethod
    def text_clear(texts):
        """Normalise raw text into a space-separated string of stemmed words.

        Digits are replaced by spaces, words are lower-cased, words of three
        characters or fewer and words in the module-level ``stopwords`` set
        are dropped, and the remaining words are passed through ``stem``.
        """
        import re
        # split() without arguments also breaks on tabs and newlines.
        words = re.sub(r'\d', " ", texts).split()
        lowered = (word.lower() for word in words)
        kept = (word for word in lowered if len(word) > 3 and word not in stopwords)
        return " ".join(stem(word) for word in kept)

    def __repr__(self):
        """Return a short summary; ``repr()`` requires a string result."""
        documents = getattr(self, 'train_data_text', None)
        # A corpus may be an iterable without a length (e.g. a generator).
        n_docs = len(documents) if hasattr(documents, '__len__') else '?'
        return '{}(tags={}, documents={})'.format(
            type(self).__name__, len(getattr(self, 'tags', ())), n_docs)

    def clasif_fit(self, data_text=''):
        """Fit a ``CountVectorizer`` and return the document-term matrix.

        Args:
            data_text: iterable of documents to fit on.  The default ``''``
                (or ``None``) means "use ``self.train_data_text``".

        Returns:
            The matrix returned by ``CountVectorizer.fit_transform``; it is
            also stored as ``self.matrix_fited``.
        """
        from sklearn.feature_extraction.text import CountVectorizer
        if data_text is None or (isinstance(data_text, str) and data_text == ''):
            data_text = self.train_data_text
        self.vectorizer = CountVectorizer(min_df=1)
        matrix_train_data = self.vectorizer.fit_transform(data_text)
        self.matrix_fited = matrix_train_data
        # Prefer the newer accessor when the installed scikit-learn has it.
        get_names = (getattr(self.vectorizer, 'get_feature_names_out', None)
                     or self.vectorizer.get_feature_names)
        feature_names = list(get_names())
        print("number of feaches =", len(feature_names), feature_names)
        print("Quontety of tags = ", len(self.tags), self.tags)
        return matrix_train_data

    def test_cllasifyer(self, test_corpus=None):
        """Vectorize one text with the vectorizer fitted by ``clasif_fit``.

        Args:
            test_corpus: text to vectorize.  When ``None``, the first row of
                ``wp_esi_news_accept`` is fetched through ``self.cursor``.

        Returns:
            The one-row matrix from ``self.vectorizer.transform``; it is also
            stored as ``self.matrix_test``.

        Raises:
            RuntimeError: if ``test_corpus`` is ``None`` and there is no cursor.
            ValueError: if ``wp_esi_news_accept`` returns no row.
        """
        if test_corpus is None:
            cur = self.cursor
            if cur is None:
                raise RuntimeError(
                    'No database cursor available; pass test_corpus explicitly.')
            cur.execute("select title, description from wp_esi_news_accept")
            row = cur.fetchone()
            if row is None:
                raise ValueError('wp_esi_news_accept returned no rows.')
            title, description = row
            text_test = title + " " + description
        else:
            text_test = test_corpus
            nltk.pprint("Test text: " + test_corpus)
        text_test = self.text_clear(text_test)
        matrix_test_data = self.vectorizer.transform([text_test])
        self.matrix_test = matrix_test_data
        return matrix_test_data
def dist(v1, v2):
    """Return the Euclidean distance between the normalised forms of v1 and v2.

    Both arguments must provide ``toarray()`` (e.g. SciPy sparse matrices)
    and have the same shape.  Each is divided by its own norm before the
    difference is measured; an all-zero vector is left as it is, so no
    division by zero takes place.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.linalg`` is loaded.
    import scipy.linalg

    a = v1.toarray()
    b = v2.toarray()
    a_norm = scipy.linalg.norm(a)
    b_norm = scipy.linalg.norm(b)
    if a_norm:
        a = a / a_norm
    if b_norm:
        b = b / b_norm
    return scipy.linalg.norm(a - b)
if __name__ == "__main__":
    # Demo script: fit the classifier from the database, then for the first
    # few rows of wp_esi_news_accept print the training rows closest to each.
    import operator

    # NOTE(review): connection credentials are hard-coded here, and the
    # password literal ends with ')' -- confirm that this is intended.
    text = Classifier(host='176.58.117.151', user='esi', password='esi12345)',
                      db='esi', port=3306)
    trained_model = text.clasif_fit()
    num_samples, num_features = trained_model.shape

    print()
    cur = text.cursor
    cur.execute("select title, description from wp_esi_news_accept")
    numb_of_news_to_show = 10
    numb_of_rows_to_show = 10

    # Slicing caps the output at exactly numb_of_news_to_show items.
    news_rows = cur.fetchall()[:numb_of_news_to_show]
    for news_item, (title, description) in enumerate(news_rows, start=1):
        print("\n #" + str(news_item))
        text_test = title + " " + description
        tested_model = text.test_cllasifyer(test_corpus=text_test)

        # Distance from every training row to the tested text.
        fit_array = [
            (i, dist(trained_model.getrow(i), tested_model))
            for i in range(num_samples)
        ]

        # First row with the strictly smallest distance; stays None when no
        # distance compares below infinity (e.g. all are NaN).
        best_i = None
        best_dist = float('inf')
        for i, d in fit_array:
            if d < best_dist:
                best_dist = d
                best_i = i

        print("=====================================")
        print("Best fit for this news is ", best_i, ' - tag')

        sorted_fit_array = sorted(fit_array, key=operator.itemgetter(1))
        print("=====================================")
        print("#-tag |\tdistance \t | tag label")
        print("=====================================")
        for numb, val in sorted_fit_array[:numb_of_rows_to_show]:
            # Guard against training rows that have no matching tag entry.
            label = text.tags[numb][1] if numb < len(text.tags) else '?'
            print(numb, " | ", val, " | ", label)
        print("=====================================")
    sys.exit(0)
# all_data = sklearn.datasets.fetch_20newsgroups (subset="all")
# # Number of total posts: 18846
#
# groups = [
# 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
# 'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
# train_data = sklearn.datasets.fetch_20newsgroups (subset="train", categories=groups)
#
# train_dat_my = sklearn.feature_extraction
# num_clusters = 50 # sp.unique(labels).shape[0]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment