Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
E
exa_news_classificator
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Vasyl Bodnaruk
exa_news_classificator
Commits
d71c4654
Commit
d71c4654
authored
Nov 04, 2017
by
Andrii Marynets
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'master' of
https://gitlab.com/taraslut78/esi_news_classification
parents
9b6fec7b
f505b55e
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
52 additions
and
142 deletions
+52
-142
news_classify_tag.py
news_classify_tag.py
+52
-142
No files found.
news_classify_tag.py
View file @
d71c4654
...
...
@@ -8,7 +8,7 @@ import _mysql
# noinspection PyUnresolvedReferences
class
Classifier
:
def
__init__
(
self
,
corpus
=
None
,
host
=
'176.58.117.151'
,
user
=
'esi'
,
password
=
'esi12345'
,
db
=
'esi'
,
port
=
3306
):
def
__init__
(
self
,
corpus
=
None
,
host
=
""
,
user
=
""
,
password
=
""
,
db
=
""
,
port
=
0
):
try
:
# import nltk
import
operator
...
...
@@ -16,12 +16,17 @@ class Classifier:
except
ImportError
:
print
(
'You have import flowing packages: sklearn & nltk & re.'
)
stopwords
=
set
(
nltk
.
corpus
.
stopwords
.
words
(
'english'
))
stopwords
.
update
([
'from:'
,
'subject:'
,
'writes:'
,
'writes'
,
'click'
,
'here'
,
'page'
,
'origin'
])
if
corpus
is
not
None
:
self
.
train_data
=
corpus
return
# stopwords for text_clearner Method
self
.
stopwords
=
set
(
nltk
.
corpus
.
stopwords
.
words
(
'english'
))
self
.
stopwords
.
update
([
'from:'
,
'subject:'
,
'writes:'
,
'writes'
,
'click'
,
'here'
,
'page'
,
'origin'
])
# if corpus is not None:
# self.train_data = corpus
# return
# Extract data form DataBase
self
.
db
=
_mysql
.
connect
(
host
=
host
,
port
=
port
,
user
=
user
,
passwd
=
password
,
db
=
db
)
# load external dictionary
try
:
...
...
@@ -36,8 +41,11 @@ class Classifier:
for
word
in
file_dict
:
self
.
correct_words
.
add
(
word
[:
-
1
])
# Extract data form DataBase
self
.
db
=
_mysql
.
connect
(
host
=
host
,
port
=
port
,
user
=
user
,
passwd
=
password
,
db
=
db
)
#
# from stemming.porter2 import stem
from
nltk.stem
import
PorterStemmer
self
.
stem
=
PorterStemmer
()
.
stem
# self.stem.
# getting tags
self
.
db
.
query
(
"SELECT id, name FROM wp_esi_tag"
)
...
...
@@ -46,31 +54,37 @@ class Classifier:
for
id
,
description
in
rez
.
fetch_row
(
maxrows
=
0
):
tags
.
append
((
int
(
id
),
description
))
self
.
tags_dict
=
dict
(
tags
)
self
.
tags
=
tags
# print('tags array = ', len(self.tags), self.tags)
# print(self.tags_dict)
# print(self.tags_dict[355])
# exit(0)
del
tags
# train_data = []
# text_id = []
# sql1 = '''SELECT wp_esi_tag_news.tag_id, wp_esi_news.title, wp_esi_news.description
# FROM wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id
# ORDER BY wp_esi_tag_news.tag_id LIMIT 1000 '''
# self.db.query(sql1)
# result = self.db.store_result()
# data = list()
# for tag_id, title, description in result.fetch_row(maxrows=0):
# data.append((tag_id, title, description))
def
text_clear
(
self
,
texts
):
import
re
from
nltk.tokenize
import
word_tokenize
texts
=
re
.
sub
(
r'\d'
,
""
,
texts
)
texts
=
texts
.
translate
(
str
.
maketrans
(
"?!,.+-:;
\
/"
,
10
*
" "
))
texts
=
word_tokenize
(
texts
)
#texts.split(" ")
texts
=
[
word
.
lower
()
for
word
in
texts
if
word
not
in
self
.
stopwords
]
# Addition incorrect words
# texts = [word for word in texts if word not in stopwords and word in self.correct_words]
try
:
texts
=
[
self
.
stem
(
word
)
for
word
in
texts
if
len
(
word
)
>
0
]
except
:
pass
texts
=
" "
.
join
(
texts
)
return
texts
def
teach_model
(
self
,
data_text
=
''
):
from
sklearn.feature_extraction.text
import
CountVectorizer
,
TfidfVectorizer
,
TfidfTransformer
from
sklearn.pipeline
import
Pipeline
from
sklearn.naive_bayes
import
MultinomialNB
from
sklearn.neighbors
import
KNeighborsClassifier
# from sklearn.neural_network import MLPClassifier
sql1
=
'''SELECT wp_esi_ml_data.tag_id, wp_esi_news_accept.title, wp_esi_news_accept.description, wp_esi_news_accept.text
FROM wp_esi_news_accept, wp_esi_ml_data WHERE wp_esi_ml_data.news_id=wp_esi_news_accept.id
ORDER BY wp_esi_ml_data.tag_id'''
FROM wp_esi_news_accept, wp_esi_ml_data WHERE wp_esi_ml_data.news_id=wp_esi_news_accept.id
ORDER BY wp_esi_ml_data.tag_id'''
self
.
db
.
query
(
sql1
)
result
=
self
.
db
.
store_result
()
self
.
X_text_data
=
list
()
self
.
y_data
=
list
()
for
tag_id
,
title
,
description
,
text
in
result
.
fetch_row
(
maxrows
=
0
):
...
...
@@ -84,6 +98,8 @@ class Classifier:
if
texts
!=
''
:
self
.
X_text_data
.
append
(
self
.
text_clear
(
texts
))
self
.
y_data
.
append
(
int
(
tag_id
))
# print(self.X_text_data)
# print(self.y_data)
# _____________ entity develop ______________
sql_entity
=
"SELECT id, name FROM wp_esi_entity"
...
...
@@ -94,13 +110,12 @@ class Classifier:
del
result_entity
# print("total entitys # is ", len(self.entity))
sql_entity_news
=
"""SELECT wp_esi_news_entity.entity_id, concat(wp_esi_news.title, wp_esi_news.description)
FROM wp_esi_news_entity, wp_esi_news WHERE wp_esi_news.id = wp_esi_news_entity.news_id ORDER BY wp_esi_news_entity.entity_id """
FROM wp_esi_news_entity, wp_esi_news WHERE wp_esi_news.id = wp_esi_news_entity.news_id ORDER BY wp_esi_news_entity.entity_id """
self
.
db
.
query
(
sql_entity_news
)
result_entity_news
=
self
.
db
.
store_result
()
self
.
entity_news
=
list
(
(
entity_id
,
news_text
)
for
entity_id
,
news_text
in
result_entity_news
.
fetch_row
(
maxrows
=
0
))
# print("total # of news linked with entity is", len(self.entity_news))
sql_entity_news
=
""" SELECT company_id, concat(title, text) FROM wp_esi_news_accept ORDER BY company_id """
# ________Entitys -- tags _________________
sql_entity
=
"SELECT entity_id, tag_id FROM wp_esi_tag_entity"
...
...
@@ -116,33 +131,14 @@ class Classifier:
# print(len(self.entity_tags))
# exit(0)
def
text_clear
(
self
,
texts
):
import
re
from
stemming.porter2
import
stem
stopwords
=
set
(
nltk
.
corpus
.
stopwords
.
words
(
'english'
))
stopwords
.
update
([
'from:'
,
'subject:'
,
'writes:'
,
'writes'
,
'click'
,
'here'
,
'page'
,
'origin'
])
texts
=
re
.
sub
(
r'\d'
,
" "
,
texts
)
texts
=
texts
.
split
(
" "
)
texts
=
[
word
.
lower
()
for
word
in
texts
]
texts
=
[
word
for
word
in
texts
if
word
not
in
stopwords
]
# Addition incorrect words
# texts = [word for word in texts if word not in stopwords and word in self.correct_words]
texts
=
[
stem
(
word
)
for
word
in
texts
]
texts
=
" "
.
join
(
texts
)
return
texts
def
teach_model
(
self
,
data_text
=
''
):
from
sklearn.feature_extraction.text
import
CountVectorizer
,
TfidfVectorizer
,
TfidfTransformer
from
sklearn.pipeline
import
Pipeline
from
sklearn.naive_bayes
import
MultinomialNB
from
sklearn.neighbors
import
KNeighborsClassifier
from
sklearn.neural_network
import
MLPClassifier
self
.
vectorizer
=
CountVectorizer
(
min_df
=
2
,
stop_words
=
'english'
,
ngram_range
=
(
1
,
1
))
self
.
vectorizer
=
CountVectorizer
(
min_df
=
1
,
stop_words
=
'english'
,
ngram_range
=
(
1
,
1
))
self
.
tfidf
=
TfidfTransformer
()
self
.
classifier
=
KNeighborsClassifier
()
# k-mean model
# self.classifier = KNeighborsClassifier()
# The Naive Bayes
self
.
classifier
=
MultinomialNB
(
alpha
=
2
)
# Multilyer Perseptron
# self.classifier = MLPClassifier()
# self.classifier = Pipeline([
# ('vect', TfidfVectorizer(stop_words='english')),
...
...
@@ -173,7 +169,7 @@ class Classifier:
# exit(0)
@
property
def
tag_accordance
(
self
,
persantage
=
5
0
):
def
tag_accordance
(
self
,
persantage
=
2
0
):
"""
Class method for computing
%
af tag accordance
:return: tuple of
%
af tag accordance
...
...
@@ -218,6 +214,7 @@ class Classifier:
if
type
(
text
)
is
bytes
:
text
.
decode
(
'ascii'
,
'ignore'
)
matrix_test_data
=
self
.
vectorizer
.
transform
([
self
.
text_clear
(
text_test
)])
matrix_test_data
=
self
.
tfidf
.
fit_transform
(
matrix_test_data
)
rez
=
self
.
classifier
.
predict_proba
(
matrix_test_data
)
# exit(0)
self
.
likelihood_list
=
rez
[
0
]
...
...
@@ -231,7 +228,7 @@ class Classifier:
for
item_num
,
tag_id
,
in
enumerate
(
self
.
links_tags
):
if
tag_id
in
tags_list_from_entity
and
self
.
likelihood_list
[
item_num
]
<
0.1
:
self
.
likelihood_list
[
item_num
]
+=
0.1
# print (self.likelihood_list)
# print (self.likelihood_list)
def
graph_results
(
self
):
# import numpy as np
...
...
@@ -321,90 +318,3 @@ def log_data(i=None, text_to_analise=None, tags=None):
out_file
.
write
(
'
\n
'
)
out_file
.
close
()
def
cross_validation
():
## _______ test from indentifyed news ______________
sql1
=
'''SELECT rez.news_id , rez.title, rez.description
FROM(
SELECT wp_esi_tag_news.news_id, wp_esi_news.title, wp_esi_news.description
FROM wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id
ORDER BY wp_esi_tag_news.tag_id ) AS rez
ORDER BY rand()'''
tags_classif
.
db
.
query
(
sql1
)
news_results
=
tags_classif
.
db
.
store_result
()
.
fetch_row
(
maxrows
=
0
)
total_score
=
0
for
i
,
(
id_news
,
title
,
description
)
in
enumerate
(
news_results
):
text_for_analis
=
title
.
decode
(
'ascii'
,
'ignore'
)
+
'
\n
'
+
description
.
decode
(
'ascii'
,
'ignore'
)
tags_classif
.
classify
(
text_for_analis
)
rez_accordance
=
{
item
[
0
]
for
item
in
tags_classif
.
tag_accordance
}
sql
=
" SELECT tag_id FROM wp_esi_tag_news WHERE news_id ="
+
str
(
id_news
)
results
=
tags_classif
.
db
.
query
(
sql
)
tags_in_article
=
{
int
(
tag
)
for
(
tag
,
)
in
tags_classif
.
db
.
store_result
()
.
fetch_row
(
maxrows
=
0
)}
if
len
(
rez_accordance
&
tags_in_article
)
!=
0
:
total_score
+=
1
else
:
print
(
"
\n\n
#"
,
str
(
i
))
print
(
"News title: "
+
title
.
decode
(
'ascii'
,
'ignore'
))
print
(
"Model calculated Accordance :"
,
end
=
" "
)
for
item_tag
in
tags_classif
.
tags
:
if
int
(
item_tag
[
0
])
in
rez_accordance
:
print
(
item_tag
[
0
],
item_tag
[
1
]
.
decode
(
'ascii'
,
'ignore'
),
end
=
'; '
)
print
(
"
\n
User classified tags for news: "
,
end
=
""
)
# print(set(rez_accordance))
for
item_tag
in
tags_classif
.
tags
:
if
int
(
item_tag
[
0
])
in
tags_in_article
:
print
(
item_tag
[
0
],
item_tag
[
1
]
.
decode
(
'ascii'
,
'ignore'
),
end
=
'; '
)
print
(
"
\n\n
The Model was tasted on "
,
len
(
news_results
),
" news. "
)
print
(
"The total accuracy is:"
,
total_score
/
len
(
news_results
))
# exit(0)
if
__name__
==
"__main__"
:
try
:
# if localhost database is not available then use server
tags_classif
=
Classifier
()
print
(
'
\n
I use SERVER DataBase.
\n
'
)
except
_mysql
.
OperationalError
:
# use server DataBase
print
(
'
\n
I use local DataBase.
\n
'
)
tags_classif
=
Classifier
(
host
=
'localhost'
,
port
=
8080
,
user
=
'root'
,
password
=
'password'
,
db
=
'news'
)
# the method is not implicated
tags_classif
.
teach_model
()
# tags_classif.save()
# exit(0)
cross_validation
()
exit
(
0
)
# print("\n\n")
tags_classif
.
db
.
query
(
"SELECT title, description, text, company_id FROM wp_esi_news_accept ORDER BY RAND() LIMIT 50"
)
result
=
tags_classif
.
db
.
store_result
()
# news analysis witt title + description + text + company_id
for
i
,
(
title
,
description
,
text
,
entity_id
)
in
enumerate
(
result
.
fetch_row
(
maxrows
=
0
)):
text_for_analys
=
''
if
title
is
not
None
:
text_for_analys
+=
title
.
decode
(
"ascii"
,
'ignore'
)
+
'
\n
'
if
description
is
not
None
:
text_for_analys
+=
description
.
decode
(
'ascii'
,
'ignore'
)
+
" "
if
text
is
not
None
:
text_for_analys
+=
text
.
decode
(
"ascii"
,
'ignore'
)
print
(
"
\n
#"
,
str
(
i
))
print
(
'Title: '
,
title
.
decode
(
"ascii"
,
'ignore'
))
print
(
"Descr.:"
,
text
.
decode
(
"ascii"
,
'ignore'
)[:
80
])
print
(
'entity :'
,
entity_id
)
tags_results
=
tags_classif
.
classify
(
text_for_analys
,
entity_id
=
entity_id
)
# accordance %
print
(
"Accordance(#tag,
%-
accordance, tag_description): "
)
print
(
tags_classif
.
tag_accordance
)
# log_data(i=i, text_to_analise=text_for_analis, tags=tags_results)
# if i > 10: break
# Graph presentation results
# tags_classif.graph_results()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment