Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
E
exa_news_classificator
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Vasyl Bodnaruk
exa_news_classificator
Commits
ca1fa8a0
Commit
ca1fa8a0
authored
Sep 18, 2017
by
Tags
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Fix Init issues clear: DB info.
Clear main, cross-validation Methods.
parent
9e95fb2f
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
18 additions
and
99 deletions
+18
-99
teach_1.py
Spacy/teach_1.py
+8
-0
news_classify_tag.py
news_classify_tag.py
+10
-99
No files found.
Spacy/teach_1.py
0 → 100644
View file @
ca1fa8a0
"""Scratch script: split a sample text into sentences with spaCy and print them."""
import spacy

# Load the medium English model.
nlp = spacy.load('en_core_web_md')

doc = nlp(u'This is the most interesting story about USA and Nokia. All is Ok. Samsung is bad company.')

# `Doc` exposes its sentence iterator as `sents`; there is no `sentence`
# attribute, so the previous spelling raised AttributeError.
for sentence in doc.sents:
    print(sentence)
news_classify_tag.py
View file @
ca1fa8a0
...
...
@@ -8,7 +8,7 @@ import _mysql
# noinspection PyUnresolvedReferences
class
Classifier
:
def
__init__
(
self
,
corpus
=
None
,
host
=
'176.58.117.151'
,
user
=
'esi'
,
password
=
'esi12345'
,
db
=
'esi'
,
port
=
3306
):
def
__init__
(
self
,
corpus
=
None
,
host
=
""
,
user
=
""
,
password
=
""
,
db
=
""
,
port
=
0
):
try
:
# import nltk
import
operator
...
...
@@ -23,6 +23,9 @@ class Classifier:
# self.train_data = corpus
# return
# Extract data from the database
self
.
db
=
_mysql
.
connect
(
host
=
host
,
port
=
port
,
user
=
user
,
passwd
=
password
,
db
=
db
)
# load external dictionary
try
:
directory
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
...
...
@@ -36,9 +39,6 @@ class Classifier:
for
word
in
file_dict
:
self
.
correct_words
.
add
(
word
[:
-
1
])
# Extract data from the database
self
.
db
=
_mysql
.
connect
(
host
=
host
,
port
=
port
,
user
=
user
,
passwd
=
password
,
db
=
db
)
# getting tags
self
.
db
.
query
(
"SELECT id, name FROM wp_esi_tag"
)
rez
=
self
.
db
.
store_result
()
...
...
@@ -46,7 +46,6 @@ class Classifier:
for
id
,
description
in
rez
.
fetch_row
(
maxrows
=
0
):
tags
.
append
((
int
(
id
),
description
))
self
.
tags_dict
=
dict
(
tags
)
self
.
tags
=
tags
del
tags
...
...
@@ -61,12 +60,10 @@ class Classifier:
texts
=
re
.
sub
(
r'\s'
,
" "
,
texts
)
texts
=
texts
.
split
(
" "
)
texts
=
[
word
.
lower
()
for
word
in
texts
if
word
not
in
stopwords
]
# texts = [word for word in texts ]
# Addition incorrect words
# texts = [word for word in texts if word not in stopwords and word in self.correct_words]
texts
=
[
stem
(
word
)
for
word
in
texts
]
texts
=
" "
.
join
(
texts
)
# print(texts)
return
texts
def
teach_model
(
self
,
data_text
=
''
):
...
...
@@ -130,7 +127,11 @@ class Classifier:
self
.
vectorizer
=
CountVectorizer
(
min_df
=
2
,
stop_words
=
'english'
,
ngram_range
=
(
1
,
1
))
self
.
tfidf
=
TfidfTransformer
()
self
.
classifier
=
KNeighborsClassifier
()
# k-nearest neighbours model
# self.classifier = KNeighborsClassifier()
# The Naive Bayes
self
.
classifier
=
MultinomialNB
(
alpha
=
2
)
# Multilayer Perceptron
# self.classifier = MLPClassifier()
# self.classifier = Pipeline([
# ('vect', TfidfVectorizer(stop_words='english')),
...
...
@@ -219,7 +220,7 @@ class Classifier:
for
item_num
,
tag_id
,
in
enumerate
(
self
.
links_tags
):
if
tag_id
in
tags_list_from_entity
and
self
.
likelihood_list
[
item_num
]
<
0.1
:
self
.
likelihood_list
[
item_num
]
+=
0.1
# print (self.likelihood_list)
# print (self.likelihood_list)
def
graph_results
(
self
):
# import numpy as np
...
...
@@ -309,93 +310,3 @@ def log_data(i=None, text_to_analise=None, tags=None):
out_file
.
write
(
'
\n
'
)
out_file
.
close
()
def cross_validation():
    """Check the classifier against a random sample of already-tagged news.

    Selects up to 20 random rows from the join of ``wp_esi_news`` and
    ``wp_esi_tag_news``, classifies title + description of each row with the
    module-level ``tags_classif`` and counts the row as a hit when the first
    elements of ``tags_classif.tag_accordance`` share at least one value with
    the tag ids stored for that news item. Every miss is printed together
    with both tag sets; a summary with the hit ratio is printed at the end.

    The global ``tags_classif`` must already exist (and be trained) when this
    function is called. Returns None.
    """

    def _as_text(raw):
        # NULL columns come back as None; treat them as empty text instead of
        # failing on .decode().
        return '' if raw is None else raw.decode('ascii', 'ignore')

    def _print_tags(wanted_ids):
        # Print "id name; " for every known tag whose id is in wanted_ids.
        for item_tag in tags_classif.tags:
            if int(item_tag[0]) in wanted_ids:
                print(item_tag[0], item_tag[1].decode('ascii', 'ignore'), end='; ')

    ## _______ test from identified news ______________
    sql1 = '''SELECT rez.news_id , rez.title, rez.description
    FROM(
    SELECT wp_esi_tag_news.news_id, wp_esi_news.title, wp_esi_news.description
    FROM wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id
    ORDER BY wp_esi_tag_news.tag_id ) AS rez
    ORDER BY rand() LIMIT 20'''
    tags_classif.db.query(sql1)
    news_results = tags_classif.db.store_result().fetch_row(maxrows=0)

    if not news_results:
        # Nothing to score; avoids dividing by zero in the summary below.
        print("No tagged news found; nothing to validate.")
        return

    total_score = 0
    for i, (id_news, title, description) in enumerate(news_results):
        title_text = _as_text(title)
        text_for_analis = title_text + '\n' + _as_text(description)
        tags_classif.classify(text_for_analis)
        rez_accordance = {item[0] for item in tags_classif.tag_accordance}

        # Coerce the id through int() (as the rest of the file does for ids)
        # so only a plain number is ever spliced into the statement.
        # NOTE(review): presumably ids arrive as bytes from _mysql - confirm.
        sql = " SELECT tag_id FROM wp_esi_tag_news WHERE news_id =" + str(int(id_news))
        tags_classif.db.query(sql)
        tag_rows = tags_classif.db.store_result().fetch_row(maxrows=0)
        tags_in_article = {int(tag) for (tag,) in tag_rows}

        if rez_accordance & tags_in_article:
            total_score += 1
            continue

        # Miss: show what the model produced next to the stored tags.
        print("\n\n#", str(i))
        print("News title: " + title_text)
        print("Model calculated Accordance :", end=" ")
        _print_tags(rez_accordance)
        print("\nUser classified tags for news: ", end="")
        _print_tags(tags_in_article)

    print("\n\nThe Model was tasted on ", len(news_results), " news. ")
    print("The total accuracy is:", total_score / len(news_results))
if __name__ == "__main__":
    # Script entry point: connect, train, run cross-validation, then exit.
    try:
        # First attempt: Classifier with its default connection arguments.
        tags_classif = Classifier()
        print('\nI use SERVER DataBase.\n')
    except _mysql.OperationalError:
        # The default connection failed: fall back to the local database.
        print('\nI use local DataBase.\n')
        tags_classif = Classifier(host='localhost', port=8080, user='root', password='password', db='news')

    # the method is not implicated
    from time import time

    now = time()
    tags_classif.teach_model()
    print("_" * 40, "\nTRAINING TIME IS ", (time() - now), 's\n', "_" * 40)
    # tags_classif.save()
    # exit(0)

    cross_validation()
    # NOTE: everything below is unreachable while this exit(0) is in place.
    exit(0)

    # print("\n\n")
    tags_classif.db.query(
        "SELECT title, description, text, company_id FROM wp_esi_news_accept ORDER BY RAND() LIMIT 20")
    result = tags_classif.db.store_result()

    def _as_text(raw):
        # Columns may be None; decode only real values.
        return '' if raw is None else raw.decode("ascii", 'ignore')

    # news analysis with title + description + text + company_id
    for i, (title, description, text, entity_id) in enumerate(result.fetch_row(maxrows=0)):
        text_for_analys = ''
        if title is not None:
            text_for_analys += _as_text(title) + '\n'
        if description is not None:
            text_for_analys += _as_text(description) + " "
        if text is not None:
            text_for_analys += _as_text(text)

        print("\n#", str(i))
        # Use the None-safe decoder here too: title/text were just treated as
        # possibly None above, so calling .decode() on them directly could fail.
        print('Title: ', _as_text(title))
        print("Descr.:", _as_text(text)[:80])
        print('entity :', entity_id)

        tags_results = tags_classif.classify(text_for_analys, entity_id=entity_id)

        # accordance %
        print("Accordance(#tag, %-accordance, tag_description): ")
        print(tags_classif.tag_accordance)

        # log_data(i=i, text_to_analise=text_for_analis, tags=tags_results)
        # if i > 10: break

    # Graph presentation results
    # tags_classif.graph_results()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment