Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
E
exa_news_classificator
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Vasyl Bodnaruk
exa_news_classificator
Commits
1b3a12f2
Commit
1b3a12f2
authored
Jul 27, 2017
by
Tags
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Make changes according the requirements.
parent
d531aef4
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
77 additions
and
42 deletions
+77
-42
news_classify_tag.py
news_classify_tag.py
+77
-42
No files found.
news_classify_tag.py
View file @
1b3a12f2
import
sys
from
_mysql
import
OperationalError
from
pprint
import
pprint
#
import sys
#
from _mysql import OperationalError
#
from pprint import pprint
import
numpy
as
np
import
_mysql
# noinspection PyUnresolvedReferences
...
...
@@ -11,7 +12,6 @@ class Classifier:
try
:
import
nltk
import
operator
from
MySQLdb
import
connect
from
stemming.porter2
import
stem
except
ImportError
:
print
(
'You have import flowing packages: sklearn & nltk & re.'
)
...
...
@@ -24,27 +24,29 @@ class Classifier:
return
# Extract data form DataBase
conn
=
connect
(
host
=
host
,
port
=
port
,
user
=
user
,
password
=
password
,
db
=
db
)
self
.
cursor
=
conn
.
cursor
()
self
.
db
=
_mysql
.
connect
(
host
=
host
,
port
=
port
,
user
=
user
,
passwd
=
password
,
db
=
db
)
#geting tags
result
=
self
.
cursor
.
execute
(
"select id, name from wp_esi_tag"
)
# geting tags
self
.
db
.
query
(
"SELECT id, name FROM wp_esi_tag"
)
rez
=
self
.
db
.
store_result
()
# result =
tags
=
list
()
for
id
,
description
in
self
.
cursor
.
fetchall
(
):
tags
.
append
((
id
,
description
))
for
id
,
description
in
rez
.
fetch_row
(
maxrows
=
0
):
tags
.
append
((
id
,
description
))
self
.
tags
=
tags
# print (tags)
# print (
len(tags),
tags)
del
tags
train_data
=
[]
# text_id = []
sql1
=
'''select wp_esi_tag_news.tag_id, wp_esi_news.title, wp_esi_news.description
from wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id
order by wp_esi_tag_news.tag_id LIMIT 1000 '''
result
=
self
.
cursor
.
execute
(
sql1
)
data
=
list
()
for
tag_id
,
title
,
description
in
self
.
cursor
.
fetchall
():
data
.
append
((
tag_id
,
title
,
description
))
sql1
=
'''SELECT wp_esi_tag_news.tag_id, wp_esi_news.title, wp_esi_news.description
FROM wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id
ORDER BY wp_esi_tag_news.tag_id LIMIT 1000 '''
self
.
db
.
query
(
sql1
)
result
=
self
.
db
.
store_result
()
data
=
list
()
for
tag_id
,
title
,
description
in
result
.
fetch_row
(
maxrows
=
0
):
data
.
append
((
tag_id
,
title
,
description
))
for
tag_id
,
_
in
self
.
tags
:
texts
=
''
...
...
@@ -95,13 +97,13 @@ class Classifier:
# normalise rowsfrequency matrix
for
j
in
range
(
columns
):
suma
=
0
suma
=
0
suma
=
frequency_matrix
[
j
]
.
sum
()
if
suma
==
0
:
continue
for
i
in
range
(
rows
):
frequency_matrix
[
j
,
i
]
/=
suma
self
.
tag_frequency_matrix
=
frequency_matrix
self
.
tag_frequency_matrix
=
frequency_matrix
return
frequency_matrix
def
teach_model
(
self
,
data_text
=
''
):
...
...
@@ -122,12 +124,12 @@ class Classifier:
:return: tuple of
%
af tag accordance
"""
vector_accordance
=
[]
for
row
in
range
(
self
.
tag_frequency_matrix
.
shape
[
0
]):
for
row
in
range
(
self
.
tag_frequency_matrix
.
shape
[
0
]):
temp_matrix
=
self
.
tag_frequency_matrix
[
row
]
rez_summ
=
temp_matrix
[
self
.
matrix_test_data
.
toarray
()[
0
]
>
0
]
.
sum
()
rez_summ
=
temp_matrix
[
self
.
matrix_test_data
.
toarray
()[
0
]
>
0
]
.
sum
()
if
rez_summ
>
(
persantage
/
100
):
vector_accordance
.
append
((
row
,
int
(
rez_summ
*
1000
)
/
10
,
self
.
tags
[
row
][
1
]))
vector_accordance
.
sort
(
key
=
lambda
tup
:
tup
[
1
],
reverse
=
True
)
vector_accordance
.
append
((
row
,
int
(
rez_summ
*
1000
)
/
10
,
self
.
tags
[
row
][
1
]))
vector_accordance
.
sort
(
key
=
lambda
tup
:
tup
[
1
],
reverse
=
True
)
return
vector_accordance
...
...
@@ -168,15 +170,16 @@ class Classifier:
def
save
(
self
):
import
os
file_tag_frequency
=
'data_tag_frequency.csv'
if
os
.
path
.
isfile
(
file_tag_frequency
):
os
.
remove
(
file_tag_frequency
)
file_data
=
open
(
file_tag_frequency
,
"w"
)
file_tag_frequency
=
'data_tag_frequency.csv'
if
os
.
path
.
isfile
(
file_tag_frequency
):
os
.
remove
(
file_tag_frequency
)
file_data
=
open
(
file_tag_frequency
,
"w"
)
if
file_data
==
None
:
print
(
"Can't create data storage file"
)
print
(
"Can't create data storage file"
)
return
False
np
.
savetxt
(
file_tag_frequency
,
self
.
tag_frequency_matrix
,
delimiter
=
';'
,
fmt
=
'
%1.4
f'
)
print
(
"Data is saved into file: "
+
file_tag_frequency
+
" "
+
str
(
int
(
os
.
stat
(
file_tag_frequency
)
.
st_size
/
1024
))
+
'kB'
)
np
.
savetxt
(
file_tag_frequency
,
self
.
tag_frequency_matrix
,
delimiter
=
';'
,
fmt
=
'
%1.4
f'
)
print
(
"Data is saved into file: "
+
file_tag_frequency
+
" "
+
str
(
int
(
os
.
stat
(
file_tag_frequency
)
.
st_size
/
1024
))
+
'kB'
)
return
True
...
...
@@ -206,28 +209,60 @@ if __name__ == "__main__":
try
:
# if localhost database is not available then use server
tags_classif
=
Classifier
()
print
(
'
\n
I use SERVER DataBase.
\n
'
)
except
OperationalError
:
print
(
'
\n
I use SERVER DataBase.
\n
'
)
except
_mysql
.
OperationalError
:
# use server DataBase
print
(
'
\n
I use local DataBase.
\n
'
)
print
(
'
\n
I use local DataBase.
\n
'
)
tags_classif
=
Classifier
(
host
=
'localhost'
,
port
=
8080
,
user
=
'root'
,
password
=
'password'
,
db
=
'news'
)
# the method is not implicated
tags_classif
.
teach_model
()
tags_classif
.
save
()
#
tags_classif.save ()
# exit(0)
result
=
tags_classif
.
cursor
.
execute
(
"select title, description from wp_esi_news ORDER BY RAND() limit 25"
)
for
i
,
(
title
,
description
)
in
enumerate
(
tags_classif
.
cursor
.
fetchall
()):
sql1
=
'''SELECT rez.news_id , rez.title, rez.description
FROM (
SELECT wp_esi_tag_news.news_id, wp_esi_news.title, wp_esi_news.description
FROM wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id
ORDER BY wp_esi_tag_news.tag_id ) as rez
ORDER BY rand() LIMIT 15'''
tags_classif
.
db
.
query
(
sql1
)
news_results
=
tags_classif
.
db
.
store_result
()
news_results
=
news_results
.
fetch_row
(
maxrows
=
0
)
data
=
list
()
for
i
,
(
id_news
,
title
,
description
)
in
enumerate
(
news_results
):
text_for_analis
=
title
.
decode
(
'ascii'
,
'ignore'
)
+
'
\n
'
+
description
.
decode
(
'ascii'
,
'ignore'
)
# print ("\n#", str (i))
print
(
"
\n
News title: "
+
title
.
decode
(
'ascii'
,
'ignore'
))
tags_classif
.
classify
(
text_for_analis
)
print
(
"Model calculated Accordance (#tag,
%-
accordance, tag_description): "
,
end
=
" "
)
print
(
tags_classif
.
teg_accordance
)
sql
=
" select tag_id from wp_esi_tag_news where news_id ="
+
str
(
id_news
)
# print(sql)
results
=
tags_classif
.
db
.
query
(
sql
)
tags
=
tags_classif
.
db
.
store_result
()
tags
=
tags
.
fetch_row
(
maxrows
=
0
)
# print(tags)
# print(tags_classif.tags)
print
(
"User classified tags for present news:"
)
for
(
tag
,
)
in
tags
:
tag
=
int
(
tag
)
# print(tag)
print
(
str
(
tag
-
1
)
+
" "
+
tags_classif
.
tags
[
tag
-
1
][
1
]
.
decode
(
'ascii'
,
'ignore'
))
exit
(
0
)
tags_classif
.
db
.
query
(
"SELECT title, description FROM wp_esi_news ORDER BY RAND() LIMIT 25"
)
result
=
tags_classif
.
db
.
store_result
()
for
i
,
(
title
,
description
)
in
enumerate
(
result
.
fetch_row
(
maxrows
=
0
)):
text_for_analis
=
title
+
'
\n
'
+
description
print
(
"
\n
\n
#"
,
str
(
i
))
p
p
rint
(
text_for_analis
)
print
(
"
\n
#"
,
str
(
i
))
print
(
text_for_analis
)
tags_results
=
tags_classif
.
classify
(
text_for_analis
)
# pprint (tags_results[:5])
# accordance %
print
(
"Accordance (#tag,
%-
accordance, tag_description): "
)
print
(
tags_classif
.
teg_accordance
)
print
(
"Accordance (#tag,
%-
accordance, tag_description): "
)
print
(
tags_classif
.
teg_accordance
)
# log_data (i=i, text_to_analise=text_for_analis, tags=tags_results)
# if i > 10: break
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment