Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
E
exa_news_classificator
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Vasyl Bodnaruk
exa_news_classificator
Commits
5dd6300d
Commit
5dd6300d
authored
Aug 22, 2017
by
Tags
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Method Cross validation
parent
f9f1cbfb
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
42 additions
and
36 deletions
+42
-36
news_classify_tag.py
news_classify_tag.py
+42
-36
No files found.
news_classify_tag.py
View file @
5dd6300d
...
...
@@ -105,9 +105,9 @@ class Classifier:
self
.
entity_tags
[
entity
]
=
""
+
item_tag
else
:
self
.
entity_tags
[
entity
]
+=
" "
+
item_tag
# print(self.entity_tags)
# print(len(self.entity_tags))
# exit(0)
# print(self.entity_tags)
# print(len(self.entity_tags))
# exit(0)
def
text_clear
(
self
,
texts
):
import
re
...
...
@@ -132,7 +132,7 @@ class Classifier:
from
sklearn.naive_bayes
import
MultinomialNB
from
sklearn.neighbors
import
KNeighborsClassifier
self
.
vectorizer
=
CountVectorizer
(
min_df
=
2
,
stop_words
=
'english'
,
ngram_range
=
(
1
,
2
))
self
.
vectorizer
=
CountVectorizer
(
min_df
=
2
,
stop_words
=
'english'
,
ngram_range
=
(
1
,
1
))
self
.
tfidf
=
TfidfTransformer
()
# self.classifier = MultinomialNB()
self
.
classifier
=
KNeighborsClassifier
()
...
...
@@ -310,6 +310,41 @@ def log_data(i=None, text_to_analise=None, tags=None):
out_file
.
close
()
def cross_validation():
    """Score the classifier against news that already carry user-assigned tags.

    Fetches every (news_id, title, description) row joined from
    ``wp_esi_news`` and ``wp_esi_tag_news`` in random order, classifies the
    ASCII-decoded ``title + '\\n' + description`` text with the module-level
    ``tags_classif`` object, and counts a news item as a hit when at least one
    predicted tag id (first element of each ``tags_classif.tag_accordance``
    entry) is also among the tag ids stored for that news item.

    Misses are printed in detail; a summary with the hit ratio is printed at
    the end.  Nothing is returned.
    """
    # Test on news that have already been identified (tagged) by users.
    sql1 = '''SELECT rez.news_id , rez.title, rez.description
FROM(
SELECT wp_esi_tag_news.news_id, wp_esi_news.title, wp_esi_news.description
FROM wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id
ORDER BY wp_esi_tag_news.tag_id ) AS rez
ORDER BY rand()'''
    tags_classif.db.query(sql1)
    news_results = tags_classif.db.store_result().fetch_row(maxrows=0)

    total_score = 0
    for i, (id_news, title, description) in enumerate(news_results):
        # Decode once; non-ASCII bytes are dropped, as in the rest of the script.
        title_text = title.decode('ascii', 'ignore')
        text_for_analis = title_text + '\n' + description.decode('ascii', 'ignore')

        tags_classif.classify(text_for_analis)
        rez_accordance = {item[0] for item in tags_classif.tag_accordance}

        # int() guarantees that only a numeric id is spliced into the SQL text.
        sql = " SELECT tag_id FROM wp_esi_tag_news WHERE news_id =" + str(int(id_news))
        tags_classif.db.query(sql)
        tags_in_article = {
            tag for (tag,) in tags_classif.db.store_result().fetch_row(maxrows=0)
        }

        # NOTE(review): the intersection only finds matches when item[0] and the
        # fetched tag_id values have the same type - confirm against the
        # classifier and the DB driver's conversion settings.
        matched = rez_accordance & tags_in_article
        print(rez_accordance, tags_in_article, len(matched))

        if matched:
            total_score += 1
        else:
            # Report the miss so it can be inspected by hand.
            print("\n#", i)
            print("\nNews title: " + title_text)
            print("Model calculated Accordance :", end=" ")
            print(rez_accordance)
            print("User classified tags for news:", tags_in_article)

    print("\n\nThe Model was tasted on ", len(news_results), " news. ")
    if news_results:
        print(" The total result is:", total_score / len(news_results))
    else:
        # Avoid ZeroDivisionError when the query returns no tagged news.
        print(" The total result is: n/a (no tagged news found)")
# exit(0)
if
__name__
==
"__main__"
:
try
:
# if localhost database is not available then use server
...
...
@@ -324,41 +359,12 @@ if __name__ == "__main__":
tags_classif
.
save
()
# exit(0)
## _______ test from identified news ______________
sql1
=
'''SELECT rez.news_id , rez.title, rez.description
FROM(
SELECT wp_esi_tag_news.news_id, wp_esi_news.title, wp_esi_news.description
FROM wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id
ORDER BY wp_esi_tag_news.tag_id ) AS rez
ORDER BY rand() LIMIT 30'''
tags_classif
.
db
.
query
(
sql1
)
news_results
=
tags_classif
.
db
.
store_result
()
news_results
=
news_results
.
fetch_row
(
maxrows
=
0
)
data
=
list
()
for
i
,
(
id_news
,
title
,
description
)
in
enumerate
(
news_results
):
text_for_analis
=
title
.
decode
(
'ascii'
,
'ignore'
)
+
'
\n
'
+
description
.
decode
(
'ascii'
,
'ignore'
)
# print("\n#", str(i))
print
(
"
\n
News title: "
+
title
.
decode
(
'ascii'
,
'ignore'
))
tags_classif
.
classify
(
text_for_analis
)
print
(
"Model calculated Accordance(#tag,
%-
accordance, tag_description): "
,
end
=
" "
)
print
(
tags_classif
.
tag_accordance
)
sql
=
" SELECT tag_id FROM wp_esi_tag_news WHERE news_id ="
+
str
(
id_news
)
# print(sql)
results
=
tags_classif
.
db
.
query
(
sql
)
tags
=
tags_classif
.
db
.
store_result
()
tags
=
tags
.
fetch_row
(
maxrows
=
0
)
# print(tags)
# print(tags_classif.tags)
print
(
"User classified tags for present news:"
)
for
(
tag
,)
in
tags
:
tag
=
int
(
tag
)
# print(tag)
print
(
str
(
tag
)
+
" "
+
tags_classif
.
tags
[
tag
-
1
][
1
]
.
decode
(
'ascii'
,
'ignore'
))
cross_validation
()
exit
(
0
)
# print("\n\n")
tags_classif
.
db
.
query
(
"SELECT title, description, text, company_id FROM wp_esi_news_accept ORDER BY RAND() LIMIT 50"
)
tags_classif
.
db
.
query
(
"SELECT title, description, text, company_id FROM wp_esi_news_accept ORDER BY RAND() LIMIT 50"
)
result
=
tags_classif
.
db
.
store_result
()
# news analysis with title + description + text + company_id
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment