Vasyl Bodnaruk / exa_news_classificator

Commit 6c4536c4, authored Aug 23, 2017

Method Cross validation

parent 7a5d8e24

Showing 2 changed files with 14 additions and 10 deletions (+14 / -10):

model_data_analysis.py  (+7 / -2)
news_classify_tag.py    (+7 / -8)

model_data_analysis.py
@@ -92,7 +92,7 @@ def save():
 def graph_results(in_data, title=''):
     # import numpy as np
-    from pylab import figure, hist, savefig
+    from pylab import figure, hist, savefig, show
     data = np.asarray(in_data)
     figure()
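For context, a minimal sketch of how a plotting helper along these lines could be completed once show is imported; the bin count, output file name and any calls beyond those shown in the hunk are assumptions, not part of the commit:

import numpy as np
from pylab import figure, hist, savefig, show, title as set_title

def graph_results_sketch(in_data, plot_title=''):
    # Turn the raw frequency list into an array and draw a histogram of it.
    data = np.asarray(in_data)
    figure()
    hist(data, bins=50)              # bin count chosen arbitrarily for the sketch
    set_title(plot_title)
    savefig('word_frequencies.png')  # hypothetical output file name
    show()                           # possible thanks to the newly imported `show`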
@@ -152,11 +152,16 @@ if __name__ == '__main__':
     # the are some words with great frequency
-    very_frequent_words = dict([(key, value) for key, value in words_frequency.items() if value > 20])
+    very_frequent_words = dict([(key, value) for key, value in words_frequency.items() if value > 100])
     print('There are ', len(very_frequent_words), ' very frequent words: ')
     for item in sorted(very_frequent_words, key=very_frequent_words.get, reverse=True):
         print(item, ":", very_frequent_words[item], end=", ")

     un_frequent_words = dict([(key, value) for key, value in words_frequency.items() if value < 10])
     print('\nThere are ', len(un_frequent_words), ' very seldom words: ')
     for item in sorted(un_frequent_words, key=un_frequent_words.get, reverse=True):
         print(item, ":", un_frequent_words[item], end=", ")

     # compare words set from news to words of DICTIONARY
     dict_standart = set([text_clear(word) for word in open('large.txt', 'r').read().split('\n')])
     dict_standart1 = set([word for word in open('large.txt', 'r').read().split('\n')])
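words_frequency itself is built elsewhere in this script; a small sketch of how such a dictionary could be produced and filtered with collections.Counter, reusing the thresholds from the hunk above (the toy token list is invented for illustration):

from collections import Counter

# Hypothetical tokenised corpus; the real script fills words_frequency from the news texts.
tokens = "market market market growth growth startup ai ai ai ai".split()
words_frequency = Counter(tokens)

# Same filtering idea as in the diff: very common vs. very rare words.
very_frequent_words = {k: v for k, v in words_frequency.items() if v > 100}
un_frequent_words = {k: v for k, v in words_frequency.items() if v < 10}
print(len(very_frequent_words), 'very frequent words,', len(un_frequent_words), 'seldom words')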
news_classify_tag.py
@@ -6,8 +6,6 @@ import numpy as np
 import _mysql
 # test submodule commit
 # noinspection PyUnresolvedReferences
 class Classifier:
     def __init__(self, corpus=None, host='176.58.117.151', user='esi', password='esi12345', db='esi', port=3306):
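A hedged usage sketch of how this constructor is presumably called; the connection values below are placeholders rather than the project's real settings:

# Hypothetical call site; the defaults shown in the signature above are used when omitted.
classifier = Classifier(corpus=None,
                        host='127.0.0.1',
                        user='example_user',
                        password='example_password',
                        db='esi',
                        port=3306)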
@@ -120,7 +118,7 @@ class Classifier:
         texts = [word.lower() for word in texts]
         texts = [word for word in texts if word not in stopwords]
         # Addition incorrect words
         # and word in self.correct_words]
         # texts = [word for word in texts if word not in stopwords
         #          and word in self.correct_words]
         texts = [stem(word) for word in texts]
         texts = " ".join(texts)
         return texts
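The same preprocessing idea as a self-contained sketch, with NLTK's stopword list and Porter stemmer standing in for whatever stopwords and stem refer to in this file (that substitution is an assumption, and the NLTK stopword corpus has to be downloaded beforehand):

from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import PorterStemmer

def preprocess(tokens):
    stop = set(nltk_stopwords.words('english'))
    stemmer = PorterStemmer()
    tokens = [word.lower() for word in tokens]              # normalise case
    tokens = [word for word in tokens if word not in stop]  # drop stopwords
    tokens = [stemmer.stem(word) for word in tokens]        # reduce words to their stems
    return " ".join(tokens)                                 # back to a single string

print(preprocess(["The", "markets", "were", "rising", "today"]))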
@@ -130,11 +128,12 @@ class Classifier:
         from sklearn.pipeline import Pipeline
         from sklearn.naive_bayes import MultinomialNB
         from sklearn.neighbors import KNeighborsClassifier
         from sklearn.neural_network import MLPClassifier
         self.vectorizer = CountVectorizer(min_df=2, stop_words='english', ngram_range=(1, 1))
         self.tfidf = TfidfTransformer()
         # self.classifier = MultinomialNB()
-        self.classifier = KNeighborsClassifier()
+        # self.classifier = KNeighborsClassifier()
+        self.classifier = MLPClassifier()
         # self.classifier = Pipeline([
         #     ('vect', TfidfVectorizer(stop_words='english')),
         #     ('clf', MultinomialNB()),
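A minimal end-to-end sketch of how the vectoriser, TF-IDF transform and the MLP classifier configured above are presumably chained during training; the toy headlines and tags are invented, and min_df is lowered to 1 only so this tiny example has a vocabulary:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neural_network import MLPClassifier

texts = ["stocks rise on strong earnings", "team wins championship game",
         "market falls after weak report", "player scores winning goal"]
labels = ["business", "sport", "business", "sport"]       # hypothetical tags

vectorizer = CountVectorizer(min_df=1, stop_words='english', ngram_range=(1, 1))
tfidf = TfidfTransformer()
classifier = MLPClassifier(max_iter=500)

counts = vectorizer.fit_transform(texts)                  # raw term counts
features = tfidf.fit_transform(counts)                    # re-weighted by TF-IDF
classifier.fit(features, labels)

new = tfidf.transform(vectorizer.transform(["quarterly earnings beat forecasts"]))
print(classifier.predict(new))                            # predicted tag for the new headline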
@@ -332,8 +331,8 @@ def cross_validation():
             if len(rez_accordance & tags_in_article) != 0:
                 total_score += 1
             else:
-                print("\n#", str(i))
-                print("\nNews title: " + title.decode('ascii', 'ignore'))
+                print("\n\n#", str(i))
+                print("News title: " + title.decode('ascii', 'ignore'))
                 print("Model calculated Accordance :", end=" ")
                 for item_tag in tags_classif.tags:
                     if int(item_tag[0]) in rez_accordance:
@@ -345,7 +344,7 @@ def cross_validation():
                         print(item_tag[0], item_tag[1].decode('ascii', 'ignore'), end='; ')
     print("\n\nThe Model was tasted on ", len(news_results), " news. ")
-    print("The total result is:", total_score / len(news_results))
+    print("The total accuracy is:", total_score / len(news_results))
     # exit(0)
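The hunks above only adjust the reporting inside cross_validation(); the scoring idea itself, where a prediction counts as a hit whenever the predicted tag ids intersect the article's true tags, can be sketched roughly as follows (helper names and sample data are hypothetical):

def holdout_accuracy(news_results, predict_tags):
    # news_results: iterable of (title, set of true tag ids);
    # predict_tags: callable returning a set of predicted tag ids for a title.
    total_score = 0
    for i, (title, tags_in_article) in enumerate(news_results):
        rez_accordance = predict_tags(title)              # predicted tag ids
        if len(rez_accordance & set(tags_in_article)) != 0:
            total_score += 1                              # at least one tag matched
        else:
            print("\n\n#", i)
            print("News title: " + title)
    return total_score / len(news_results)                # share of matched articles

acc = holdout_accuracy([("Team wins the final", {3}), ("Markets rally on earnings", {7})],
                       lambda title: {3} if "wins" in title else {1})
print("The total accuracy is:", acc)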