Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
E
exa_news_classificator
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Vasyl Bodnaruk
exa_news_classificator
Commits
566e05d9
Commit
566e05d9
authored
Sep 19, 2017
by
Tags
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Fix Init issues clear: DB info.
Clear main, cross-validation Methods.
parent
ca1fa8a0
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
24 additions
and
16 deletions
+24
-16
news_classify_tag.py
news_classify_tag.py
+24
-16
No files found.
news_classify_tag.py
View file @
566e05d9
...
...
@@ -16,8 +16,10 @@ class Classifier:
except
ImportError
:
print
(
'You have import flowing packages: sklearn & nltk & re.'
)
stopwords
=
set
(
nltk
.
corpus
.
stopwords
.
words
(
'english'
))
stopwords
.
update
([
'from:'
,
'subject:'
,
'writes:'
,
'writes'
,
'click'
,
'here'
,
'page'
,
'origin'
])
# stopwords for text_clearner Method
self
.
stopwords
=
set
(
nltk
.
corpus
.
stopwords
.
words
(
'english'
))
self
.
stopwords
.
update
([
'from:'
,
'subject:'
,
'writes:'
,
'writes'
,
'click'
,
'here'
,
'page'
,
'origin'
])
# if corpus is not None:
# self.train_data = corpus
...
...
@@ -39,6 +41,12 @@ class Classifier:
for
word
in
file_dict
:
self
.
correct_words
.
add
(
word
[:
-
1
])
#
# from stemming.porter2 import stem
from
nltk.stem
import
PorterStemmer
self
.
stem
=
PorterStemmer
()
.
stem
# self.stem.
# getting tags
self
.
db
.
query
(
"SELECT id, name FROM wp_esi_tag"
)
rez
=
self
.
db
.
store_result
()
...
...
@@ -51,18 +59,17 @@ class Classifier:
def
text_clear
(
self
,
texts
):
import
re
from
stemming.porter2
import
stem
stopwords
=
set
(
nltk
.
corpus
.
stopwords
.
words
(
'english'
))
stopwords
.
update
([
'from:'
,
'subject:'
,
'writes:'
,
'writes'
,
'click'
,
'here'
,
'page'
,
'origin'
])
from
nltk.tokenize
import
word_tokenize
texts
=
re
.
sub
(
r'\d'
,
""
,
texts
)
texts
=
re
.
sub
(
r'\s'
,
" "
,
texts
)
texts
=
texts
.
split
(
" "
)
texts
=
[
word
.
lower
()
for
word
in
texts
if
word
not
in
stopwords
]
texts
=
texts
.
translate
(
str
.
maketrans
(
"?!,.+-:;
\
/"
,
10
*
" "
)
)
texts
=
word_tokenize
(
texts
)
#
texts.split(" ")
texts
=
[
word
.
lower
()
for
word
in
texts
if
word
not
in
s
elf
.
s
topwords
]
# Addition incorrect words
# texts = [word for word in texts if word not in stopwords and word in self.correct_words]
texts
=
[
stem
(
word
)
for
word
in
texts
]
try
:
texts
=
[
self
.
stem
(
word
)
for
word
in
texts
if
len
(
word
)
>
0
]
except
:
pass
texts
=
" "
.
join
(
texts
)
return
texts
...
...
@@ -71,14 +78,13 @@ class Classifier:
from
sklearn.pipeline
import
Pipeline
from
sklearn.naive_bayes
import
MultinomialNB
from
sklearn.neighbors
import
KNeighborsClassifier
from
sklearn.neural_network
import
MLPClassifier
#
from sklearn.neural_network import MLPClassifier
sql1
=
'''SELECT wp_esi_ml_data.tag_id, wp_esi_news_accept.title, wp_esi_news_accept.description, wp_esi_news_accept.text
FROM wp_esi_news_accept, wp_esi_ml_data WHERE wp_esi_ml_data.news_id=wp_esi_news_accept.id
ORDER BY wp_esi_ml_data.tag_id'''
self
.
db
.
query
(
sql1
)
result
=
self
.
db
.
store_result
()
self
.
X_text_data
=
list
()
self
.
y_data
=
list
()
for
tag_id
,
title
,
description
,
text
in
result
.
fetch_row
(
maxrows
=
0
):
...
...
@@ -92,6 +98,8 @@ class Classifier:
if
texts
!=
''
:
self
.
X_text_data
.
append
(
self
.
text_clear
(
texts
))
self
.
y_data
.
append
(
int
(
tag_id
))
# print(self.X_text_data)
# print(self.y_data)
# _____________ entity develop ______________
sql_entity
=
"SELECT id, name FROM wp_esi_entity"
...
...
@@ -108,7 +116,6 @@ class Classifier:
self
.
entity_news
=
list
(
(
entity_id
,
news_text
)
for
entity_id
,
news_text
in
result_entity_news
.
fetch_row
(
maxrows
=
0
))
# print("total # of news linked with entity is", len(self.entity_news))
sql_entity_news
=
""" SELECT company_id, concat(title, text) FROM wp_esi_news_accept ORDER BY company_id """
# ________Entitys -- tags _________________
sql_entity
=
"SELECT entity_id, tag_id FROM wp_esi_tag_entity"
...
...
@@ -125,7 +132,7 @@ class Classifier:
# exit(0)
self
.
vectorizer
=
CountVectorizer
(
min_df
=
2
,
stop_words
=
'english'
,
ngram_range
=
(
1
,
1
))
self
.
vectorizer
=
CountVectorizer
(
min_df
=
1
,
stop_words
=
'english'
,
ngram_range
=
(
1
,
1
))
self
.
tfidf
=
TfidfTransformer
()
# k-mean model
# self.classifier = KNeighborsClassifier()
...
...
@@ -162,7 +169,7 @@ class Classifier:
# exit(0)
@
property
def
tag_accordance
(
self
,
persantage
=
5
0
):
def
tag_accordance
(
self
,
persantage
=
2
0
):
"""
Class method for computing
%
af tag accordance
:return: tuple of
%
af tag accordance
...
...
@@ -207,6 +214,7 @@ class Classifier:
if
type
(
text
)
is
bytes
:
text
.
decode
(
'ascii'
,
'ignore'
)
matrix_test_data
=
self
.
vectorizer
.
transform
([
self
.
text_clear
(
text_test
)])
matrix_test_data
=
self
.
tfidf
.
fit_transform
(
matrix_test_data
)
rez
=
self
.
classifier
.
predict_proba
(
matrix_test_data
)
# exit(0)
self
.
likelihood_list
=
rez
[
0
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment