Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
E
exa_news_classificator
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Vasyl Bodnaruk
exa_news_classificator
Commits
07129d22
Commit
07129d22
authored
Aug 15, 2017
by
Tags
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add External dictionary to analysis model
parent
5a945d1a
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
42 additions
and
8 deletions
+42
-8
stemm_test.py
.idea/stemm_test.py
+16
-0
model_data_analysis.py
model_data_analysis.py
+18
-6
news_classify_tag.py
news_classify_tag.py
+8
-2
No files found.
.idea/stemm_test.py
0 → 100644
View file @
07129d22
def text_clear(word):
    """Return the Porter2 stem of ``word`` after lower-casing it.

    The stemmer is imported inside the function body, so the
    ``stemming`` package is only needed when this function is called.
    """
    from stemming.porter2 import stem

    return stem(word.lower())
# import nltk
from
nltk.stem.lancaster
import
LancasterStemmer
if __name__ == "__main__":
    # Print every sample word next to its text_clear() result and its
    # Lancaster stem so the two outputs can be compared side by side.
    lancaster = LancasterStemmer()
    samples = [
        'little', 'long', 'longer', 'longest', 'family', 'trendline',
        'unable', 'able', 'understable', "TomSoyr", 'largest', 'aumobiles',
    ]
    for sample in samples:
        cleared = text_clear(sample)
        lancaster_stem = lancaster.stem(sample)
        print(sample, ':', cleared, ":", lancaster_stem)
\ No newline at end of file
model_data_analysis.py
View file @
07129d22
...
...
@@ -128,15 +128,16 @@ if __name__ == '__main__':
np_matrix
=
np
.
delete
(
np_matrix
,
tags_empty
,
axis
=
0
)
print
(
'
\n
'
)
print
(
'Now tags are'
,
len
(
tags
))
print
(
tags
)
print
(
np_matrix
.
shape
)
np_transpose
=
np_matrix
.
transpose
()
print
(
np_transpose
.
shape
)
#
print(np_transpose.shape)
# print(np_transpose[2][25])
for
i
in
range
(
np_transpose
.
shape
[
0
]):
for
j
in
range
(
np_transpose
.
shape
[
1
]):
if
np_transpose
[
i
][
j
]
!=
0
:
np_transpose
[
i
][
j
]
=
1
# print(i,j)
# print(i,j)
# words frequency measurement
words_frequency
=
{}
...
...
@@ -144,21 +145,32 @@ if __name__ == '__main__':
# print(row, sum(np_transpose[row]), data_dictionary[row])
words_frequency
[
data_dictionary
[
row
]]
=
(
int
(
sum
(
np_transpose
[
row
])))
# print(words_frequency)
frequency
=
list
(
words_frequency
.
values
())
# !!!!graph results
graph_results
(
frequency
,
"The Words Frequencies"
)
frequency
=
list
(
words_frequency
.
values
())
# graph_results(frequency, "The Words Frequencies")
# there are some words with very high frequency
very_frequent_words
=
dict
([(
key
,
value
)
for
key
,
value
in
words_frequency
.
items
()
if
value
>
20
])
print
(
'There are '
,
len
(
very_frequent_words
),
' very frequent words: '
)
print
(
very_frequent_words
)
for
item
in
sorted
(
very_frequent_words
,
key
=
very_frequent_words
.
get
,
reverse
=
True
):
print
(
item
,
":"
,
very_frequent_words
[
item
],
end
=
", "
)
# compare words set from news to words of DICTIONARY
dict_standart
=
set
([
text_clear
(
word
)
for
word
in
open
(
'large.txt'
,
'r'
)
.
read
()
.
split
(
'
\n
'
)])
dict_standart1
=
set
([
word
for
word
in
open
(
'large.txt'
,
'r'
)
.
read
()
.
split
(
'
\n
'
)])
# print(len(dict_standart), dict_standart)
diff_dict_sets
=
set
(
data_dictionary
)
-
dict_standart
print
(
'The difference of extracted dictionary and common dictionary is'
,
len
(
diff_dict_sets
))
print
(
'
\n
The difference of extracted dictionary and common dictionary is'
,
len
(
diff_dict_sets
))
print
(
diff_dict_sets
)
diff_dict_sets1
=
set
(
data_dictionary
)
-
dict_standart1
print
(
'
\n
2
\n
The difference of extracted dictionary and common dictionary is'
,
len
(
diff_dict_sets1
))
print
(
diff_dict_sets1
)
print
(
"And the diff of sets are: "
,
len
(
diff_dict_sets1
^
diff_dict_sets
))
print
(
diff_dict_sets1
^
diff_dict_sets
)
diff_diff_dictionary
=
set
(
data_dictionary
)
-
dict_standart
-
dict_standart1
print
(
"If double diffs to do we get: "
,
len
(
diff_diff_dictionary
))
print
(
diff_diff_dictionary
)
\ No newline at end of file
news_classify_tag.py
View file @
07129d22
...
...
@@ -26,6 +26,13 @@ class Classifier:
self
.
train_data
=
corpus
return
# load external dictionary
file_name_dictionary
=
'large.txt'
file_dict
=
open
(
file_name_dictionary
,
"r"
)
self
.
correct_words
=
set
()
for
word
in
file_dict
:
self
.
correct_words
.
add
(
word
[:
-
1
])
# Extract data from the database
self
.
db
=
_mysql
.
connect
(
host
=
host
,
port
=
port
,
user
=
user
,
passwd
=
password
,
db
=
db
)
...
...
@@ -75,7 +82,7 @@ class Classifier:
texts
=
re
.
sub
(
r'\d'
,
" "
,
texts
)
texts
=
texts
.
split
(
" "
)
texts
=
[
word
.
lower
()
for
word
in
texts
]
texts
=
[
word
for
word
in
texts
if
len
(
word
)
>
3
and
word
not
in
stopwords
]
texts
=
[
word
for
word
in
texts
if
(
len
(
word
)
>
1
)
and
(
word
not
in
stopwords
)
and
(
word
in
self
.
correct_words
)
]
# texts = [word for word in texts if word not in stopwords]
texts
=
[
stem
(
word
)
for
word
in
texts
]
texts
=
" "
.
join
(
texts
)
...
...
@@ -284,7 +291,6 @@ if __name__ == "__main__":
# tag = int(tag)
# # print(tag)
# print(str(tag) + " " + tags_classif.tags[tag - 1][1].decode('ascii', 'ignore'))
#
# exit(0)
tags_classif
.
db
.
query
(
"SELECT title, description, text FROM wp_esi_news_accept ORDER BY RAND() LIMIT 25"
)
result
=
tags_classif
.
db
.
store_result
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment