Commit 07129d22 authored by Tags's avatar Tags

Add External dictionary to analysis model

parent 5a945d1a
def text_clear(word):
    """Normalize a single word: lowercase it, then reduce it to its Porter2 stem.

    Args:
        word: the raw token to normalize.

    Returns:
        The Porter2 stem of the lowercased word.
    """
    # Import kept local so the module stays importable without `stemming` installed.
    from stemming.porter2 import stem
    lowered = word.lower()
    return stem(lowered)
# import nltk
from nltk.stem.lancaster import LancasterStemmer
if __name__ == "__main__":
    # Side-by-side comparison of two stemmers on sample words:
    # Porter2 (via text_clear) versus NLTK's Lancaster stemmer.
    lancaster = LancasterStemmer()
    sample_words = [
        'little', 'long', 'longer', 'longest', 'family', 'trendline',
        'unable', 'able', 'understable', "TomSoyr", 'largest', 'aumobiles',
    ]
    for word in sample_words:
        print(word, ':', text_clear(word), ":", lancaster.stem(word))
\ No newline at end of file
...@@ -128,9 +128,10 @@ if __name__ == '__main__': ...@@ -128,9 +128,10 @@ if __name__ == '__main__':
np_matrix = np.delete(np_matrix, tags_empty, axis=0) np_matrix = np.delete(np_matrix, tags_empty, axis=0)
print('\n') print('\n')
print('Now tags are', len(tags)) print('Now tags are', len(tags))
print(tags)
print(np_matrix.shape) print(np_matrix.shape)
np_transpose = np_matrix.transpose() np_transpose = np_matrix.transpose()
print(np_transpose.shape) # print(np_transpose.shape)
# print(np_transpose[2][25]) # print(np_transpose[2][25])
for i in range(np_transpose.shape[0]): for i in range(np_transpose.shape[0]):
for j in range(np_transpose.shape[1]): for j in range(np_transpose.shape[1]):
...@@ -144,21 +145,32 @@ if __name__ == '__main__': ...@@ -144,21 +145,32 @@ if __name__ == '__main__':
# print(row, sum(np_transpose[row]), data_dictionary[row]) # print(row, sum(np_transpose[row]), data_dictionary[row])
words_frequency[data_dictionary[row]]= (int(sum(np_transpose[row]))) words_frequency[data_dictionary[row]]= (int(sum(np_transpose[row])))
# print(words_frequency) # print(words_frequency)
frequency = list(words_frequency.values())
# !!!!graph results # !!!!graph results
graph_results(frequency, "The Words Frequencies") frequency = list(words_frequency.values())
# graph_results(frequency, "The Words Frequencies")
# the are some words with great frequency # the are some words with great frequency
very_frequent_words = dict([(key, value) for key, value in words_frequency.items() if value > 20]) very_frequent_words = dict([(key, value) for key, value in words_frequency.items() if value > 20])
print('There are ', len(very_frequent_words), ' very frequent words: ') print('There are ', len(very_frequent_words), ' very frequent words: ')
print(very_frequent_words) for item in sorted(very_frequent_words, key=very_frequent_words.get, reverse=True):
print(item, ":", very_frequent_words[item], end=", ")
# compare words set from news to words of DICTIONARY # compare words set from news to words of DICTIONARY
dict_standart = set([text_clear(word) for word in open('large.txt', 'r').read().split('\n')]) dict_standart = set([text_clear(word) for word in open('large.txt', 'r').read().split('\n')])
dict_standart1 = set([word for word in open('large.txt', 'r').read().split('\n')])
# print(len(dict_standart), dict_standart) # print(len(dict_standart), dict_standart)
diff_dict_sets = set(data_dictionary) - dict_standart diff_dict_sets = set(data_dictionary) - dict_standart
print('The difference of extracted dictionary and common dictionary is', len(diff_dict_sets)) print('\nThe difference of extracted dictionary and common dictionary is', len(diff_dict_sets))
print(diff_dict_sets) print(diff_dict_sets)
diff_dict_sets1 = set(data_dictionary) - dict_standart1
print ('\n2\nThe difference of extracted dictionary and common dictionary is', len (diff_dict_sets1))
print (diff_dict_sets1)
print("And the diff of sets are: ", len(diff_dict_sets1 ^ diff_dict_sets))
print(diff_dict_sets1 ^ diff_dict_sets)
diff_diff_dictionary= set(data_dictionary) - dict_standart - dict_standart1
print("If double diffs to do we get: ", len(diff_diff_dictionary))
print(diff_diff_dictionary)
\ No newline at end of file
...@@ -26,6 +26,13 @@ class Classifier: ...@@ -26,6 +26,13 @@ class Classifier:
self.train_data = corpus self.train_data = corpus
return return
# load external f=dictionary
file_name_dictionary = 'large.txt'
file_dict = open(file_name_dictionary, "r")
self.correct_words = set()
for word in file_dict:
self.correct_words.add(word[:-1])
# Extract data form DataBase # Extract data form DataBase
self.db = _mysql.connect(host=host, port=port, user=user, passwd=password, db=db) self.db = _mysql.connect(host=host, port=port, user=user, passwd=password, db=db)
...@@ -75,7 +82,7 @@ class Classifier: ...@@ -75,7 +82,7 @@ class Classifier:
texts = re.sub(r'\d', " ", texts) texts = re.sub(r'\d', " ", texts)
texts = texts.split(" ") texts = texts.split(" ")
texts = [word.lower() for word in texts] texts = [word.lower() for word in texts]
texts = [word for word in texts if len(word) > 3 and word not in stopwords] texts = [word for word in texts if (len(word) > 1) and (word not in stopwords) and (word in self.correct_words)]
# texts = [word for word in texts if word not in stopwords] # texts = [word for word in texts if word not in stopwords]
texts = [stem(word) for word in texts] texts = [stem(word) for word in texts]
texts = " ".join(texts) texts = " ".join(texts)
...@@ -284,7 +291,6 @@ if __name__ == "__main__": ...@@ -284,7 +291,6 @@ if __name__ == "__main__":
# tag = int(tag) # tag = int(tag)
# # print(tag) # # print(tag)
# print(str(tag) + " " + tags_classif.tags[tag - 1][1].decode('ascii', 'ignore')) # print(str(tag) + " " + tags_classif.tags[tag - 1][1].decode('ascii', 'ignore'))
#
# exit(0) # exit(0)
tags_classif.db.query("SELECT title, description, text FROM wp_esi_news_accept ORDER BY RAND() LIMIT 25") tags_classif.db.query("SELECT title, description, text FROM wp_esi_news_accept ORDER BY RAND() LIMIT 25")
result = tags_classif.db.store_result() result = tags_classif.db.store_result()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment