Commit 07129d22 authored by Tags's avatar Tags

Add External dictionary to analysis model

parent 5a945d1a
def text_clear(word):
    """Normalize *word*: lowercase it and reduce it to its Porter2 stem.

    Returns the stemmed form as a string.
    """
    # Imported lazily so the module loads even without the `stemming` package.
    from stemming.porter2 import stem
    return stem(word.lower())
# import nltk
from nltk.stem.lancaster import LancasterStemmer
if __name__ == "__main__":
    # Compare the Porter2 stemmer (text_clear) against NLTK's Lancaster
    # stemmer on a small sample vocabulary (including a few misspellings).
    lancaster = LancasterStemmer()
    samples = ['little', 'long', 'longer', 'longest', 'family', 'trendline', 'unable', 'able', 'understable', "TomSoyr", 'largest', 'aumobiles' ]
    for word in samples:
        print(word, ':', text_clear(word), ":", lancaster.stem(word))
\ No newline at end of file
......@@ -128,15 +128,16 @@ if __name__ == '__main__':
np_matrix = np.delete(np_matrix, tags_empty, axis=0)
print('\n')
print('Now tags are', len(tags))
print(tags)
print(np_matrix.shape)
np_transpose = np_matrix.transpose()
print(np_transpose.shape)
# print(np_transpose.shape)
# print(np_transpose[2][25])
for i in range(np_transpose.shape[0]):
for j in range(np_transpose.shape[1]):
if np_transpose[i][j] != 0:
np_transpose[i][j] = 1
# print(i,j)
# print(i,j)
# words frequency measurement
words_frequency={}
......@@ -144,21 +145,32 @@ if __name__ == '__main__':
# print(row, sum(np_transpose[row]), data_dictionary[row])
words_frequency[data_dictionary[row]]= (int(sum(np_transpose[row])))
# print(words_frequency)
frequency = list(words_frequency.values())
# !!!!graph results
graph_results(frequency, "The Words Frequencies")
frequency = list(words_frequency.values())
# graph_results(frequency, "The Words Frequencies")
# there are some words with a great frequency
very_frequent_words = dict([(key, value) for key, value in words_frequency.items() if value > 20])
print('There are ', len(very_frequent_words), ' very frequent words: ')
print(very_frequent_words)
for item in sorted(very_frequent_words, key=very_frequent_words.get, reverse=True):
print(item, ":", very_frequent_words[item], end=", ")
# compare words set from news to words of DICTIONARY
dict_standart = set([text_clear(word) for word in open('large.txt', 'r').read().split('\n')])
dict_standart1 = set([word for word in open('large.txt', 'r').read().split('\n')])
# print(len(dict_standart), dict_standart)
diff_dict_sets = set(data_dictionary) - dict_standart
print('The difference of extracted dictionary and common dictionary is', len(diff_dict_sets))
print('\nThe difference of extracted dictionary and common dictionary is', len(diff_dict_sets))
print(diff_dict_sets)
diff_dict_sets1 = set(data_dictionary) - dict_standart1
print ('\n2\nThe difference of extracted dictionary and common dictionary is', len (diff_dict_sets1))
print (diff_dict_sets1)
print("And the diff of sets are: ", len(diff_dict_sets1 ^ diff_dict_sets))
print(diff_dict_sets1 ^ diff_dict_sets)
diff_diff_dictionary= set(data_dictionary) - dict_standart - dict_standart1
print("If double diffs to do we get: ", len(diff_diff_dictionary))
print(diff_diff_dictionary)
\ No newline at end of file
......@@ -26,6 +26,13 @@ class Classifier:
self.train_data = corpus
return
# load the external dictionary of correct words
file_name_dictionary = 'large.txt'
file_dict = open(file_name_dictionary, "r")
self.correct_words = set()
for word in file_dict:
self.correct_words.add(word[:-1])
# Extract data form DataBase
self.db = _mysql.connect(host=host, port=port, user=user, passwd=password, db=db)
......@@ -75,7 +82,7 @@ class Classifier:
texts = re.sub(r'\d', " ", texts)
texts = texts.split(" ")
texts = [word.lower() for word in texts]
texts = [word for word in texts if len(word) > 3 and word not in stopwords]
texts = [word for word in texts if (len(word) > 1) and (word not in stopwords) and (word in self.correct_words)]
# texts = [word for word in texts if word not in stopwords]
texts = [stem(word) for word in texts]
texts = " ".join(texts)
......@@ -284,7 +291,6 @@ if __name__ == "__main__":
# tag = int(tag)
# # print(tag)
# print(str(tag) + " " + tags_classif.tags[tag - 1][1].decode('ascii', 'ignore'))
#
# exit(0)
tags_classif.db.query("SELECT title, description, text FROM wp_esi_news_accept ORDER BY RAND() LIMIT 25")
result = tags_classif.db.store_result()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment