Fix Init issues clear: DB info.

Clear main, cross-validation Methods.

Fix Init issues clear: DB info.
Clear main, cross-validation Methods.
566e05d9 · Tags · ca1fa8a0 · 566e05d9
Commit 566e05d9 authored Sep 19, 2017 by Tags
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 24 additions and 16 deletions

news_classify_tag.py news_classify_tag.py +24 -16

No files found.
--- a/news_classify_tag.py
+++ b/news_classify_tag.py
@@ -16,8 +16,10 @@ class Classifier:
        except ImportError:
            print('You have import flowing packages: sklearn & nltk & re.')

-        stopwords = set(nltk.corpus.stopwords.words('english'))
-        stopwords.update(['from:', 'subject:', 'writes:', 'writes', 'click', 'here', 'page', 'origin'])
+
+        # stopwords for text_clearner Method
+        self.stopwords = set(nltk.corpus.stopwords.words('english'))
+        self.stopwords.update(['from:', 'subject:', 'writes:', 'writes', 'click', 'here', 'page', 'origin'])

        # if corpus is not None:
        #     self.train_data = corpus
@@ -39,6 +41,12 @@ class Classifier:
        for word in file_dict:
            self.correct_words.add(word[:-1])

+        #
+        # from stemming.porter2 import stem
+        from nltk.stem import PorterStemmer
+        self.stem = PorterStemmer().stem
+        # self.stem.
+
        # getting tags
        self.db.query("SELECT id, name FROM wp_esi_tag")
        rez = self.db.store_result()
@@ -51,18 +59,17 @@ class Classifier:

    def text_clear(self, texts):
        import re
-        from stemming.porter2 import stem
-
-        stopwords = set(nltk.corpus.stopwords.words('english'))
-        stopwords.update(['from:', 'subject:', 'writes:', 'writes', 'click', 'here', 'page', 'origin'])
-
+        from nltk.tokenize import word_tokenize
        texts = re.sub(r'\d', "", texts)
-        texts = re.sub(r'\s', " ", texts)
-        texts = texts.split(" ")
-        texts = [word.lower() for word in texts if word not in stopwords]
+        texts = texts.translate(str.maketrans("?!,.+-:;\/", 10*" "))
+        texts = word_tokenize(texts) #texts.split(" ")
+        texts = [word.lower() for word in texts if word not in self.stopwords]
        # Addition incorrect words
        # texts = [word for word in texts if word not in stopwords and word in self.correct_words]
-        texts = [stem(word) for word in texts]
+        try:
+            texts = [self.stem(word) for word in texts if len(word) > 0]
+        except:
+            pass
        texts = " ".join(texts)
        return texts

@@ -71,14 +78,13 @@ class Classifier:
        from sklearn.pipeline import Pipeline
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.neighbors import KNeighborsClassifier
-        from sklearn.neural_network import MLPClassifier
+        # from sklearn.neural_network import MLPClassifier

        sql1 = '''SELECT wp_esi_ml_data.tag_id, wp_esi_news_accept.title, wp_esi_news_accept.description, wp_esi_news_accept.text
                                       FROM wp_esi_news_accept, wp_esi_ml_data WHERE wp_esi_ml_data.news_id=wp_esi_news_accept.id
                                          ORDER BY wp_esi_ml_data.tag_id'''
        self.db.query(sql1)
        result = self.db.store_result()
-
        self.X_text_data = list()
        self.y_data = list()
        for tag_id, title, description, text in result.fetch_row(maxrows=0):
@@ -92,6 +98,8 @@ class Classifier:
            if texts != '':
                self.X_text_data.append(self.text_clear(texts))
                self.y_data.append(int(tag_id))
+        # print(self.X_text_data)
+        # print(self.y_data)

        # _____________ entity develop ______________
        sql_entity = "SELECT id, name FROM wp_esi_entity"
@@ -108,7 +116,6 @@ class Classifier:
        self.entity_news = list(
            (entity_id, news_text) for entity_id, news_text in result_entity_news.fetch_row(maxrows=0))
        # print("total # of news linked with entity is", len(self.entity_news))
-        sql_entity_news = """ SELECT company_id, concat(title, text) FROM wp_esi_news_accept ORDER BY company_id """

        # ________Entitys -- tags _________________
        sql_entity = "SELECT entity_id, tag_id FROM wp_esi_tag_entity"
@@ -125,7 +132,7 @@ class Classifier:
                # exit(0)


-        self.vectorizer = CountVectorizer(min_df=2, stop_words='english', ngram_range=(1, 1))
+        self.vectorizer = CountVectorizer(min_df=1, stop_words='english', ngram_range=(1, 1))
        self.tfidf = TfidfTransformer()
        # k-mean model
        # self.classifier = KNeighborsClassifier()
@@ -162,7 +169,7 @@ class Classifier:
        # exit(0)

    @property
-    def tag_accordance(self, persantage=50):
+    def tag_accordance(self, persantage=20):
        """
        Class method for computing % af tag accordance 
        :return: tuple of % af tag accordance
@@ -207,6 +214,7 @@ class Classifier:
        if type(text) is bytes:
            text.decode('ascii', 'ignore')
        matrix_test_data = self.vectorizer.transform([self.text_clear(text_test)])
+        matrix_test_data = self.tfidf.fit_transform(matrix_test_data)
        rez = self.classifier.predict_proba(matrix_test_data)
        # exit(0)
        self.likelihood_list = rez[0]