Resolve issues

a459780e · Tags · a0879988 · a459780e · a0879988 · a459780e
Commit a459780e authored Aug 01, 2017 by Tags
Hide whitespace changes
Inline Side-by-side

Showing with 7 additions and 205 deletions

news_classify_tag.py news_classify_tag.py +4 -3

requaiments.txt requaiments.txt +0 -0

requirements.txt requirements.txt +3 -0

test_db.py test_db.py +0 -202

No files found.
--- a/news_classify_tag.py
+++ b/news_classify_tag.py
 # import sys
 # from _mysql import OperationalError
 # from pprint import pprint
+import nltk
 import numpy as np
 import _mysql

@@ -12,7 +13,7 @@ class Classifier:
    def __init__(self, corpus=None, host='176.58.117.151', \
                 user='esi', password='esi12345', db='esi', port=3306):
        try:
-            import nltk
+            # import nltk
            import operator
            from stemming.porter2 import stem
        except ImportError:
@@ -65,7 +66,7 @@ class Classifier:

    def text_clear(self, texts):
        import re
-        import nltk
+        # import nltk
        from stemming.porter2 import stem

        stopwords = set (nltk.corpus.stopwords.words ('english'))
@@ -156,7 +157,7 @@ class Classifier:
        self.matrix_test_data = matrix_test_data

    def graph_results(self):
-        import numpy as np
+        # import numpy as np
        from pylab import figure, show, hist
        print (self.total_results[3])
        data = []

--- a/requaiments.txt
+++ b/requaiments.txt
--- a/requirements.txt
+++ b/requirements.txt
+DateTime==4.2
 mysqlclient==1.3.10
 nltk==3.2.4
 numpy==1.13.1
 pkg-resources==0.0.0
+pytz==2017.2
 scikit-learn==0.18.2
 scipy==0.19.1
 six==1.10.0
 sklearn==0.0
 stemming==1.0.1
+zope.interface==4.4.2
--- a/test_db.py
+++ b/test_db.py
-import sys
-
-import nltk
-from stemming.porter2 import stem
-
-stopwords = set (nltk.corpus.stopwords.words ('english'))
-stopwords.update (['from:', 'subject:', 'writes:', 'writes', 'click', 'here', 'page', 'origin'])
-
-
-class Classifier ():
-    """
-    Classifier class which gets text data from the database
-    and builds a classifier vector.
-    """
-
-    def __init__(self, corpus=None, host='localhost', port=8000, user='root', password='password', db='news'):
-        try:
-            from MySQLdb import connect
-            # import sklearn.datasets
-            import nltk.stem
-            import re
-            from stemming.porter2 import stem
-        except ImportError:
-            print ('You have import flowing packages: sklearn & nltk & re.')
-
-        english_steamer = nltk.stem.SnowballStemmer ('english')
-        stopwords = set (nltk.corpus.stopwords.words ('english'))
-        stopwords.update (['from:', 'subject:', 'writes:', 'writes', 'click', 'here', 'page', 'origin'])
-
-        if corpus is not None:
-            self.train_data = corpus
-            return
-
-        # Extract data from the database
-        conn = connect (host=host, port=port, user=user, password=password, db=db)
-        cur = conn.cursor ()
-        self.cursor = cur
-
-        result = cur.execute ("select id, name from wp_esi_tag")
-        tag = list ()
-        tag_description = list ()
-        for item in cur.fetchall ():
-            if item is not None:
-                tag.append (item[0])
-                tag_description.append (item[1])
-
-        # tags
-        tags = list ()
-        for item in zip (tag, tag_description):
-            tags.append (item)
-        self.tags = tuple (tags)
-        # print (tags)
-
-        train_data = []
-        # text_id = []
-        for id, tag_description in tags:
-            sql1 = "select wp_esi_news.title, wp_esi_news.description  from wp_esi_news, wp_esi_tag_news WHERE wp_esi_tag_news.news_id=wp_esi_news.id AND wp_esi_tag_news.tag_id=" + str (
-                id)
-            result = cur.execute (sql1)
-            # process text by regexp
-            # text_id = id
-            for title, decription in cur.fetchall ():
-                texts = str (title) + str (decription)
-                # texts =
-                texts = self.text_clear (texts)
-                # text_id[id] = texts
-            train_data.append (texts)
-
-        # train_data= list id of text list
-        self.train_data_text = train_data
-        self.train_data_text
-
-    @staticmethod
-    def text_clear(texts):
-        import re
-        texts = re.sub (r'\d', " ", texts)
-        texts = texts.split (" ")
-        texts = [word.lower () for word in texts]
-        texts = [word for word in texts if len (word) > 3 and word not in stopwords]
-        texts = [stem (word) for word in texts]
-        texts = " ".join (texts)
-        return texts
-
-    def __repr__(self):
-        pass
-
-    def clasif_fit(self, data_text=''):
-        from sklearn.feature_extraction.text import CountVectorizer
-        self.vectorizer = CountVectorizer (min_df=1)
-        if data_text == '':
-            data_text = self.train_data_text
-        matrix_train_data = self.vectorizer.fit_transform (data_text)
-        self.matrix_fited = matrix_train_data
-        print ("number of feaches =", len (self.vectorizer.get_feature_names ()), self.vectorizer.get_feature_names ())
-        print ("Quontety of tags = ", len (self.tags), self.tags)
-        # print (matrix_train_data.toarray ().transpose ())
-        return matrix_train_data
-
-    def test_cllasifyer(self, test_corpus=None):
-        if test_corpus is None:
-            cur = self.cursor
-            result = cur.execute ("select title, description from wp_esi_news_accept")
-            title, description = cur.fetchone ()
-
-            text_test = title + " " + description
-        else:
-            text_test = test_corpus
-
-        nltk.pprint ("Test text: " + test_corpus)
-        text_test = self.text_clear (text_test)
-        matrix_test_data = self.vectorizer.transform ([text_test])
-        self.matrix_test = matrix_test_data
-        # print (matrix_test_data)
-        # print (matrix_test_data.toarray ())
-        return matrix_test_data
-
-
-def dist(v1, v2):
-    import scipy as sp
-    # delta = v1 - v2
-    v1_norm = v1 / sp.linalg.norm (v1.toarray ())
-    v2_norm = v2 / sp.linalg.norm (v2.toarray ())
-    delta = v1_norm - v2_norm
-    return sp.linalg.norm (delta.toarray ())
-
-
-if __name__ == "__main__":
-    # text = Classifier()
-    text = Classifier (host='176.58.117.151', user='esi', password='esi12345)',\
-                       db='esi', port=3306)  # host="", password=, user=, bd=
-    # print (text.train_data_text)
-    trained_model = text.clasif_fit ()
-
-    num_samples, num_features = trained_model.shape
-    # print("# samples: ", num_samples, "# features", num_features)
-    # find  best fit
-
-    print()
-
-    cur = text.cursor
-    result = cur.execute ("select title, description from wp_esi_news_accept")
-    numb_of_news_to_show = 10
-    news_item=0
-    for title, description in cur.fetchall():
-        if numb_of_news_to_show < news_item:
-            break
-        else:
-            news_item += 1
-        print("\n #" + str(news_item))
-        # nltk.pprint (title + " " + description)
-        text_test = title + " " + description
-        # print ("Test text: ")
-        tested_model = text.test_cllasifyer (test_corpus=text_test)
-
-        best_doc = None
-        best_dist = sys.maxsize
-        best_i = None
-        fit_array = []
-        # Calculating distance function
-        for i in range (0, num_samples):
-            post_vect = trained_model.getrow (i)
-            d = dist (post_vect, tested_model)
-            # print(i, "\t Distanse = ", d)
-            fit_array.append (((i, d)))
-            if d < best_dist:
-                best_dist = d
-                best_i = i
-        # info results
-        print ("=====================================")
-        print ("Best fit for this news is ", best_i, ' - tag')
-        # print(text.tags[best_i])
-        import operator
-
-        sorted_fit_array = sorted (fit_array, key=operator.itemgetter (1))
-        # import collections
-        # od =
-        i = 0
-        print("=====================================")
-        print("#-tag |\tdistance \t |  tag label")
-        print("=====================================")
-        for numb, val in sorted_fit_array:
-            print (numb, " | " , val," | " ,text.tags[numb][1])
-            i += 1
-            if i > 10:
-                break
-                # pprint(sorted_fit_array)
-                print ("=====================================")
-
-
-    exit (0)
-
-
-    # all_data = sklearn.datasets.fetch_20newsgroups (subset="all")
-    # # Number of total posts: 18846
-    #
-    # groups = [
-    #     'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
-    #     'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
-    # train_data = sklearn.datasets.fetch_20newsgroups (subset="train", categories=groups)
-    #
-    # train_dat_my = sklearn.feature_extraction
-    # num_clusters = 50  # sp.unique(labels).shape[0]