# update.py
import json
import traceback

from newspaper import Article
from scrapy.utils.project import get_project_settings

from exa.esi_news_classification.news_classify_tag import Classifier
from exa.helpers import Database

db_cred = get_project_settings().get('DB')


class NewsUpdater:
    """Refresh the tags and text of rows in the ``wp_esi_news_accept`` table.

    The instance owns a ``Classifier`` (used to derive tags from article
    text) and a ``Database`` helper (used for every select/update).
    """

    # Shared prefix of the queries that list the news rows to process.
    _SELECT_ID_URL = 'select id, url from wp_esi_news_accept '

    def __init__(self):
        """Build the classifier, call its ``teach_model()`` and create the
        ``Database`` helper from the module-level ``db_cred`` settings."""
        self.classifier = Classifier()
        self.classifier.teach_model()
        self.db = Database(**db_cred)

    def select_news(self, query):
        """Run ``query`` through ``self.db.select`` and return its result."""
        return self.db.select(query)

    def load_text(self, url):
        """Download and parse the article at ``url``; return its text."""
        article = Article(url)
        article.download()
        article.parse()
        return article.text

    def get_tags(self, text):
        """Classify ``text`` and return its tags as a JSON array string.

        After ``classify`` runs, the first element of every entry in
        ``self.classifier.teg_accordance`` (attribute name as spelled by the
        classifier) is collected. No entries yields the string ``'[]'``.
        """
        self.classifier.classify(text)
        return json.dumps([entry[0] for entry in self.classifier.teg_accordance])

    def update_news(self, query, data):
        """Execute the parameterized update ``query`` with ``data``."""
        self.db.update(query, data)

    def _select_id_url(self, condition):
        """Return the ``(id, url)`` rows selected by ``condition``.

        ``condition`` is appended verbatim to the SQL statement, so it must
        come from trusted code, never from user input.
        """
        return self.select_news(self._SELECT_ID_URL + condition)

    def update_all_tags(self, condition='where 1'):
        """Recompute ``tags_id`` for every news row matching ``condition``.

        Each article is downloaded, classified and written back one row at
        a time. A failure on one row is reported with its traceback and the
        loop moves on to the next row. Rows whose article yields no tags get
        ``NULL`` instead of an empty JSON array.

        NOTE: the original author flagged this approach as "a bad way";
        presumably because of the per-row download and update -- confirm.

        :param condition: SQL fragment appended verbatim after the table
            name (trusted input only).
        """
        for row in self._select_id_url(condition):
            news_id, url = row[0], row[1]
            try:
                tags = self.get_tags(self.load_text(url))
                # get_tags returns a JSON *string*; an empty tag list is the
                # two-character string '[]', which len() would never catch.
                if not tags or tags == '[]':
                    tags = None
                self.update_news(
                    'update wp_esi_news_accept set tags_id=%s where id=%s',
                    (tags, news_id),
                )
                print('News id={} was updated'.format(news_id))
            except Exception:
                # Exception (not BaseException) so Ctrl-C / SystemExit still
                # stop the run; print_exc reports the active exception.
                traceback.print_exc()

    def update_all_text(self, condition='where 1'):
        """Re-download the text of every news row matching ``condition``.

        The text is stored ASCII-encoded with non-ASCII characters dropped.
        Unlike ``update_all_tags``, errors are not caught here: the first
        failing row aborts the run.

        :param condition: SQL fragment appended verbatim after the table
            name (trusted input only).
        """
        for row in self._select_id_url(condition):
            news_id, url = row[0], row[1]
            text = self.load_text(url)
            data = (text.encode('ascii', 'ignore'), news_id)
            self.update_news('update wp_esi_news_accept set text=%s where id=%s', data)
            print('News id={} was updated'.format(news_id))

58 59 60

if __name__ == '__main__':
    # Script entry point: re-tag every stored news row (default condition).
    updater = NewsUpdater()
    updater.update_all_tags()