# update.py
import json
import traceback

from newspaper import Article
from scrapy.utils.project import get_project_settings

from exa.classificator.news_classify_tag import Classifier
from exa.helpers import Database
from exa.helpers import MLDataMaker, News
7

8
# Database connection settings, read from the Scrapy project settings under the
# 'DB' key.  Updater unpacks this mapping as keyword arguments for Database and
# reads its 'host', 'user', 'pwd' and 'database' entries when building the
# Classifier.
db = get_project_settings().get('DB')
9 10


Vasyl Bodnaruk's avatar
Vasyl Bodnaruk committed
11
class Updater:
    """Batch maintenance operations over the ``wp_esi_news_accept`` table.

    Each ``update_*`` method selects rows from the table, derives a value
    (article text, classifier tags, rows for ``wp_esi_ml_data``) and writes
    it back through the ``Database`` wrapper held in ``self.db``.
    """

    def __init__(self):
        # The classifier is only created by update_all_tags(); get_tags()
        # needs it to be set before it is called.
        self.classifier = None
        self.db = Database(**db)

    def select_news(self, query):
        """Run the select statement *query* and return the database result."""
        return self.db.select(query)

    def load_text(self, url):
        """Download the article at *url*, parse it and return its text."""
        article = Article(url)
        article.download()
        article.parse()
        return article.text

    def _build_classifier(self):
        """Create a Classifier from the module-level DB settings and teach it."""
        classifier = Classifier(host=db['host'], user=db['user'], password=db['pwd'], db=db['database'], port=3306)
        classifier.teach_model()
        return classifier

    def _classify(self, text):
        """Classify *text* and return the first element of every matched entry.

        ``self.classifier`` must already be set (see update_all_tags()).
        """
        self.classifier.classify(text)
        return [match[0] for match in self.classifier.tag_accordance]

    def get_tags(self, text):
        """Return the tags the classifier assigns to *text* as a JSON array string.

        An article without any matching tag yields the string ``'[]'``.
        """
        return json.dumps(self._classify(text))

    def update_news(self, query, data):
        """Execute the parameterised update statement *query* with *data*."""
        self.db.update(query, data)

    def update_all_tags(self, condition='where 1'):
        """Re-classify every selected news item and store its tags in ``tags_id``.

        *condition* is appended verbatim to the select statement, so it must
        be a trusted SQL fragment (for example ``'where id > 100'``).  A news
        item that fails to download or classify is reported and skipped; the
        remaining items are still processed.
        """
        self.classifier = self._build_classifier()
        rows = self.select_news('select id, url from wp_esi_news_accept ' + condition)
        for row in rows:
            news_id, url = row[0], row[1]
            try:
                tags = self._classify(self.load_text(url))
                # Store NULL when nothing matched.  The emptiness check has to
                # happen on the list: the JSON string '[]' is never empty.
                tags_json = json.dumps(tags) if tags else None
                self.update_news('update wp_esi_news_accept set tags_id=%s where id=%s', (tags_json, news_id))
                print('News id={} was updated'.format(news_id))
            except Exception:
                # Best effort per item: report the failure and carry on.
                # KeyboardInterrupt / SystemExit are deliberately not caught.
                print('News id={} was not updated'.format(news_id))
                print(traceback.format_exc())

    def update_all_text(self, condition='where 1'):
        """Re-download every selected news item and store its text in ``text``.

        *condition* is appended verbatim to the select statement, so it must
        be a trusted SQL fragment.  A news item that fails to download is
        reported and skipped; the remaining items are still processed.
        """
        rows = self.select_news('select id, url from wp_esi_news_accept ' + condition)
        for row in rows:
            news_id, url = row[0], row[1]
            try:
                text = self.load_text(url)
                # Non-ASCII characters are dropped before storing.
                # NOTE(review): presumably a workaround for the column
                # charset - confirm before changing.
                data = (text.encode('ascii', 'ignore'), news_id)
                self.update_news('update wp_esi_news_accept set text=%s where id=%s', data)
                print('News id={} was updated'.format(news_id))
            except Exception:
                # Best effort per item: report the failure and carry on.
                print('News id={} was not updated'.format(news_id))
                print(traceback.format_exc())

    def update_ml_data(self, query='where 1'):
        """Build rows for ``wp_esi_ml_data`` from the selected news and insert them.

        *query* is a trusted SQL fragment formatted into the select
        statement.  Every selected ``(id, temp_tags)`` row is wrapped in
        ``News`` and passed to ``MLDataMaker.make_ml_data``; the items it
        returns are inserted as ``(news_id, tag_id)`` pairs and their count
        is printed.
        """
        maker = MLDataMaker(self.db)
        rows = self.select_news('select id, temp_tags from wp_esi_news_accept {}'.format(query))

        ml_items = []
        for news_id, temp_tags in rows:
            made = maker.make_ml_data(News(news_id, temp_tags))
            if made:
                ml_items.extend(made)

        insert_query = 'insert into wp_esi_ml_data(news_id, tag_id) values(%s, %s)'
        data = [(item.news_id, item.tags_id) for item in ml_items]
        self.db.insert_many(insert_query, data)

        print(len(data))

    def update_radar_ids(self):
        """Print every news id with its tags and resolve the tags to radar ids.

        The tag -> radar mapping is read from ``wp_esi_radar_tag``; tags
        without an entry in that table are ignored.
        """
        tag_radar = {row[0]: row[1] for row in self.db.select('select tag_id, radar_id from wp_esi_radar_tag')}
        news = self.db.select('select id, tags_id from wp_esi_news_accept')
        for news_id, tags in news:
            print(news_id, tags)
            if tags:
                # NOTE(review): the resolved radar ids are computed but never
                # stored or returned - this method looks unfinished; confirm
                # where the result is meant to be written.
                radar = [tag_radar[tag] for tag in json.loads(tags) if tag in tag_radar]

86 87

if __name__ == '__main__':
    # Script entry point: build an Updater and run the radar-id pass once.
    Updater().update_radar_ids()