Commit 6d59a9b1 authored by Vasyl Bodnaruk

Add ML to pipeline

parent caf4548d
-Subproject commit 1b3a12f2c104ad6165cc64276c67ea19558eb83a
+Subproject commit 0e7fc2495576fb6859b35d0db90df9be5b693704
@@ -5,9 +5,13 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 from datetime import datetime
-from .helpers.db import Database
+import json
 from scrapy.utils.project import get_project_settings
 import redis
+from newspaper import Article
+from .esi_news_classification.news_classify_tag import Classifier
+from .helpers.db import Database
 
 db = get_project_settings().get('DB')
@@ -15,6 +19,8 @@ db = get_project_settings().get('DB')
 class ExaPipeline(object):
     def __init__(self):
         self.db = Database(**db)
+        self.classifier = Classifier()
+        self.classifier.teach_model()
         self.buffer = redis.StrictRedis()
         for i in self.db.select('select url from wp_esi_news_accept'):
             self.buffer.set(i[0], True)
@@ -39,11 +45,13 @@ class ExaPipeline(object):
         return item
 
     def insert_news(self, item):
+        tags = self.get_tags(item['url'])
         data = (item['title'], item['description'], item['url'], item['media_id'], item['type_id'],
                 item['region_id'], item['post_id'], item['date'], datetime.now().date(), item['company_id'], 0,
-                item['tags'])
+                item['tags'], tags)
         query = """INSERT INTO wp_esi_news_accept (title, description, URL, media_id, type_id, region_id, post_id,
-        publish_date, record_date, company_id, is_accepted, temp_tags) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);\n"""
+        publish_date, record_date, company_id, is_accepted, temp_tags, tags_id)
+        VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);\n"""
         self.db.insert(query, data)
         self.add_url_to_block(item['url'])
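
The rewritten statement now names 13 columns and 13 %s placeholders, so the data tuple must supply 13 values in matching order, with the new tags value feeding the tags_id column. A minimal sketch of that alignment against a plain DB-API cursor; the project's Database.insert wrapper is not shown here, and insert_news_row/cursor are illustrative names, not part of the commit:

# Sketch: keeping the 13 columns, 13 placeholders and 13 tuple values aligned.
# Assumes a DB-API compatible cursor (e.g. pymysql); names are illustrative.
from datetime import datetime

def insert_news_row(cursor, item, tags_json):
    data = (item['title'], item['description'], item['url'], item['media_id'],
            item['type_id'], item['region_id'], item['post_id'], item['date'],
            datetime.now().date(), item['company_id'], 0,   # is_accepted
            item['tags'],                                    # temp_tags
            tags_json)                                       # tags_id (JSON list)
    query = ("INSERT INTO wp_esi_news_accept (title, description, URL, media_id, "
             "type_id, region_id, post_id, publish_date, record_date, company_id, "
             "is_accepted, temp_tags, tags_id) "
             "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    cursor.execute(query, data)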
@@ -57,3 +65,14 @@ class ExaPipeline(object):
     def add_url_to_block(self, url):
         self.buffer.set(url, True)
 
+    def get_tags(self, url):
+        article = Article(url)
+        article.download()
+        article.parse()
+        self.classifier.classify(article.text)
+        tags = list()
+        for i in self.classifier.teg_accordance:
+            tags.append(i[0])
+        return json.dumps(tags)
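
For context, a self-contained sketch of the new tagging flow: newspaper3k downloads and parses the article, the classifier fills its teg_accordance list with entries whose first element is the tag name, and the tag names are serialized to JSON. The real Classifier lives in the repository's esi_news_classification package; DummyClassifier below is an assumed stand-in so the snippet runs on its own, and its keyword matching is purely illustrative:

# Standalone sketch of the get_tags() flow added above. DummyClassifier is an
# assumption: only classify() and teg_accordance (items whose first element is
# the tag name) are taken from the diff; everything else is illustrative.
import json

from newspaper import Article


class DummyClassifier:
    def __init__(self):
        self.teg_accordance = []          # filled in by classify()

    def classify(self, text):
        # Trivial keyword match standing in for the trained model.
        self.teg_accordance = [(word, 1.0) for word in ('funding', 'acquisition')
                               if word in text.lower()]


def get_tags(url, classifier):
    article = Article(url)
    article.download()                    # fetch the page over HTTP
    article.parse()                       # extract the main article text
    classifier.classify(article.text)
    return json.dumps([entry[0] for entry in classifier.teg_accordance])


if __name__ == '__main__':
    print(get_tags('https://example.com/some-news-article', DummyClassifier()))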
@@ -3,6 +3,7 @@ scrapy-fake-useragent==1.1.0
 python-scrapyd-api==2.0.1
 scrapyd-client==1.1.0
 scrapy-proxies==0.3
+newspaper3k==0.2.2
 PyVirtualDisplay==0.2.1
 selenium==3.4.1
 dateparser==0.6.0