Commit 6d59a9b1 authored by Vasyl Bodnaruk's avatar Vasyl Bodnaruk

Add ML to pipeline

parent caf4548d
Subproject commit 1b3a12f2c104ad6165cc64276c67ea19558eb83a Subproject commit 0e7fc2495576fb6859b35d0db90df9be5b693704
...@@ -5,9 +5,13 @@ ...@@ -5,9 +5,13 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting # Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from datetime import datetime from datetime import datetime
from .helpers.db import Database import json
from scrapy.utils.project import get_project_settings from scrapy.utils.project import get_project_settings
import redis import redis
from newspaper import Article
from .esi_news_classification.news_classify_tag import Classifier
from .helpers.db import Database
db = get_project_settings().get('DB') db = get_project_settings().get('DB')
...@@ -15,6 +19,8 @@ db = get_project_settings().get('DB') ...@@ -15,6 +19,8 @@ db = get_project_settings().get('DB')
class ExaPipeline(object): class ExaPipeline(object):
def __init__(self): def __init__(self):
self.db = Database(**db) self.db = Database(**db)
self.classifier = Classifier()
self.classifier.teach_model()
self.buffer = redis.StrictRedis() self.buffer = redis.StrictRedis()
for i in self.db.select('select url from wp_esi_news_accept'): for i in self.db.select('select url from wp_esi_news_accept'):
self.buffer.set(i[0], True) self.buffer.set(i[0], True)
...@@ -39,11 +45,13 @@ class ExaPipeline(object): ...@@ -39,11 +45,13 @@ class ExaPipeline(object):
return item return item
def insert_news(self, item): def insert_news(self, item):
tags = self.get_tags(item['url'])
data = (item['title'], item['description'], item['url'], item['media_id'], item['type_id'], data = (item['title'], item['description'], item['url'], item['media_id'], item['type_id'],
item['region_id'], item['post_id'], item['date'], datetime.now().date(), item['company_id'], 0, item['region_id'], item['post_id'], item['date'], datetime.now().date(), item['company_id'], 0,
item['tags']) item['tags'], tags)
query = """INSERT INTO wp_esi_news_accept (title, description, URL, media_id, type_id, region_id, post_id, query = """INSERT INTO wp_esi_news_accept (title, description, URL, media_id, type_id, region_id, post_id,
publish_date, record_date, company_id, is_accepted, temp_tags) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);\n""" publish_date, record_date, company_id, is_accepted, temp_tags, tags_id)
VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);\n"""
self.db.insert(query, data) self.db.insert(query, data)
self.add_url_to_block(item['url']) self.add_url_to_block(item['url'])
...@@ -57,3 +65,14 @@ class ExaPipeline(object): ...@@ -57,3 +65,14 @@ class ExaPipeline(object):
def add_url_to_block(self, url): def add_url_to_block(self, url):
self.buffer.set(url, True) self.buffer.set(url, True)
def get_tags(self, url):
    """Fetch the article at *url*, run the classifier over its text, and
    return the matched tag ids as a JSON-encoded list (string).

    NOTE(review): relies on ``classify()`` populating the classifier's
    ``teg_accordance`` attribute as a side effect; each entry's first
    element is presumably a tag id — confirm against Classifier.
    """
    page = Article(url)
    page.download()
    page.parse()
    # classify() is called for its side effect on teg_accordance
    self.classifier.classify(page.text)
    matched = [entry[0] for entry in self.classifier.teg_accordance]
    return json.dumps(matched)
...@@ -3,6 +3,7 @@ scrapy-fake-useragent==1.1.0 ...@@ -3,6 +3,7 @@ scrapy-fake-useragent==1.1.0
python-scrapyd-api==2.0.1 python-scrapyd-api==2.0.1
scrapyd-client==1.1.0 scrapyd-client==1.1.0
scrapy-proxies==0.3 scrapy-proxies==0.3
newspaper3k==0.2.2
PyVirtualDisplay==0.2.1 PyVirtualDisplay==0.2.1
selenium==3.4.1 selenium==3.4.1
dateparser==0.6.0 dateparser==0.6.0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment