Commit d23cdbbe authored by Vasyl Bodnaruk

Add redis to pipeline

parent 98bf7249
......@@ -7,6 +7,7 @@
from datetime import datetime
from .helpers.db import Database
from scrapy.utils.project import get_project_settings
import redis
db = get_project_settings().get('DB')
......@@ -14,6 +15,9 @@ db = get_project_settings().get('DB')
class ExaPipeline(object):
def __init__(self):
    """Open the database and seed the known-URL caches.

    Connects to the database described by the module-level ``db`` settings,
    creates a Redis client with default connection parameters, and loads
    every URL already present in ``wp_esi_news_accept`` into both the
    in-memory ``self.urls`` set and the Redis buffer.
    """
    self.db = Database(**db)
    self.buffer = redis.StrictRedis()
    # Query once and reuse the result for both caches. Rows are indexed
    # with [0] to get the URL string, so Redis keys are plain URLs rather
    # than stringified row objects.
    self.urls = {row[0] for row in self.db.select('select url from wp_esi_news_accept')}
    for url in self.urls:
        self.buffer.set(url, True)
    super(ExaPipeline, self).__init__()
......@@ -27,6 +31,11 @@ class ExaPipeline(object):
if item['tags']:
item['tags'] = ','.join(item['tags']).replace('\n', '').replace('\t', '')
if self.check_url(item['url']):
print("DUPLICATE")
else:
print("UNIQUE")
self.insert_news(item)
return item
......@@ -36,12 +45,9 @@ class ExaPipeline(object):
item['tags'])
query = """INSERT INTO wp_esi_news_accept (title, description, URL, media_id, type_id, region_id, post_id,
publish_date, record_date, company_id, is_accepted, temp_tags) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);\n"""
if self.check_url(item['url']):
print("DUPLICATE", item)
else:
print("UNIQUE", item)
self.db.insert(query, data)
self.urls.add(item['url'])
self.add_url_to_block(item['url'])
def check_url(self, url):
if url in self.urls:
......@@ -49,3 +55,5 @@ class ExaPipeline(object):
else:
return False
def add_url_to_block(self, url):
    """Add ``url`` to ``self.urls``, the collection consulted by ``check_url``."""
    known_urls = self.urls
    known_urls.add(url)
......@@ -23,6 +23,3 @@ class BaseSpider(scrapy.Spider):
companies = CompanyMaker(db.select(self.query))
companies.make_companies(name)
return companies.get_companies()
def check_buffer(self, url):
    """Placeholder hook: accepts a URL, performs no work, returns ``None``."""
    return None
......@@ -12,3 +12,5 @@ django-cors-headers==2.1.0
celery==4.0.2
flower==0.9.2
django-celery-beat==1.0.1
redis==2.10.5
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment