Commit c5d3c31d authored by Vasyl Bodnaruk

Scrap freshness on parse_by_title_and_description

parent ece88c78
...@@ -34,7 +34,7 @@ class AitopSpider(BaseSpider): ...@@ -34,7 +34,7 @@ class AitopSpider(BaseSpider):
if url_for_parse_tag: if url_for_parse_tag:
return scrapy.Request(url_for_parse_tag, callback=self.parse_by_tag, meta=response.meta) return scrapy.Request(url_for_parse_tag, callback=self.parse_by_tag, meta=response.meta)
else: else:
return scrapy.Request(response.url, callback=self.parse_by_title_description, meta=response.meta) return scrapy.Request(response.url, callback=self.parse_by_title_description, dont_filter=True, meta=response.meta)
def parse_by_title_description(self, response): def parse_by_title_description(self, response):
company = response.meta['company'] company = response.meta['company']
...@@ -64,10 +64,13 @@ class AitopSpider(BaseSpider): ...@@ -64,10 +64,13 @@ class AitopSpider(BaseSpider):
item['tags'] = i.xpath( item['tags'] = i.xpath(
".//div[@class='row hidden-xs']//div[@title='Concept Tags']//a[@class='filter btn btn-link']/text()").extract() ".//div[@class='row hidden-xs']//div[@title='Concept Tags']//a[@class='filter btn btn-link']/text()").extract()
if is_company_in_item(item): if is_company_in_item(item):
yield i if self.pipeline.check_url(item['url']):
is_duplicate = True
break
yield item
next_url = self.next_url(response) next_url = self.next_url(response)
if next_url: if self.can_follow(next_url, is_duplicate):
yield scrapy.Request(next_url, callback=self.parse_by_title_description, meta=response.meta) yield scrapy.Request(next_url, callback=self.parse_by_title_description, meta=response.meta)
else: else:
print("DUPLICATE NEWS") print("DUPLICATE NEWS")
......
...@@ -10,7 +10,7 @@ class BaseSpider(scrapy.Spider): ...@@ -10,7 +10,7 @@ class BaseSpider(scrapy.Spider):
def __init__(self, query=None, *args, **kwargs): def __init__(self, query=None, *args, **kwargs):
super(BaseSpider, self).__init__(*args, **kwargs) super(BaseSpider, self).__init__(*args, **kwargs)
self.condition = query self.condition = query
self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=13" self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=17"
if self.condition: if self.condition:
print(self.condition) print(self.condition)
self.query += ' or {}'.format(self.condition) self.query += ' or {}'.format(self.condition)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment