Commit ece88c78 authored by Vasyl Bodnaruk's avatar Vasyl Bodnaruk

Scrap freshness on parse_by_tag

parent dec262cb
...@@ -73,7 +73,7 @@ DOWNLOADER_MIDDLEWARES = { ...@@ -73,7 +73,7 @@ DOWNLOADER_MIDDLEWARES = {
# Configure item pipelines # Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = { ITEM_PIPELINES = {
# 'exa.pipelines.ExaPipeline': 300, 'exa.pipelines.ExaPipeline': 300,
} }
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
......
...@@ -89,11 +89,17 @@ class AitopSpider(BaseSpider): ...@@ -89,11 +89,17 @@ class AitopSpider(BaseSpider):
item['post_id'] = response.meta['post_id'] item['post_id'] = response.meta['post_id']
item['tags'] = i.xpath( item['tags'] = i.xpath(
".//div[@class='row hidden-xs']//div[@title='Concept Tags']//a[@class='filter btn btn-link']/text()").extract() ".//div[@class='row hidden-xs']//div[@title='Concept Tags']//a[@class='filter btn btn-link']/text()").extract()
if self.pipeline.check_url(item['url']):
is_duplicate = True
break
yield item yield item
next_url = self.next_url(response) next_url = self.next_url(response)
if next_url: if self.can_follow(next_url, is_duplicate):
yield scrapy.Request(next_url, callback=self.parse_by_tag, meta=response.meta) yield scrapy.Request(next_url, callback=self.parse_by_tag, meta=response.meta)
else:
print("DUPLICATE NEWS")
except: except:
pass pass
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment