Commit 87b3f20f authored by Vasyl Bodnaruk

Add freshness news to TechCrunch spider

parent 1d35b5b6
......@@ -26,6 +26,7 @@ class TechcrunchSpider(BaseSpider):
try:
news_list = response.xpath("..//div[contains(@class, 'block block-thumb ')]")
company = response.meta['company']
is_duplicate = False
for i in news_list:
item = ExaItem()
item['date'] = i.xpath("./div/div/time/@datetime").extract_first()
......@@ -37,11 +38,15 @@ class TechcrunchSpider(BaseSpider):
item['post_id'] = response.meta['post_id']
if self.pipeline.check_url(item['url']):
is_duplicate = True
break
if item['title']:
yield scrapy.Request(item['url'], callback=self.parse_tags, meta={'item': item})
has_next = response.xpath("//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
if has_next:
if self.can_follow(has_next, is_duplicate):
next_url = 'https://techcrunch.com' + has_next + '/'
yield scrapy.Request(next_url, callback=self.parse_tag,
meta=response.meta)
......@@ -65,6 +70,7 @@ class TechcrunchSpider(BaseSpider):
item.update(self.get_common_items(company))
item['post_id'] = response.meta['post_id']
if self.pipeline.check_url(item['url']):
is_duplicate = True
break
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment