Commit 802fee77 authored by Vasyl Bodnaruk

Stop parsing news when the first duplicate is found

parent 98c9b2f3
...@@ -35,11 +35,9 @@ class MobiHealthNewsSpider(BaseSpider): ...@@ -35,11 +35,9 @@ class MobiHealthNewsSpider(BaseSpider):
if self.pipeline.check_url(item['url']): if self.pipeline.check_url(item['url']):
is_duplicate = True is_duplicate = True
if not is_duplicate: break
yield scrapy.Request(item['url'], callback=self.parse_tags, meta={'item': item})
else: else:
# print(item) yield scrapy.Request(item['url'], callback=self.parse_tags, meta={'item': item})
pass
has_next = response.xpath( has_next = response.xpath(
"..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/text()").extract_first() "..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/text()").extract_first()
next_url = 'http://www.mobihealthnews.com' + response.xpath( next_url = 'http://www.mobihealthnews.com' + response.xpath(
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment