Commit 802fee77 authored by Vasyl Bodnaruk

Stop parsing news when the first duplicate is found

parent 98c9b2f3
......@@ -35,11 +35,9 @@ class MobiHealthNewsSpider(BaseSpider):
if self.pipeline.check_url(item['url']):
is_duplicate = True
if not is_duplicate:
yield scrapy.Request(item['url'], callback=self.parse_tags, meta={'item': item})
break
else:
# print(item)
pass
yield scrapy.Request(item['url'], callback=self.parse_tags, meta={'item': item})
has_next = response.xpath(
"..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/text()").extract_first()
next_url = 'http://www.mobihealthnews.com' + response.xpath(
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment