Commit 1d35b5b6 authored by Vasyl Bodnaruk

Delete debug message

parent 5d7b9c3f
...@@ -44,7 +44,6 @@ class MobiHealthNewsSpider(BaseSpider): ...@@ -44,7 +44,6 @@ class MobiHealthNewsSpider(BaseSpider):
"..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/@href").extract_first() "..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/@href").extract_first()
print("COUNT", is_duplicate) print("COUNT", is_duplicate)
if self.can_follow(has_next, is_duplicate): if self.can_follow(has_next, is_duplicate):
print('yepta')
yield scrapy.Request(next_url, callback=self.parse, meta=response.meta) yield scrapy.Request(next_url, callback=self.parse, meta=response.meta)
else: else:
print("DUPLICATE NEWS") print("DUPLICATE NEWS")
......
...@@ -53,6 +53,7 @@ class TechcrunchSpider(BaseSpider): ...@@ -53,6 +53,7 @@ class TechcrunchSpider(BaseSpider):
try: try:
company = response.meta['company'] company = response.meta['company']
news_list = response.xpath(".//div[contains(@class, 'block-content-topic')]") news_list = response.xpath(".//div[contains(@class, 'block-content-topic')]")
is_duplicate = False
for i in news_list: for i in news_list:
item = ExaItem() item = ExaItem()
item['date'] = i.xpath("./div/time/@datetime").extract_first() item['date'] = i.xpath("./div/time/@datetime").extract_first()
...@@ -64,14 +65,19 @@ class TechcrunchSpider(BaseSpider): ...@@ -64,14 +65,19 @@ class TechcrunchSpider(BaseSpider):
item.update(self.get_common_items(company)) item.update(self.get_common_items(company))
item['post_id'] = response.meta['post_id'] item['post_id'] = response.meta['post_id']
if self.pipeline.check_url(item['url']):
is_duplicate = True
break
if 'Crunch Report' not in item['title']: if 'Crunch Report' not in item['title']:
yield scrapy.Request(item['url'], callback=self.parse_tags, meta={'item': item}) yield scrapy.Request(item['url'], callback=self.parse_tags, meta={'item': item})
has_next = response.xpath( has_next = response.xpath(
"//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first() "//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
if has_next: if self.can_follow(has_next, is_duplicate):
yield scrapy.Request(has_next + '/', callback=self.parse_company, yield scrapy.Request(has_next + '/', callback=self.parse, meta=response.meta)
meta=response.meta) else:
print("DUPLICATE NEWS")
except BaseException as e: except BaseException as e:
print('We had error') print('We had error')
traceback.print_exc() traceback.print_exc()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment