Delete debug message

1d35b5b6 · Vasyl Bodnaruk · 5d7b9c3f · 1d35b5b6 · 1d35b5b6
Commit 1d35b5b6 authored Jul 24, 2017 by Vasyl Bodnaruk
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 9 additions and 4 deletions

mobihealthnews.py exa/exa/spiders/mobihealthnews.py +0 -1

techcrunch.py exa/exa/spiders/techcrunch.py +9 -3

No files found.
--- a/exa/exa/spiders/mobihealthnews.py
+++ b/exa/exa/spiders/mobihealthnews.py
@@ -44,7 +44,6 @@ class MobiHealthNewsSpider(BaseSpider):
                "..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/@href").extract_first()
            print("COUNT", is_duplicate)
            if self.can_follow(has_next, is_duplicate):
-                print('yepta')
                yield scrapy.Request(next_url, callback=self.parse, meta=response.meta)
            else:
                print("DUPLICATE NEWS")

--- a/exa/exa/spiders/techcrunch.py
+++ b/exa/exa/spiders/techcrunch.py
@@ -53,6 +53,7 @@ class TechcrunchSpider(BaseSpider):
        try:
            company = response.meta['company']
            news_list = response.xpath(".//div[contains(@class, 'block-content-topic')]")
+            is_duplicate = False
            for i in news_list:
                item = ExaItem()
                item['date'] = i.xpath("./div/time/@datetime").extract_first()
@@ -64,14 +65,19 @@ class TechcrunchSpider(BaseSpider):
                item.update(self.get_common_items(company))
                item['post_id'] = response.meta['post_id']
+                if self.pipeline.check_url(item['url']):
+                    is_duplicate = True
+                    break
                if 'Crunch Report' not in item['title']:
                    yield scrapy.Request(item['url'], callback=self.parse_tags, meta={'item': item})
            has_next = response.xpath(
                "//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
-            if has_next:
+            if self.can_follow(has_next, is_duplicate):
-                yield scrapy.Request(has_next + '/', callback=self.parse_company,
+                yield scrapy.Request(has_next + '/', callback=self.parse, meta=response.meta)
-                                     meta=response.meta)
+            else:
+                print("DUPLICATE NEWS")
        except BaseException as e:
            print('We had error')
            traceback.print_exc()