Add news freshness check to TechCrunch spider

87b3f20f · Vasyl Bodnaruk · 1d35b5b6 · 87b3f20f
Commit 87b3f20f authored Jul 24, 2017 by Vasyl Bodnaruk
Hide whitespace changes
Inline Side-by-side

Showing with 7 additions and 1 deletion

techcrunch.py exa/exa/spiders/techcrunch.py +7 -1

No files found.
--- a/exa/exa/spiders/techcrunch.py
+++ b/exa/exa/spiders/techcrunch.py
@@ -26,6 +26,7 @@ class TechcrunchSpider(BaseSpider):
        try:
            news_list = response.xpath("..//div[contains(@class, 'block block-thumb ')]")
            company = response.meta['company']
+            is_duplicate = False
            for i in news_list:
                item = ExaItem()
                item['date'] = i.xpath("./div/div/time/@datetime").extract_first()
@@ -37,11 +38,15 @@ class TechcrunchSpider(BaseSpider):

                item['post_id'] = response.meta['post_id']

+                if self.pipeline.check_url(item['url']):
+                    is_duplicate = True
+                    break
+
                if item['title']:
                    yield scrapy.Request(item['url'], callback=self.parse_tags, meta={'item': item})

            has_next = response.xpath("//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
-            if has_next:
+            if self.can_follow(has_next, is_duplicate):
                next_url = 'https://techcrunch.com' + has_next + '/'
                yield scrapy.Request(next_url, callback=self.parse_tag,
                                     meta=response.meta)
@@ -65,6 +70,7 @@ class TechcrunchSpider(BaseSpider):
                item.update(self.get_common_items(company))

                item['post_id'] = response.meta['post_id']
+
                if self.pipeline.check_url(item['url']):
                    is_duplicate = True
                    break