Commit ece88c78 authored by Vasyl Bodnaruk's avatar Vasyl Bodnaruk

Scrap freshness on parse_by_tag

parent dec262cb
...@@ -73,7 +73,7 @@ DOWNLOADER_MIDDLEWARES = { ...@@ -73,7 +73,7 @@ DOWNLOADER_MIDDLEWARES = {
# Configure item pipelines # Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = { ITEM_PIPELINES = {
# 'exa.pipelines.ExaPipeline': 300, 'exa.pipelines.ExaPipeline': 300,
} }
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
......
...@@ -89,11 +89,17 @@ class AitopSpider(BaseSpider): ...@@ -89,11 +89,17 @@ class AitopSpider(BaseSpider):
item['post_id'] = response.meta['post_id'] item['post_id'] = response.meta['post_id']
item['tags'] = i.xpath( item['tags'] = i.xpath(
".//div[@class='row hidden-xs']//div[@title='Concept Tags']//a[@class='filter btn btn-link']/text()").extract() ".//div[@class='row hidden-xs']//div[@title='Concept Tags']//a[@class='filter btn btn-link']/text()").extract()
if self.pipeline.check_url(item['url']):
is_duplicate = True
break
yield item yield item
next_url = self.next_url(response) next_url = self.next_url(response)
if next_url: if self.can_follow(next_url, is_duplicate):
yield scrapy.Request(next_url, callback=self.parse_by_tag, meta=response.meta) yield scrapy.Request(next_url, callback=self.parse_by_tag, meta=response.meta)
else:
print("DUPLICATE NEWS")
except: except:
pass pass
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment