Refactor parse by tag

b83e565f · Vasyl Bodnaruk · c5d3c31d · b83e565f · b83e565f
Commit b83e565f authored Jul 25, 2017 by Vasyl Bodnaruk
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 28 additions and 21 deletions

aitop.py exa/exa/spiders/aitop.py +27 -20

base.py exa/exa/spiders/base.py +1 -1

No files found.
--- a/exa/exa/spiders/aitop.py
+++ b/exa/exa/spiders/aitop.py
@@ -77,8 +77,30 @@ class AitopSpider(BaseSpider):

    def parse_by_tag(self, response):
        try:
+            items, is_duplicate = self.build_items(response)
+            for i in items:
+                yield i
+
+            next_url = self.next_url(response)
+            if self.can_follow(next_url, is_duplicate):
+                yield scrapy.Request(next_url, callback=self.parse_by_tag, meta=response.meta)
+            else:
+                print("DUPLICATE NEWS")
+
+        except:
+            pass
+
+    def next_url(self, response):
+        has_next = response.xpath(".//ul[@class='pagination']//li/a/@href").extract()[-2]
+        if has_next and has_next != '#':
+            return 'https://aitopics.org/search' + has_next
+        else:
+            return None
+
+    def build_items(self, response):
        rows = response.xpath(".//div[contains(@class, 'summaries')]")
        is_duplicate = False
+        items = list()
        for i in rows:
            item = ExaItem()
            item['date'] = dateparser.parse(i.xpath("./div[@class='row']//time/@datetime").extract_first()).replace(
@@ -96,20 +118,5 @@ class AitopSpider(BaseSpider):
            if self.pipeline.check_url(item['url']):
                is_duplicate = True
                break
-                yield item
-
-            next_url = self.next_url(response)
-            if self.can_follow(next_url, is_duplicate):
-                yield scrapy.Request(next_url, callback=self.parse_by_tag, meta=response.meta)
-            else:
-                print("DUPLICATE NEWS")
-
-        except:
-            pass
-
-    def next_url(self, response):
-        has_next = response.xpath(".//ul[@class='pagination']//li/a/@href").extract()[-2]
-        if has_next and has_next != '#':
-            return 'https://aitopics.org/search' + has_next
-        else:
-            return None
+            items.append(item)
+        return items, is_duplicate
--- a/exa/exa/spiders/base.py
+++ b/exa/exa/spiders/base.py
@@ -10,7 +10,7 @@ class BaseSpider(scrapy.Spider):
    def __init__(self, query=None, *args, **kwargs):
        super(BaseSpider, self).__init__(*args, **kwargs)
        self.condition = query
-        self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=17"
+        self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=3"
        if self.condition:
            print(self.condition)
            self.query += ' or {}'.format(self.condition)