Refactor parse by tag

b83e565f · Vasyl Bodnaruk · c5d3c31d · b83e565f · b83e565f
Commit b83e565f authored Jul 25, 2017 by Vasyl Bodnaruk
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 28 additions and 21 deletions

aitop.py exa/exa/spiders/aitop.py +27 -20

base.py exa/exa/spiders/base.py +1 -1

No files found.
--- a/exa/exa/spiders/aitop.py
+++ b/exa/exa/spiders/aitop.py
@@ -77,26 +77,9 @@ class AitopSpider(BaseSpider):

    def parse_by_tag(self, response):
        try:
-            rows = response.xpath(".//div[contains(@class, 'summaries')]")
-            is_duplicate = False
-            for i in rows:
-                item = ExaItem()
-                item['date'] = dateparser.parse(i.xpath("./div[@class='row']//time/@datetime").extract_first()).replace(
-                    tzinfo=None)
-                item['title'] = ''.join(
-                    i.xpath("./div[@class='row']//div[contains(@class, 'col-xs-12')]//h3/a//text()").extract())
-                item['description'] = ''.join(
-                    i.xpath("./div[@class='row']//div[@class='summary-content']/p//text()").extract())
-                item['url'] = i.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a/@href").extract_first()
-                item.update(self.get_common_items(response.meta['company']))
-                item['post_id'] = response.meta['post_id']
-                item['tags'] = i.xpath(
-                    ".//div[@class='row hidden-xs']//div[@title='Concept Tags']//a[@class='filter btn btn-link']/text()").extract()
-
-                if self.pipeline.check_url(item['url']):
-                    is_duplicate = True
-                    break
-                yield item
+            items, is_duplicate = self.build_items(response)
+            for i in items:
+                yield i

            next_url = self.next_url(response)
            if self.can_follow(next_url, is_duplicate):
@@ -113,3 +96,27 @@ class AitopSpider(BaseSpider):
            return 'https://aitopics.org/search' + has_next
        else:
            return None
+
+    def build_items(self, response):
+        rows = response.xpath(".//div[contains(@class, 'summaries')]")
+        is_duplicate = False
+        items = list()
+        for i in rows:
+            item = ExaItem()
+            item['date'] = dateparser.parse(i.xpath("./div[@class='row']//time/@datetime").extract_first()).replace(
+                tzinfo=None)
+            item['title'] = ''.join(
+                i.xpath("./div[@class='row']//div[contains(@class, 'col-xs-12')]//h3/a//text()").extract())
+            item['description'] = ''.join(
+                i.xpath("./div[@class='row']//div[@class='summary-content']/p//text()").extract())
+            item['url'] = i.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a/@href").extract_first()
+            item.update(self.get_common_items(response.meta['company']))
+            item['post_id'] = response.meta['post_id']
+            item['tags'] = i.xpath(
+                ".//div[@class='row hidden-xs']//div[@title='Concept Tags']//a[@class='filter btn btn-link']/text()").extract()
+
+            if self.pipeline.check_url(item['url']):
+                is_duplicate = True
+                break
+            items.append(item)
+        return items, is_duplicate
--- a/exa/exa/spiders/base.py
+++ b/exa/exa/spiders/base.py
@@ -10,7 +10,7 @@ class BaseSpider(scrapy.Spider):
    def __init__(self, query=None, *args, **kwargs):
        super(BaseSpider, self).__init__(*args, **kwargs)
        self.condition = query
-        self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=17"
+        self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=3"
        if self.condition:
            print(self.condition)
            self.query += ' or {}'.format(self.condition)