Refactor parse by title

16590662 · Vasyl Bodnaruk · b83e565f · 16590662 · 16590662
Commit 16590662 authored Jul 25, 2017 by Vasyl Bodnaruk
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 5 additions and 21 deletions

aitop.py exa/exa/spiders/aitop.py +4 -20

base.py exa/exa/spiders/base.py +1 -1

No files found.
--- a/exa/exa/spiders/aitop.py
+++ b/exa/exa/spiders/aitop.py
@@ -48,26 +48,10 @@ class AitopSpider(BaseSpider):
            else:
                return False

-        rows = response.xpath(".//div[contains(@class, 'summaries')]")
-        is_duplicate = False
-        for i in rows:
-            item = ExaItem()
-            item['date'] = dateparser.parse(i.xpath("./div[@class='row']//time/@datetime").extract_first()).replace(
-                tzinfo=None)
-            item['title'] = ''.join(
-                i.xpath("./div[@class='row']//div[contains(@class, 'col-xs-12')]//h3/a//text()").extract())
-            item['description'] = ''.join(
-                i.xpath("./div[@class='row']//div[@class='summary-content']/p//text()").extract())
-            item['url'] = i.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a/@href").extract_first()
-            item.update(self.get_common_items(response.meta['company']))
-            item['post_id'] = response.meta['post_id']
-            item['tags'] = i.xpath(
-                ".//div[@class='row hidden-xs']//div[@title='Concept Tags']//a[@class='filter btn btn-link']/text()").extract()
-            if is_company_in_item(item):
-                if self.pipeline.check_url(item['url']):
-                    is_duplicate = True
-                    break
-                yield item
+        items, is_duplicate = self.build_items(response)
+        for i in items:
+            if is_company_in_item(i):
+                yield i

        next_url = self.next_url(response)
        if self.can_follow(next_url, is_duplicate):

--- a/exa/exa/spiders/base.py
+++ b/exa/exa/spiders/base.py
@@ -10,7 +10,7 @@ class BaseSpider(scrapy.Spider):
    def __init__(self, query=None, *args, **kwargs):
        super(BaseSpider, self).__init__(*args, **kwargs)
        self.condition = query
-        self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=3"
+        self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=17"
        if self.condition:
            print(self.condition)
            self.query += ' or {}'.format(self.condition)