Fix searching by description and tag

52f4db11 · Vasyl Bodnaruk · 6ddd556a · 52f4db11 · 52f4db11
Commit 52f4db11 authored Jul 25, 2017 by Vasyl Bodnaruk
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 23 additions and 9 deletions

settings.py exa/exa/settings.py +1 -1

aitop.py exa/exa/spiders/aitop.py +22 -8

No files found.
--- a/exa/exa/settings.py
+++ b/exa/exa/settings.py
@@ -73,7 +73,7 @@ DOWNLOADER_MIDDLEWARES = {
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    'exa.pipelines.ExaPipeline': 300,
+    # 'exa.pipelines.ExaPipeline': 300,
 }

 # Enable and configure the AutoThrottle extension (disabled by default)

--- a/exa/exa/spiders/aitop.py
+++ b/exa/exa/spiders/aitop.py
@@ -48,8 +48,22 @@ class AitopSpider(BaseSpider):
            else:
                return False

-        items, is_duplicate = self.build_items(response)
-        for i in items:
+        rows = response.xpath(".//div[contains(@class, 'summaries')]")
+        is_duplicate = False
+        for i in rows:
+            item = ExaItem()
+            item['date'] = dateparser.parse(i.xpath("./div[@class='row']//time/@datetime").extract_first()).replace(
+                tzinfo=None)
+            item['title'] = ''.join(
+                i.xpath("./div[@class='row']//div[contains(@class, 'col-xs-12')]//h3/a//text()").extract())
+            item['description'] = ''.join(
+                i.xpath("./div[@class='row']//div[@class='summary-content']/p//text()").extract())
+            item['url'] = i.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a/@href").extract_first()
+            item.update(self.get_common_items(response.meta['company']))
+            item['post_id'] = response.meta['post_id']
+            item['tags'] = i.xpath(
+                ".//div[@class='row hidden-xs']//div[@title='Concept Tags']//a[@class='filter btn btn-link']/text()").extract()
+            yield item
            if is_company_in_item(i):
                yield i
        next_url = self.next_url(response)
@@ -63,11 +77,13 @@ class AitopSpider(BaseSpider):
            rows = response.xpath(".//div[contains(@class, 'summaries')]")
            is_duplicate = False
            for i in rows:
-                # i = r.xpath("//div[@class='row']")
                item = ExaItem()
-                item['date'] = dateparser.parse(i.xpath("./div[@class='row']//time/@datetime").extract_first()).replace(tzinfo=None)
-                item['title'] = i.xpath("./div[@class='row']//div[contains(@class, 'col-xs-12')]//h3/a//text()").extract()
-                item['description'] = i.xpath("./div[@class='row']//div[@class='summary-content']/p//text()").extract()
+                item['date'] = dateparser.parse(i.xpath("./div[@class='row']//time/@datetime").extract_first()).replace(
+                    tzinfo=None)
+                item['title'] = ''.join(
+                    i.xpath("./div[@class='row']//div[contains(@class, 'col-xs-12')]//h3/a//text()").extract())
+                item['description'] = ''.join(
+                    i.xpath("./div[@class='row']//div[@class='summary-content']/p//text()").extract())
                item['url'] = i.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a/@href").extract_first()
                item.update(self.get_common_items(response.meta['company']))
                item['post_id'] = response.meta['post_id']
@@ -89,5 +105,3 @@ class AitopSpider(BaseSpider):
            return 'https://aitopics.org/search' + has_next
        else:
            return None
-
-