Refactor. Add functionality for code reuse. Fix a bug that occurs when only search is used for scraping

6112944d · Vasyl Bodnaruk · c3d696f4 · 6112944d
Commit 6112944d authored Jul 06, 2017 by Vasyl Bodnaruk
Hide whitespace changes
Inline Side-by-side

Showing with 29 additions and 19 deletions

aitop.py exa/exa/spiders/aitop.py +29 -19

No files found.
--- a/exa/exa/spiders/aitop.py
+++ b/exa/exa/spiders/aitop.py
@@ -37,32 +37,42 @@ class AitopSpider(scrapy.Spider):
                return True
            else:
                return False
-        rows = response.xpath(".//div[contains(@class, 'summaries')]//div[@class='row']")
-        for i in rows:
-            item = dict()
-            item['date'] = dateparser.parse(i.xpath(".//time/@datetime").extract_first()).replace(tzinfo=None)
-            item['title'] = i.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a/text()").extract_first()
-            item['description'] = i.xpath(".//div[@class='summary-content']/p/text()").extract_first()
-            item['url'] = i.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a/@href").extract_first()
-            # print(item)
-        has_next = response.xpath(".//ul[@class='pagination']//li/a/@href").extract()[-2]
-        if has_next and has_next != '#':
-            next_url = 'https://aitopics.org/search' + has_next
-            return scrapy.Request(next_url, callback=self.parse_by_tag)
+
+        for i in self.build_items(response):
+            # if is_company_in_item(i):
+                print(i)
+        next_url = self.next_url(response)
+        # if next_url:
+            # return scrapy.Request(next_url, callback=self.parse_by_title_description)

    def parse_by_tag(self, response):
        try:
+            for i in self.build_items(response):
+                print(i)
+            next_url = self.next_url(response)
+            if next_url:
+                return scrapy.Request(next_url, callback=self.parse_by_tag)
+        except:
+            pass
+
+    def build_items(self, response):
+        try:
+            items = list()
            rows = response.xpath(".//div[contains(@class, 'summaries')]//div[@class='row']")
            for i in rows:
                item = dict()
                item['date'] = dateparser.parse(i.xpath(".//time/@datetime").extract_first()).replace(tzinfo=None)
-                item['title'] = i.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a/text()").extract_first()
-                item['description'] = i.xpath(".//div[@class='summary-content']/p/text()").extract_first()
+                item['title'] = ''.join(i.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a//text()").extract())
+                item['description'] = ''.join(i.xpath(".//div[@class='summary-content']/p/text()").extract())
                item['url'] = i.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a/@href").extract_first()
-                # print(item)
-            has_next = response.xpath(".//ul[@class='pagination']//li/a/@href").extract()[-2]
-            if has_next and has_next != '#':
-                next_url = 'https://aitopics.org/search' + has_next
-                return scrapy.Request(next_url, callback=self.parse_by_tag)
+                items.append(item)
+            return items
        except:
            pass
+
+    def next_url(self, response):
+        has_next = response.xpath(".//ul[@class='pagination']//li/a/@href").extract()[-2]
+        if has_next and has_next != '#':
+            return 'https://aitopics.org/search' + has_next
+        else:
+            return None
\ No newline at end of file