Add pagination and basic info about a company

47b2e13a · Vasyl Bodnaruk · 03681fe7 · 47b2e13a
Commit 47b2e13a authored Oct 16, 2017 by Vasyl Bodnaruk
Hide whitespace changes
Inline Side-by-side

Showing with 17 additions and 4 deletions

nana.py exa/exa/spiders/nana.py +17 -4

No files found.
--- a/exa/exa/spiders/nana.py
+++ b/exa/exa/spiders/nana.py
 # -*- coding: utf-8 -*-
 import scrapy
 from .base import BaseSpider
+from ..items import ExaItem


 class NanaSpider(BaseSpider):
    name = "nana"
    allowed_domains = ["nanalyze.com"]
-    start_urls = ['http://www.nanalyze.com/tag/google/']
+    start_urls = ['http://www.nanalyze.com/technology/computing/']

    def start_requests(self):
        for i in self.companies(self.name):
@@ -16,8 +17,20 @@ class NanaSpider(BaseSpider):
                pass

    def parse(self, response):
+        company = response.meta['company']
+        is_duplicate = False
        for i in response.xpath('.//article[@class="tease tease-post"]/div[@class="tease-content"]'):
+            item = ExaItem()
            # url, title, description
-            d = i.xpath('./h2/a/@href | ./h2/a/text() | ./p/text()').extract()
-            date = self.format_date(i.xpath('./text()').extract()[1].strip())
-            print(date)
\ No newline at end of file
+            news = i.xpath('./h2/a/@href | ./h2/a/text() | ./p/text()').extract()
+            item['url'], item['title'], item['description'] = news
+            item['date'] = self.format_date(i.xpath('./text()').extract()[1].strip())
+            item.update(self.get_common_items(company))
+            item['post_id'] = response.meta['post_id']
+            yield item
+            if self.pipeline.check_url(item['url']):
+                is_duplicate = True
+                break
+        next_url = response.xpath('.//div[@class="pagination"]/ul//a[text()="Next"]/@href').extract_first()
+        if self.can_follow(next_url, is_duplicate):
+            yield scrapy.Request(next_url, callback=self.parse)
\ No newline at end of file