add pagination handler

9655cc2e · Vasyl Bodnaruk · 293b7a26 · 9655cc2e · 293b7a26 · 9655cc2e
Commit 9655cc2e authored Jun 09, 2017 by Vasyl Bodnaruk
Hide whitespace changes
Inline Side-by-side

Showing with 29 additions and 25 deletions

__init__.py exa/exa/helpers/__init__.py +1 -1

decorators.py exa/exa/helpers/decorators.py +0 -0

techcrunch.py exa/exa/spiders/techcrunch.py +28 -24

No files found.
--- a/exa/exa/helpers/__init__.py
+++ b/exa/exa/helpers/__init__.py
 from .company_maker import CompanyMaker
-from .db import Database
\ No newline at end of file
+from .db import Database
--- a/exa/exa/helpers/decorators.py
+++ b/exa/exa/helpers/decorators.py
--- a/exa/exa/spiders/techcrunch.py
+++ b/exa/exa/spiders/techcrunch.py
 # -*- coding: utf-8 -*-
 import scrapy
+import traceback
 from scrapy.utils.project import get_project_settings
 from ..helpers import CompanyMaker, Database
 from ..items import ExaItem
@@ -27,29 +28,32 @@ class TechcrunchSpider(scrapy.Spider):
        companies = CompanyMaker(self.comp)
        companies.make_companies(self.name)
        for i in companies.get_companies():
-            yield scrapy.Request(i.url, callback=self.parse, meta={'company': i,
-                                                                   'post_id': 0})
+            yield scrapy.Request(i.url, callback=self.parse, meta={'company': i, 'post_id': 0})

    def parse(self, response):
-        news_list = response.xpath("..//div[contains(@class, 'block block-thumb ')]")
-        company = response.meta['company']
-        for i in news_list:
-            item = ExaItem()
-            item['date'] = i.xpath("./div/div/time/@datetime").extract_first()
-            item['title'] = i.xpath("./div/h2/a/text()").extract_first()
-            item['description'] = i.xpath("./div/p//text()").extract_first()
-            item['url'] = i.xpath("./div/h2/a/@href").extract_first()
-
-            item['region_id'] = company.region_id
-            item['type_id'] = company.type_id
-            item['media_id'] = company.media_id
-            item['company_id'] = company.id
-
-            item['post_id'] = response.meta['post_id']
-
-            # yield item
-
-        has_next = response.xpath("//div[contains(@class, 'pagination-container')]//li[contains(@class, 'next')]/a/@href").extract_first()
-        next_url = 'https://techcrunch.com' + has_next
-        if has_next:
-            yield scrapy.Request(next_url, callback=self.parse, meta={'company': response.meta['company'], 'post_id': 0})
+        try:
+            news_list = response.xpath("..//div[contains(@class, 'block block-thumb ')]")
+            company = response.meta['company']
+            for i in news_list:
+                item = ExaItem()
+                item['date'] = i.xpath("./div/div/time/@datetime").extract_first()
+                item['title'] = i.xpath("./div/h2/a/text()").extract_first()
+                item['description'] = i.xpath("./div/p//text()").extract_first()
+                item['url'] = i.xpath("./div/h2/a/@href").extract_first()
+
+                item['region_id'] = company.region_id
+                item['type_id'] = company.type_id
+                item['media_id'] = company.media_id
+                item['company_id'] = company.id
+
+                item['post_id'] = response.meta['post_id']
+
+                # yield item
+
+            has_next = response.xpath("//div[contains(@class, 'pagination-container')]//li[contains(@class, 'next')]/a/@href").extract_first()
+            next_url = 'https://techcrunch.com' + has_next
+            if has_next:
+                yield scrapy.Request(next_url, callback=self.parse, meta={'company': response.meta['company'], 'post_id': 0})
+        except BaseException as e:
+            print('We had error')
+            traceback.print_exc()
\ No newline at end of file