Commit 9655cc2e authored by Vasyl Bodnaruk

add pagination handler

parent 293b7a26
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import scrapy import scrapy
import traceback
from scrapy.utils.project import get_project_settings from scrapy.utils.project import get_project_settings
from ..helpers import CompanyMaker, Database from ..helpers import CompanyMaker, Database
from ..items import ExaItem from ..items import ExaItem
...@@ -27,10 +28,10 @@ class TechcrunchSpider(scrapy.Spider): ...@@ -27,10 +28,10 @@ class TechcrunchSpider(scrapy.Spider):
companies = CompanyMaker(self.comp) companies = CompanyMaker(self.comp)
companies.make_companies(self.name) companies.make_companies(self.name)
for i in companies.get_companies(): for i in companies.get_companies():
yield scrapy.Request(i.url, callback=self.parse, meta={'company': i, yield scrapy.Request(i.url, callback=self.parse, meta={'company': i, 'post_id': 0})
'post_id': 0})
def parse(self, response): def parse(self, response):
try:
news_list = response.xpath("..//div[contains(@class, 'block block-thumb ')]") news_list = response.xpath("..//div[contains(@class, 'block block-thumb ')]")
company = response.meta['company'] company = response.meta['company']
for i in news_list: for i in news_list:
...@@ -53,3 +54,6 @@ class TechcrunchSpider(scrapy.Spider): ...@@ -53,3 +54,6 @@ class TechcrunchSpider(scrapy.Spider):
next_url = 'https://techcrunch.com' + has_next next_url = 'https://techcrunch.com' + has_next
if has_next: if has_next:
yield scrapy.Request(next_url, callback=self.parse, meta={'company': response.meta['company'], 'post_id': 0}) yield scrapy.Request(next_url, callback=self.parse, meta={'company': response.meta['company'], 'post_id': 0})
except BaseException as e:
print('We had error')
traceback.print_exc()
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment