Commit 9655cc2e authored by Vasyl Bodnaruk

add pagination handler

parent 293b7a26
from .company_maker import CompanyMaker
from .db import Database
\ No newline at end of file
from .db import Database
# -*- coding: utf-8 -*-
import scrapy
import traceback
from scrapy.utils.project import get_project_settings
from ..helpers import CompanyMaker, Database
from ..items import ExaItem
......@@ -27,29 +28,32 @@ class TechcrunchSpider(scrapy.Spider):
companies = CompanyMaker(self.comp)
companies.make_companies(self.name)
for i in companies.get_companies():
yield scrapy.Request(i.url, callback=self.parse, meta={'company': i,
'post_id': 0})
yield scrapy.Request(i.url, callback=self.parse, meta={'company': i, 'post_id': 0})
def parse(self, response):
    """Scrape one listing page and follow its "next page" link.

    For every news block found on the page an ``ExaItem`` is populated
    from the block's markup and from the company object carried in
    ``response.meta['company']``.  If the pagination container exposes a
    "next" link, a follow-up request for that page is yielded with this
    same method as its callback.

    Any ordinary exception raised while processing the page is reported
    on stdout together with its traceback and then suppressed, so a
    single bad page does not propagate an error out of the callback.

    :param response: the downloaded listing page; its ``meta`` must
        contain the keys ``'company'`` and ``'post_id'``.
    :returns: a generator of ``scrapy.Request`` objects (at most one per
        page, for the next listing page).
    """
    try:
        news_list = response.xpath("..//div[contains(@class, 'block block-thumb ')]")
        company = response.meta['company']
        for block in news_list:
            item = ExaItem()
            item['date'] = block.xpath("./div/div/time/@datetime").extract_first()
            item['title'] = block.xpath("./div/h2/a/text()").extract_first()
            item['description'] = block.xpath("./div/p//text()").extract_first()
            item['url'] = block.xpath("./div/h2/a/@href").extract_first()
            item['region_id'] = company.region_id
            item['type_id'] = company.type_id
            item['media_id'] = company.media_id
            item['company_id'] = company.id
            item['post_id'] = response.meta['post_id']
            # NOTE(review): items are built but never emitted; the yield
            # below is disabled in the original -- confirm whether that
            # is intentional before re-enabling it.
            # yield item

        has_next = response.xpath("//div[contains(@class, 'pagination-container')]//li[contains(@class, 'next')]/a/@href").extract_first()
        # extract_first() returns None when there is no "next" link (last
        # page), so the URL must only be built after the check; building
        # it first raises TypeError on str + None.
        if has_next:
            next_url = 'https://techcrunch.com' + has_next
            yield scrapy.Request(next_url, callback=self.parse, meta={'company': company, 'post_id': 0})
    except Exception:
        # Exception rather than BaseException: GeneratorExit (raised when
        # the generator is closed) and KeyboardInterrupt must not be
        # reported as scraping errors or swallowed.
        print('We had error')
        traceback.print_exc()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment