Commit d8450ff8 authored by Vasyl Bodnaruk

Add function for following pagination

parent b6418a72
...@@ -22,15 +22,19 @@ class CbSpider(BaseSpider): ...@@ -22,15 +22,19 @@ class CbSpider(BaseSpider):
def parse(self, response):
    """Turn each table row of the page into an ExaItem and follow pagination.

    The first ``<tr>`` of the table is skipped. For every remaining row an
    ``ExaItem`` is filled with the row's date, title and URL, merged with the
    values returned by ``self.get_common_items`` for the company found in
    ``response.meta['company']``, given a ``media_id`` from
    ``self._get_media``, and printed.

    If the page contained at least one such row, a request for the URL
    produced by ``self._next_url`` is yielded, with this method as the
    callback and the same ``meta`` passed along.
    """
    data_rows = response.xpath("//table/tr")[1:]
    company = response.meta['company']

    for row in data_rows:
        item = ExaItem()
        item['date'] = row.xpath("./td[contains(@class, 'date')]/text()").extract_first()
        item['title'] = row.xpath("./td/a/text()").extract_first()
        item['url'] = row.xpath("./td/a/@href").extract_first()
        # Merged after the row fields, so common values win on any key clash.
        item.update(self.get_common_items(company))
        item['media_id'] = self._get_media(row)
        # NOTE(review): items are only printed here, not yielded — confirm
        # whether that is intentional.
        print(item)

    # A page without data rows ends the crawl of this listing.
    if data_rows:
        following_url = self._next_url(response.url)
        yield scrapy.Request(following_url, callback=self.parse, meta=response.meta)
def _get_media(self, elem): def _get_media(self, elem):
media_name = elem.xpath("./td[contains(@class, 'article')]/span/text()").extract_first() media_name = elem.xpath("./td[contains(@class, 'article')]/span/text()").extract_first()
media_url = elem.xpath("./td/a/@data_publisher").extract_first() media_url = elem.xpath("./td/a/@data_publisher").extract_first()
...@@ -41,3 +45,8 @@ class CbSpider(BaseSpider): ...@@ -41,3 +45,8 @@ class CbSpider(BaseSpider):
else: else:
media = media[0][0] media = media[0][0]
return media return media
def _next_url(self, url):
pos = url.rfind('=') + 1
next_page = int(url[pos:]) + 1
return url[:pos] + str(next_page)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment