Commit 47b2e13a authored by Vasyl Bodnaruk

Add pagination and base info about a company

parent 03681fe7
# -*- coding: utf-8 -*-
import scrapy
from .base import BaseSpider
from ..items import ExaItem
class NanaSpider(BaseSpider):
    """Spider for nanalyze.com: crawls the technology/computing tag pages
    and emits one item per article tease for each tracked company."""

    name = "nana"
    allowed_domains = ["nanalyze.com"]
    # A dead duplicate assignment ('.../tag/google/') was removed here;
    # only this value ever took effect.
    start_urls = ['http://www.nanalyze.com/technology/computing/']
# Entry point: issues the initial requests, one per company registered
# under this spider's name (self.companies presumably queries the backing
# store — TODO confirm against BaseSpider).
def start_requests(self):
for i in self.companies(self.name):
# NOTE(review): the loop body is truncated here by a diff hunk marker;
# the request construction for each company is not visible in this view.
......@@ -16,8 +17,20 @@ class NanaSpider(BaseSpider):
pass
def parse(self, response):
    """Parse one article-listing page.

    Emits an ExaItem per article tease until a previously-stored URL is
    encountered, then follows the "Next" pagination link (if allowed).

    Expects ``response.meta`` to carry ``company`` and ``post_id``.
    """
    company = response.meta['company']
    is_duplicate = False
    for tease in response.xpath('.//article[@class="tease tease-post"]/div[@class="tease-content"]'):
        item = ExaItem()
        # Single XPath union returns [url, title, description] in document
        # order; assumes exactly three nodes per tease — TODO confirm the
        # markup never omits the <p> summary.
        news = tease.xpath('./h2/a/@href | ./h2/a/text() | ./p/text()').extract()
        item['url'], item['title'], item['description'] = news
        item['date'] = self.format_date(tease.xpath('./text()').extract()[1].strip())
        item.update(self.get_common_items(company))
        item['post_id'] = response.meta['post_id']
        # Check for an already-stored URL *before* yielding, so a duplicate
        # is not re-emitted (previously the item was yielded first and then
        # the loop broke, emitting the duplicate once more).
        if self.pipeline.check_url(item['url']):
            is_duplicate = True
            break
        yield item
    next_url = response.xpath('.//div[@class="pagination"]/ul//a[text()="Next"]/@href').extract_first()
    if self.can_follow(next_url, is_duplicate):
        # Propagate meta: without it the next page's parse() would raise
        # KeyError on response.meta['company'] / ['post_id'].
        yield scrapy.Request(next_url, callback=self.parse, meta=response.meta)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment