Commit 7f6e7b0e authored by Vasyl Bodnaruk

Add spider for Crunchbase

parent 4bfd5fad
# -*- coding: utf-8 -*-
import scrapy
from .base import BaseSpider
from ..items import ExaItem
class CbSpider(BaseSpider):
    """Spider for the press listing of a Crunchbase organization page.

    Starts from the ``/organization/sense-ly/press/`` page and turns each
    row of the table found there into an ``ExaItem`` carrying ``date``,
    ``title`` and ``url`` fields.
    """

    name = "cb"
    allowed_domains = ["www.crunchbase.com"]
    start_urls = ['http://www.crunchbase.com/organization/sense-ly/press/']

    def parse(self, response):
        """Yield one ``ExaItem`` per table row of the response.

        Args:
            response: The downloaded page; queried with XPath.

        Yields:
            ExaItem: An item whose ``date``, ``title`` and ``url`` fields
            hold the first matching value inside the row, or ``None`` when
            the row has no matching node.
        """
        # The first row is skipped -- presumably the table header; confirm
        # against the live page markup.
        rows = response.xpath("//table/tr")[1:]
        for row in rows:
            item = ExaItem()
            # The leading "." keeps each query relative to the current row.
            # A bare "//" is absolute even on a nested selector, which made
            # every item repeat the values of the first row in the document.
            item['date'] = row.xpath(
                ".//td[contains(@class, 'date')]/text()"
            ).extract_first()
            item['title'] = row.xpath(".//td/a/text()").extract_first()
            item['url'] = row.xpath(".//td/a/@href").extract_first()
            # Yield the item so it reaches the item pipelines / feed exports
            # instead of only being printed and discarded.
            yield item
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment