Commit 03681fe7 authored by Andrii Marynets's avatar Andrii Marynets

Add splash to crawler

parent 94e4c2e3
......@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 1
SPLASH_URL = 'http://127.0.0.1:8050'
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
......@@ -54,20 +54,20 @@ DEFAULT_REQUEST_HEADERS = {
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'exa.middlewares.ExaSpiderMiddleware': 543,
# }
SPIDER_MIDDLEWARES = {
    # Required by scrapy-splash whenever requests use ``cache_args``
    # (the cb spider caches ``lua_source``): it de-duplicates large
    # Splash arguments so they are not serialized with every request.
    # Leaving this commented out breaks ``cache_args`` handling.
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
# Splash-aware request fingerprinting so a Splash-rendered request is
# not collapsed with (or shadowed by) its plain-HTTP counterpart.
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# Downloader middleware chain: Splash for JS rendering (cookies kept in
# sync at 723/725 per scrapy-splash docs), HTTP compression re-enabled
# after Splash, random User-Agent rotation replacing the stock
# UserAgentMiddleware, retries, and Crawlera as the outgoing proxy.
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    # Disabled: scrapy_fake_useragent supplies the User-Agent instead.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
    # 'exa.middlewares.SeleniumDownloadMiddleware': 543,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
    # 'scrapy_proxies.RandomProxy': 100,
    # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
    # NOTE(review): the diff left this key twice (old priority 610 and
    # new 750, with a missing comma between them). Keep a single entry
    # with the post-commit priority.
    'scrapy_crawlera.CrawleraMiddleware': 750,
}
# Enable or disable extensions
......
# -*- coding: utf-8 -*-
import dateparser
import scrapy
from scrapy.utils.project import get_project_settings
from scrapy_splash import SplashRequest
from w3lib.http import basic_auth_header
from pkgutil import get_data
from .base import BaseSpider
from ..items import ExaItem
api_key = get_project_settings().get('CRAWLERA_APIKEY')
class CbSpider(BaseSpider):
name = "cb"
allowed_domains = ["www.crunchbase.com"]
crawlera_enabled = True
crawlera_apikey = get_project_settings().get('CRAWLERA_APIKEY')
# crawlera_enabled = True
# crawlera_apikey = api_key
# start_urls = ['http://www.crunchbase.com/organization/sense-ly/press/']
co = 0
def __init__(self, *args, **kwargs):
    # Load the Crawlera session Lua script bundled inside the ``exa``
    # package; every SplashRequest passes it to Splash as
    # ``lua_source`` (and caches it via ``cache_args``).
    raw_script = get_data('exa', 'scripts/crawlera.lua')
    self.LUA_SOURCE = raw_script.decode('utf-8')
    super(CbSpider, self).__init__(*args, **kwargs)
def start_requests(self):
    """Schedule one Splash-rendered request per tracked company.

    Each request runs the bundled Crawlera Lua script inside Splash so
    the page is fetched through the proxy and JavaScript is rendered
    before ``parse`` receives the response.
    """
    for company in self.companies(self.name):
        try:
            # The old plain ``scrapy.Request`` yield (pre-Splash) was
            # left in by the diff alongside this one, causing every
            # URL to be requested twice; only the SplashRequest is
            # kept.
            yield SplashRequest(
                url=company.url,
                callback=self.parse,
                meta={'company': company, 'post_id': 0},
                args={
                    'wait': 5,
                    'lua_source': self.LUA_SOURCE,
                    'crawlera_user': self.settings['CRAWLERA_APIKEY'],
                },
                cache_args=['lua_source'],
            )
        except Exception:
            # One company with a bad/missing URL must not abort the
            # whole crawl, but the previous bare ``except: pass`` hid
            # every failure; log it and continue.
            self.logger.exception(
                'Failed to schedule request for company %r', company)
def parse(self, response):
rows = response.xpath("//table/tr")
if 'page=1' in response.url:
rows = rows[1:]
rows = response.xpath(".//div[@class='grid-body']/div")
company = response.meta['company']
is_duplicate = False
for i in rows:
item = ExaItem()
item['date'] = self.format_date(i.xpath("./td[contains(@class, 'date')]/text()").extract_first())
item['title'] = i.xpath("./td/a/text()").extract_first()
item['url'] = i.xpath("./td/a/@href").extract_first()
item['date'] = self.format_date(i.xpath("./div/div/field-formatter/span/@title").extract_first())
item['title'] = i.xpath("./div/div/press-reference/div/div/span/span[2]/a/text()").extract_first().strip()
item['url'] = i.xpath("./div/div/press-reference/div/div/span/span[2]/a/@href").extract_first()
item.update(self.get_common_items(company))
item['media_id'] = self._get_media(i)
item['description'] = None
......@@ -50,9 +63,9 @@ class CbSpider(BaseSpider):
if self.fresh:
break
yield item
next_url = self._next_url(response.url)
if len(rows) != 0 and self.can_follow(next_url, is_duplicate):
yield scrapy.Request(next_url, callback=self.parse, meta=response.meta)
# next_url = self._next_url(response.url)
# if len(rows) != 0 and self.can_follow(next_url, is_duplicate):
# yield scrapy.Request(next_url, callback=self.parse, meta=response.meta)
def _get_media(self, elem):
media_name = elem.xpath("./td[contains(@class, 'article')]/span/text()").extract_first()
......
......@@ -4,6 +4,7 @@ python-scrapyd-api==2.0.1
scrapyd-client==1.1.0
scrapy-proxies==0.3
scrapy-crawlera==1.2.4
scrapy-splash==0.7.2
newspaper3k==0.2.2
PyVirtualDisplay==0.2.1
selenium==3.4.1
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment