Commit 03681fe7 authored by Andrii Marynets

Add splash to crawler

parent 94e4c2e3
@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = False
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 CONCURRENT_REQUESTS = 1
+SPLASH_URL = 'http://127.0.0.1:8050'
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
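Note: SPLASH_URL only tells Scrapy where to find a Splash instance; nothing in this commit starts one. Splash is normally run as a Docker container (docker run -p 8050:8050 scrapinghub/splash). As a quick sketch, the instance can be checked against Splash's render.html endpoint before running the spider:

    import requests

    # Hit Splash's render.html endpoint directly; a 200 means the instance
    # configured in SPLASH_URL above is up and able to render pages.
    resp = requests.get('http://127.0.0.1:8050/render.html',
                        params={'url': 'http://example.com', 'wait': 0.5})
    print(resp.status_code)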
@@ -54,20 +54,20 @@ DEFAULT_REQUEST_HEADERS = {
 # Enable or disable spider middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
-# SPIDER_MIDDLEWARES = {
-# 'exa.middlewares.ExaSpiderMiddleware': 543,
-# }
+SPIDER_MIDDLEWARES = {
+    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
+}
+DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
 # Enable or disable downloader middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
 DOWNLOADER_MIDDLEWARES = {
+    'scrapy_splash.SplashCookiesMiddleware': 723,
+    'scrapy_splash.SplashMiddleware': 725,
+    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
     'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
     'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
-    # 'exa.middlewares.SeleniumDownloadMiddleware': 543,
     'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
-    # 'scrapy_proxies.RandomProxy': 100,
-    # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
-    'scrapy_crawlera.CrawleraMiddleware': 610
+    'scrapy_crawlera.CrawleraMiddleware': 750
 }
 # Enable or disable extensions
...
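These orders match the scrapy-splash README: SplashCookiesMiddleware (723) and SplashMiddleware (725) slot in just before Scrapy's built-in HttpProxyMiddleware (750), HttpCompressionMiddleware is re-registered at 810, and SplashDeduplicateArgsMiddleware (100) together with SplashAwareDupeFilter keeps duplicate filtering sane when requests carry large Splash arguments such as a Lua script. The README also recommends a Splash-aware HTTP cache backend; this commit does not enable HTTPCACHE, so the following is an optional sketch rather than part of the change:

    # Optional, per the scrapy-splash README: only needed if Scrapy's
    # HTTP cache is ever enabled alongside Splash requests.
    HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'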
 # -*- coding: utf-8 -*-
+import dateparser
 import scrapy
 from scrapy.utils.project import get_project_settings
+from scrapy_splash import SplashRequest
+from w3lib.http import basic_auth_header
+from pkgutil import get_data
 from .base import BaseSpider
 from ..items import ExaItem
+api_key = get_project_settings().get('CRAWLERA_APIKEY')
 class CbSpider(BaseSpider):
     name = "cb"
     allowed_domains = ["www.crunchbase.com"]
-    crawlera_enabled = True
-    crawlera_apikey = get_project_settings().get('CRAWLERA_APIKEY')
+    # crawlera_enabled = True
+    # crawlera_apikey = api_key
     # start_urls = ['http://www.crunchbase.com/organization/sense-ly/press/']
     co = 0
+    def __init__(self, *args, **kwargs):
+        self.LUA_SOURCE = get_data(
+            'exa', 'scripts/crawlera.lua'
+        ).decode('utf-8')
+        super(CbSpider, self).__init__(*args, **kwargs)
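The new __init__ loads the Lua script Splash will execute. Its body is not part of this diff; per the scrapy-splash and Crawlera documentation such a script typically defines main(splash), registers an on_request handler that routes every sub-request through the Crawlera proxy using splash.args.crawlera_user as the username, then calls splash:go(splash.args.url) and returns splash:html(). Because pkgutil.get_data reads the file as package data, the script must be bundled with the package when the project is deployed (e.g. via scrapyd-client). A sketch of the packaging side, with hypothetical setup() metadata:

    from setuptools import setup, find_packages

    setup(
        name='exa',                               # hypothetical metadata
        version='0.0.1',
        packages=find_packages(),
        package_data={'exa': ['scripts/*.lua']},  # ship crawlera.lua with the code
    )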
     def start_requests(self):
         for i in self.companies(self.name):
             try:
-                yield scrapy.Request(i.url, callback=self.parse, meta={'company': i, 'post_id': 0})
+                yield SplashRequest(url=i.url,
+                                    callback=self.parse,
+                                    meta={'company': i, 'post_id': 0},
+                                    args={'wait': 5,
+                                          'lua_source': self.LUA_SOURCE,
+                                          'crawlera_user': self.settings['CRAWLERA_APIKEY'],
+                                          },
+                                    cache_args=['lua_source'],
+                                    )
             except:
                 pass
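SplashRequest forwards the url plus everything in args to Splash, where the Lua script reads them back through splash.args; cache_args=['lua_source'] makes scrapy-splash send the script body once and reference it by hash on later requests. For debugging outside Scrapy, roughly the same render can be requested straight from Splash's execute endpoint; a sketch, with a placeholder API key:

    import requests

    payload = {
        'url': 'http://www.crunchbase.com/organization/sense-ly/press/',
        'wait': 5,
        'lua_source': open('exa/scripts/crawlera.lua').read(),
        'crawlera_user': '<CRAWLERA_APIKEY>',  # placeholder, not a real key
    }
    # Splash exposes scripted rendering at /execute; extra keys in the JSON
    # body become splash.args inside the Lua script.
    resp = requests.post('http://127.0.0.1:8050/execute', json=payload)
    print(resp.status_code, len(resp.text))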
     def parse(self, response):
-        rows = response.xpath("//table/tr")
-        if 'page=1' in response.url:
-            rows = rows[1:]
+        rows = response.xpath(".//div[@class='grid-body']/div")
         company = response.meta['company']
         is_duplicate = False
         for i in rows:
             item = ExaItem()
-            item['date'] = self.format_date(i.xpath("./td[contains(@class, 'date')]/text()").extract_first())
-            item['title'] = i.xpath("./td/a/text()").extract_first()
-            item['url'] = i.xpath("./td/a/@href").extract_first()
+            item['date'] = self.format_date(i.xpath("./div/div/field-formatter/span/@title").extract_first())
+            item['title'] = i.xpath("./div/div/press-reference/div/div/span/span[2]/a/text()").extract_first().strip()
+            item['url'] = i.xpath("./div/div/press-reference/div/div/span/span[2]/a/@href").extract_first()
             item.update(self.get_common_items(company))
             item['media_id'] = self._get_media(i)
             item['description'] = None
@@ -50,9 +63,9 @@ class CbSpider(BaseSpider):
             if self.fresh:
                 break
             yield item
-        next_url = self._next_url(response.url)
-        if len(rows) != 0 and self.can_follow(next_url, is_duplicate):
-            yield scrapy.Request(next_url, callback=self.parse, meta=response.meta)
+        # next_url = self._next_url(response.url)
+        # if len(rows) != 0 and self.can_follow(next_url, is_duplicate):
+        #     yield scrapy.Request(next_url, callback=self.parse, meta=response.meta)
     def _get_media(self, elem):
         media_name = elem.xpath("./td[contains(@class, 'article')]/span/text()").extract_first()
...
@@ -4,6 +4,7 @@ python-scrapyd-api==2.0.1
 scrapyd-client==1.1.0
 scrapy-proxies==0.3
 scrapy-crawlera==1.2.4
+scrapy-splash==0.7.2
 newspaper3k==0.2.2
 PyVirtualDisplay==0.2.1
 selenium==3.4.1
...
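The scrapy-splash pin must be installed everywhere the spiders run, including the scrapyd host this project deploys to, not just local environments; the 0.7.x line pinned here provides both SplashRequest and the cache_args feature the spider relies on.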