Commit 39c0c968 authored by Andrii Marynets

Fix CB

parent ea06c447
......@@ -55,7 +55,7 @@ DEFAULT_REQUEST_HEADERS = {
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
# 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
# Enable or disable downloader middlewares
......@@ -65,9 +65,9 @@ DOWNLOADER_MIDDLEWARES = {
'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
# 'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
'scrapy_crawlera.CrawleraMiddleware': 750
'scrapy_crawlera.CrawleraMiddleware': None
}
# Enable or disable extensions
......
......@@ -14,10 +14,9 @@ api_key = get_project_settings().get('CRAWLERA_APIKEY')
class CbSpider(BaseSpider):
name = "cb"
allowed_domains = ["www.crunchbase.com"]
handle_httpstatus_list = [470]
# crawlera_enabled = True
# crawlera_apikey = api_key
# start_urls = ['http://www.crunchbase.com/organization/sense-ly/press/']
co = 0
def __init__(self, *args, **kwargs):
......@@ -31,17 +30,19 @@ class CbSpider(BaseSpider):
try:
yield SplashRequest(url=i.url,
callback=self.parse,
endpoint='execute',
meta={'company': i, 'post_id': 0},
args={'wait': 5,
'lua_source': self.LUA_SOURCE,
'crawlera_user': self.settings['CRAWLERA_APIKEY'],
'apikey': self.settings['CRAWLERA_APIKEY'],
},
cache_args=['lua_source'],
# cache_args=['lua_source'],
)
except:
pass
def parse(self, response):
print(response.body)
rows = response.xpath(".//div[@class='grid-body']/div")
company = response.meta['company']
is_duplicate = False
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment