Commit 3249334f authored by Andrii Marynets's avatar Andrii Marynets

Request to AJAX directly

parent 07793329
......@@ -65,9 +65,9 @@ DOWNLOADER_MIDDLEWARES = {
'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
# 'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
'scrapy_crawlera.CrawleraMiddleware': None
'scrapy_crawlera.CrawleraMiddleware': 710
}
# Enable or disable extensions
......
# -*- coding: utf-8 -*-
import json
import scrapy
from scrapy.utils.project import get_project_settings
from scrapy_splash import SplashRequest
......@@ -15,8 +16,8 @@ class CbSpider(BaseSpider):
name = "cb"
allowed_domains = ["www.crunchbase.com"]
handle_httpstatus_list = [470]
# crawlera_enabled = True
# crawlera_apikey = api_key
crawlera_enabled = True
crawlera_apikey = api_key
co = 0
def __init__(self, *args, **kwargs):
......@@ -26,23 +27,58 @@ class CbSpider(BaseSpider):
super(CbSpider, self).__init__(*args, **kwargs)
def start_requests(self):
    """Generate the initial request for each company record.

    Each company is given its own cookiejar (keyed by its enumeration
    index) so session cookies from different companies are kept
    separate across the parse -> parse_news request chain.

    Yields:
        scrapy.Request: a plain GET request for the company profile
        page; the data is now fetched from the AJAX endpoint directly
        in ``parse``, so the earlier Splash (JS-rendering) request is
        no longer needed.
    """
    for jar_id, company in enumerate(self.companies(self.name)):
        try:
            yield scrapy.Request(
                url=company.url,
                callback=self.parse,
                meta={'company': company, 'post_id': 0, 'cookiejar': jar_id},
            )
        except Exception:
            # A single malformed company record (e.g. missing url)
            # must not abort the whole crawl; log it and move on
            # instead of silently swallowing every error.
            self.logger.exception('Failed to build request for %r', company)
# Parse a company profile page.
#
# Extracts the company's UUID from the raw HTML (it is embedded in the
# page's serialized state as '"uuid":"<value>"'), substitutes it into a
# prepared search query, and POSTs that query to the crunchbase.com v4
# "activities" search API so the activity feed is fetched from the AJAX
# endpoint directly.
# NOTE(review): this is a diff fragment — the method body continues in a
# later hunk below (row iteration / duplicate handling).
def parse(self, response):
# Debug dump of the raw page body; presumably left over from development.
print(response.body)
# Query payload for the v4 search API. The hard-coded UUID in "values"
# is only a placeholder — it is overwritten with the real company UUID
# before the request is sent (see below).
body = {
"field_ids": [
"activity_properties",
"entity_def_id",
"identifier",
"activity_date",
"activity_entities"
],
"order": [],
"query": [
{
"type": "predicate",
"field_id": "activity_entities",
"operator_id": "includes",
"values": [
"f93d65c7-11da-f085-0bdd-d54510f77a41"
]
}
],
"limit": 100
}
# Locate the company UUID by scanning for the '"uuid":"' marker in the
# decoded HTML and slicing up to the closing quote.
# NOTE(review): if the marker is absent, find() returns -1 and the slice
# below silently yields garbage — there is no guard here; confirm the
# marker is always present on these pages.
uuid = '"uuid":"'
page = response.body.decode('utf8')
s_uuid = page.find(uuid)
uuid = page[s_uuid + len(uuid):page.find('"', s_uuid + len(uuid))]
body['query'][0]['values'][0] = uuid
# POST the query to the AJAX endpoint; the same cookiejar as the
# profile-page request is reused so the session is preserved.
yield scrapy.Request(url='https://www.crunchbase.com/v4/data/searches/activities',
method='POST',
body=json.dumps(body),
callback=self.parse_news,
meta={'cookiejar': response.meta['cookiejar']})
# Legacy HTML-grid scraping path (continues in the next diff hunk).
rows = response.xpath(".//div[@class='grid-body']/div")
company = response.meta['company']
is_duplicate = False
......@@ -64,9 +100,11 @@ class CbSpider(BaseSpider):
if self.fresh:
break
yield item
# next_url = self._next_url(response.url)
# if len(rows) != 0 and self.can_follow(next_url, is_duplicate):
# yield scrapy.Request(next_url, callback=self.parse, meta=response.meta)
# next_url = self._next_url(response.url)
# if len(rows) != 0 and self.can_follow(next_url, is_duplicate):
# yield scrapy.Request(next_url, callback=self.parse, meta=response.meta)
# Callback for the AJAX "activities" POST issued in parse().
# Currently only dumps the raw JSON response body for inspection;
# parsing of the returned activity records is not implemented yet.
def parse_news(self, response):
print(response.body)
def _get_media(self, elem):
media_name = elem.xpath("./td[contains(@class, 'article')]/span/text()").extract_first()
......@@ -74,7 +112,8 @@ class CbSpider(BaseSpider):
query = "select * from wp_esi_media where name like '%{}%' or url like '%{}%'".format(media_name, media_url)
media = self.pipeline.db.select(query)
if len(media) == 0:
media = self.pipeline.db.insert("INSERT INTO wp_esi_media (name, url) VALUES(%s, %s)", (media_name, media_url))
media = self.pipeline.db.insert("INSERT INTO wp_esi_media (name, url) VALUES(%s, %s)",
(media_name, media_url))
else:
media = media[0][0]
return media
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment