Commit 3249334f authored by Andrii Marynets's avatar Andrii Marynets

Request the AJAX endpoint directly

parent 07793329
@@ -65,9 +65,9 @@ DOWNLOADER_MIDDLEWARES = {
     'scrapy_splash.SplashMiddleware': 725,
     'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
     'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
-    # 'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
+    'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
     'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
-    'scrapy_crawlera.CrawleraMiddleware': None
+    'scrapy_crawlera.CrawleraMiddleware': 710
 }
 # Enable or disable extensions
...
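Switching CrawleraMiddleware from None to priority 710 routes downloads through Crawlera, and un-commenting RandomUserAgentMiddleware rotates the User-Agent per request. As a minimal sketch (not part of this commit), the per-spider attributes enabled in the spider hunk below have project-wide equivalents that scrapy-crawlera reads from settings.py; the key value here is a placeholder:

CRAWLERA_ENABLED = True
CRAWLERA_APIKEY = '<crawlera-api-key>'  # placeholder; the spider supplies api_key instead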
 # -*- coding: utf-8 -*-
+import json
 import scrapy
 from scrapy.utils.project import get_project_settings
 from scrapy_splash import SplashRequest
@@ -15,8 +16,8 @@ class CbSpider(BaseSpider):
     name = "cb"
     allowed_domains = ["www.crunchbase.com"]
     handle_httpstatus_list = [470]
-    # crawlera_enabled = True
-    # crawlera_apikey = api_key
+    crawlera_enabled = True
+    crawlera_apikey = api_key
     co = 0

     def __init__(self, *args, **kwargs):
@@ -26,23 +27,58 @@ class CbSpider(BaseSpider):
         super(CbSpider, self).__init__(*args, **kwargs)

     def start_requests(self):
-        for i in self.companies(self.name):
+        for s, i in enumerate(self.companies(self.name)):
             try:
-                yield SplashRequest(url=i.url,
-                                    callback=self.parse,
-                                    endpoint='execute',
-                                    meta={'company': i, 'post_id': 0},
-                                    args={'wait': 5,
-                                          'lua_source': self.LUA_SOURCE,
-                                          'apikey': self.settings['CRAWLERA_APIKEY'],
-                                          },
-                                    # cache_args=['lua_source'],
-                                    )
+                # yield SplashRequest(url=i.url,
+                #                     callback=self.parse,
+                #                     endpoint='execute',
+                #                     meta={'company': i, 'post_id': 0},
+                #                     args={'wait': 5,
+                #                           'lua_source': self.LUA_SOURCE,
+                #                           'apikey': self.settings['CRAWLERA_APIKEY'],
+                #                           },
+                #                     # cache_args=['lua_source'],
+                #                     )
+                yield scrapy.Request(url=i.url,
+                                     callback=self.parse,
+                                     meta={'company': i, 'post_id': 0, 'cookiejar': s})
             except:
                 pass
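The rewritten start_requests drops Splash in favour of plain scrapy.Request objects, and the cookiejar meta key (a standard feature of Scrapy's CookiesMiddleware) gives each company an isolated cookie session keyed by its enumerate index. A minimal sketch of the pattern, assuming nothing beyond stock Scrapy; the spider name and URLs are hypothetical:

import scrapy

class SessionSketchSpider(scrapy.Spider):
    # hypothetical spider for illustration; not part of this commit
    name = 'session_sketch'

    def start_requests(self):
        # hypothetical URL list; the real spider iterates self.companies(...)
        for s, url in enumerate(['https://example.com/a', 'https://example.com/b']):
            # each enumerate index gets its own isolated cookie jar
            yield scrapy.Request(url, callback=self.parse, meta={'cookiejar': s})

    def parse(self, response):
        # follow-up requests stay in the same session only if the key is
        # passed along, exactly as parse() below does for its POST request
        self.logger.info('jar %s fetched %s', response.meta['cookiejar'], response.url)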
     def parse(self, response):
         print(response.body)
+        body = {
+            "field_ids": [
+                "activity_properties",
+                "entity_def_id",
+                "identifier",
+                "activity_date",
+                "activity_entities"
+            ],
+            "order": [],
+            "query": [
+                {
+                    "type": "predicate",
+                    "field_id": "activity_entities",
+                    "operator_id": "includes",
+                    "values": [
+                        "f93d65c7-11da-f085-0bdd-d54510f77a41"
+                    ]
+                }
+            ],
+            "limit": 100
+        }
+        uuid = '"uuid":"'
+        page = response.body.decode('utf8')
+        s_uuid = page.find(uuid)
+        uuid = page[s_uuid + len(uuid):page.find('"', s_uuid + len(uuid))]
+        body['query'][0]['values'][0] = uuid
+        yield scrapy.Request(url='https://www.crunchbase.com/v4/data/searches/activities',
+                             method='POST',
+                             body=json.dumps(body),
+                             callback=self.parse_news,
+                             meta={'cookiejar': response.meta['cookiejar']})
         rows = response.xpath(".//div[@class='grid-body']/div")
         company = response.meta['company']
         is_duplicate = False
@@ -64,9 +100,11 @@ class CbSpider(BaseSpider):
                 if self.fresh:
                     break
                 yield item
         # next_url = self._next_url(response.url)
         # if len(rows) != 0 and self.can_follow(next_url, is_duplicate):
         #     yield scrapy.Request(next_url, callback=self.parse, meta=response.meta)
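parse() now scrapes the organization uuid out of the rendered page by scanning for the literal "uuid":" marker and slicing up to the closing quote, then substitutes it into the search predicate before POSTing to the v4 activities endpoint. Note that find() returns -1 when the marker is missing, so the slice silently yields junk; a slightly more defensive sketch using a regular expression, with a hypothetical helper name:

import re

def extract_uuid(page):
    # hypothetical helper, equivalent to the find()-based slicing above but
    # returning None instead of a garbage slice when the marker is absent
    m = re.search(r'"uuid":"([0-9a-f\-]+)"', page)
    return m.group(1) if m else None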
 
+    def parse_news(self, response):
+        print(response.body)
+
     def _get_media(self, elem):
         media_name = elem.xpath("./td[contains(@class, 'article')]/span/text()").extract_first()
@@ -74,7 +112,8 @@ class CbSpider(BaseSpider):
         query = "select * from wp_esi_media where name like '%{}%' or url like '%{}%'".format(media_name, media_url)
         media = self.pipeline.db.select(query)
         if len(media) == 0:
-            media = self.pipeline.db.insert("INSERT INTO wp_esi_media (name, url) VALUES(%s, %s)", (media_name, media_url))
+            media = self.pipeline.db.insert("INSERT INTO wp_esi_media (name, url) VALUES(%s, %s)",
+                                            (media_name, media_url))
         else:
             media = media[0][0]
         return media
...
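parse_news is still a stub that only prints the raw response body. A hedged sketch of a likely next step, decoding the v4 search reply as JSON; the 'entities' and 'properties' keys are assumptions about Crunchbase's private API response shape, not something this commit confirms:

def parse_news(self, response):
    data = json.loads(response.body)
    # assumed schema: a top-level 'entities' list whose items carry the
    # requested field_ids under a 'properties' mapping
    for entity in data.get('entities', []):
        yield entity.get('properties', {})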