esi-data-scrapping / esi-table-data / Commits

Commit 3249334f, authored Oct 18, 2017 by Andrii Marynets (parent 07793329)

    Request to AJAX directly

Showing 2 changed files, with 58 additions and 19 deletions:

    exa/exa/settings.py    (+2, -2)
    exa/exa/spiders/cb.py  (+56, -17)
exa/exa/settings.py

```diff
@@ -65,9 +65,9 @@ DOWNLOADER_MIDDLEWARES = {
     'scrapy_splash.SplashMiddleware': 725,
     'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
     'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
-    # 'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
+    'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
     'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
-    'scrapy_crawlera.CrawleraMiddleware': None
+    'scrapy_crawlera.CrawleraMiddleware': 710
 }

 # Enable or disable extensions
```
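For context on the numbers: the value assigned to each entry in DOWNLOADER_MIDDLEWARES is its position in the chain (middlewares with lower values see requests earlier and responses later), and None disables a middleware outright. Below is a minimal sketch of the project-level scrapy-crawlera settings this change pairs with; the CRAWLERA_* names are the documented scrapy-crawlera settings (the spider later reads self.settings['CRAWLERA_APIKEY']), and the key value is a placeholder:

```python
# settings.py sketch, not part of the commit.
CRAWLERA_ENABLED = True                        # global on/off switch
CRAWLERA_APIKEY = '<your-crawlera-api-key>'    # placeholder value

DOWNLOADER_MIDDLEWARES = {
    # Lower order = closer to the engine: process_request runs in
    # ascending order, process_response in descending order.
    'scrapy_crawlera.CrawleraMiddleware': 710,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,  # None disables it
}
```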
exa/exa/spiders/cb.py

```diff
 # -*- coding: utf-8 -*-
+import json
 import scrapy
 from scrapy.utils.project import get_project_settings
 from scrapy_splash import SplashRequest
```
```diff
@@ -15,8 +16,8 @@ class CbSpider(BaseSpider):
     name = "cb"
     allowed_domains = ["www.crunchbase.com"]
     handle_httpstatus_list = [470]
-    # crawlera_enabled = True
-    # crawlera_apikey = api_key
+    crawlera_enabled = True
+    crawlera_apikey = api_key
     co = 0

     def __init__(self, *args, **kwargs):
```
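Uncommenting these turns Crawlera on per spider: scrapy-crawlera also reads crawlera_enabled and crawlera_apikey as spider attributes, so routing through Crawlera stays scoped to CbSpider even though the middleware is registered globally in settings.py. A sketch of that pattern (hypothetical spider, placeholder key):

```python
import scrapy

class ExampleSpider(scrapy.Spider):
    # Hypothetical spider for illustration only; just its requests are
    # routed through Crawlera, other spiders in the project stay direct.
    name = 'example'
    crawlera_enabled = True             # read by scrapy_crawlera.CrawleraMiddleware
    crawlera_apikey = '<your-api-key>'  # placeholder (cb.py takes it from api_key)
```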
```diff
@@ -26,23 +27,58 @@ class CbSpider(BaseSpider):
         super(CbSpider, self).__init__(*args, **kwargs)

     def start_requests(self):
-        for i in self.companies(self.name):
+        for s, i in enumerate(self.companies(self.name)):
             try:
-                yield SplashRequest(url=i.url,
-                                    callback=self.parse,
-                                    endpoint='execute',
-                                    meta={'company': i, 'post_id': 0},
-                                    args={'wait': 5,
-                                          'lua_source': self.LUA_SOURCE,
-                                          'apikey': self.settings['CRAWLERA_APIKEY'],
-                                          },
-                                    # cache_args=['lua_source'],
-                                    )
+                # yield SplashRequest(url=i.url,
+                #                     callback=self.parse,
+                #                     endpoint='execute',
+                #                     meta={'company': i, 'post_id': 0},
+                #                     args={'wait': 5,
+                #                           'lua_source': self.LUA_SOURCE,
+                #                           'apikey': self.settings['CRAWLERA_APIKEY'],
+                #                           },
+                #                     # cache_args=['lua_source'],
+                #                     )
+                yield scrapy.Request(url=i.url,
+                                     callback=self.parse,
+                                     meta={'company': i, 'post_id': 0, 'cookiejar': s})
             except:
                 pass
```
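The new 'cookiejar': s meta key uses Scrapy's built-in support for multiple cookie sessions per spider: each distinct jar id keeps an isolated set of cookies, and because the key is not sticky it has to be re-sent on every follow-up request, which is what parse() does below with response.meta['cookiejar']. A self-contained sketch of the pattern (hypothetical spider, not from this commit):

```python
import scrapy

class JarSpider(scrapy.Spider):
    # Hypothetical spider showing one cookie session per start URL.
    name = 'jar_example'
    start_urls = ['https://example.com/a', 'https://example.com/b']

    def start_requests(self):
        for s, url in enumerate(self.start_urls):
            # Each URL gets its own cookie jar, keyed by the loop index.
            yield scrapy.Request(url, callback=self.parse, meta={'cookiejar': s})

    def parse(self, response):
        # The jar id must be repeated on follow-ups, or they fall back
        # to the default jar and lose this session's cookies.
        yield scrapy.Request(response.urljoin('/next'),
                             callback=self.parse,
                             meta={'cookiejar': response.meta['cookiejar']})
```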
The same hunk continues in parse(), which now pulls the company UUID out of the rendered page, drops it into the activity-search payload, and POSTs straight to the Crunchbase AJAX endpoint:

```diff
     def parse(self, response):
+        print(response.body)
+        body = {"field_ids": ["activity_properties",
+                              "entity_def_id",
+                              "identifier",
+                              "activity_date",
+                              "activity_entities"],
+                "order": [],
+                "query": [{"type": "predicate",
+                           "field_id": "activity_entities",
+                           "operator_id": "includes",
+                           "values": ["f93d65c7-11da-f085-0bdd-d54510f77a41"]}],
+                "limit": 100}
+        uuid = '"uuid":"'
+        page = response.body.decode('utf8')
+        s_uuid = page.find(uuid)
+        uuid = page[s_uuid + len(uuid):page.find('"', s_uuid + len(uuid))]
+        body['query'][0]['values'][0] = uuid
+        yield scrapy.Request(url='https://www.crunchbase.com/v4/data/searches/activities',
+                             method='POST',
+                             body=json.dumps(body),
+                             callback=self.parse_news,
+                             meta={'cookiejar': response.meta['cookiejar']})
         rows = response.xpath(".//div[@class='grid-body']/div")
         company = response.meta['company']
         is_duplicate = False
```
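One fragile spot: if the '"uuid":"' marker is missing from the page, page.find() returns -1 and the slice silently produces garbage rather than raising. A regex-based sketch (a suggestion, not part of the commit) makes the failure explicit:

```python
import re

def extract_uuid(page):
    # Returns the first embedded UUID, or None when the page has no
    # "uuid" field, instead of silently mis-slicing like find() does.
    match = re.search(r'"uuid":"([^"]+)"', page)
    return match.group(1) if match else None
```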
```diff
@@ -64,9 +100,11 @@ class CbSpider(BaseSpider):
                 if self.fresh:
                     break
             yield item
-        # next_url = self._next_url(response.url)
-        # if len(rows) != 0 and self.can_follow(next_url, is_duplicate):
-        #     yield scrapy.Request(next_url, callback=self.parse, meta=response.meta)
+        # next_url = self._next_url(response.url)
+        # if len(rows) != 0 and self.can_follow(next_url, is_duplicate):
+        #     yield scrapy.Request(next_url, callback=self.parse, meta=response.meta)

+    def parse_news(self, response):
+        print(response.body)
+
     def _get_media(self, elem):
         media_name = elem.xpath("./td[contains(@class, 'article')]/span/text()").extract_first()
```
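parse_news lands here as a stub that only prints the raw body. A hypothetical follow-up, assuming the endpoint returns JSON; the 'entities' key is a guess at the v4 search response shape, not something shown in this diff:

```python
import json

def parse_news(self, response):
    # Drop-in sketch for the stub above; verify the payload shape
    # against a real response before relying on these keys.
    data = json.loads(response.text)
    for entry in data.get('entities', []):   # assumed key
        yield {'activity': entry}
```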
```diff
@@ -74,7 +112,8 @@ class CbSpider(BaseSpider):
         query = "select * from wp_esi_media where name like '%{}%' or url like '%{}%'".format(media_name, media_url)
         media = self.pipeline.db.select(query)
         if len(media) == 0:
-            media = self.pipeline.db.insert("INSERT INTO wp_esi_media (name, url) VALUES(%s, %s)", (media_name, media_url))
+            media = self.pipeline.db.insert("INSERT INTO wp_esi_media (name, url) VALUES(%s, %s)",
+                                            (media_name, media_url))
         else:
             media = media[0][0]
         return media
```
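Note that the SELECT still interpolates media_name and media_url with str.format(), which breaks on quotes and is open to SQL injection; only the INSERT uses placeholders. Assuming self.pipeline.db.select() can forward parameters the way insert() does (an assumption; its signature is not shown in this diff), the lookup could be parameterized too:

```python
# Sketch under the assumption that select() accepts params like insert().
query = "SELECT * FROM wp_esi_media WHERE name LIKE %s OR url LIKE %s"
media = self.pipeline.db.select(query, ('%{}%'.format(media_name),
                                        '%{}%'.format(media_url)))
```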