Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
E
esi-table-data
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
esi-data-scrapping
esi-table-data
Commits
de32a15f
Commit
de32a15f
authored
Oct 19, 2017
by
Andrii Marynets
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Check item on duplicate
parent
da12a613
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
8 additions
and
8 deletions
+8
-8
cb.py
exa/exa/spiders/cb.py
+8
-8
No files found.
exa/exa/spiders/cb.py
View file @
de32a15f
...
...
@@ -79,7 +79,8 @@ class CbSpider(BaseSpider):
headers
=
{
'x-requested-with'
:
'XMLHttpRequest'
,
'content-type'
:
'application/json'
},
callback
=
self
.
parse_news
,
meta
=
{
'cookiejar'
:
response
.
meta
[
'cookiejar'
],
'company'
:
response
.
meta
[
'company'
]})
'company'
:
response
.
meta
[
'company'
],
'post_id'
:
response
.
meta
[
'post_id'
]})
rows
=
response
.
xpath
(
".//div[@class='grid-body']/div"
)
company
=
response
.
meta
[
'company'
]
...
...
@@ -107,7 +108,6 @@ class CbSpider(BaseSpider):
# yield scrapy.Request(next_url, callback=self.parse, meta=response.meta)
def
parse_news
(
self
,
response
):
body
=
json
.
loads
(
response
.
body
.
decode
(
'utf8'
))
print
(
body
)
for
i
in
body
[
'entities'
]:
prop
=
i
[
'properties'
]
if
prop
[
'entity_def_id'
]
==
'press_reference'
:
...
...
@@ -118,7 +118,12 @@ class CbSpider(BaseSpider):
publisher
=
prop
[
'activity_properties'
][
'publisher'
]
item
.
update
(
self
.
get_common_items
(
response
.
meta
[
'company'
]))
item
[
'media_id'
]
=
self
.
_get_media
((
publisher
,
item
[
'url'
]))
print
(
item
)
item
[
'description'
]
=
None
item
[
'post_id'
]
=
response
.
meta
[
'post_id'
]
item
[
'tags'
]
=
None
if
self
.
pipeline
.
check_url
(
item
[
'url'
])
and
self
.
fresh
:
break
yield
item
def
_get_media
(
self
,
site
):
media_name
,
media_url
=
site
...
...
@@ -132,8 +137,3 @@ class CbSpider(BaseSpider):
else
:
media
=
media
[
0
][
0
]
return
media
def
_next_url
(
self
,
url
):
pos
=
url
.
rfind
(
'='
)
+
1
next_page
=
int
(
url
[
pos
:])
+
1
return
url
[:
pos
]
+
str
(
next_page
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment