Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
E
esi-table-data
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
esi-data-scrapping
esi-table-data
Commits
6ebd5b73
Commit
6ebd5b73
authored
Jun 09, 2017
by
Vasyl Bodnaruk
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
simple refactor
parent
9655cc2e
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
39 additions
and
9 deletions
+39
-9
techcrunch.py
exa/exa/spiders/techcrunch.py
+39
-9
No files found.
exa/exa/spiders/techcrunch.py
View file @
6ebd5b73
...
@@ -13,7 +13,7 @@ db = Database(**db_settings)
...
@@ -13,7 +13,7 @@ db = Database(**db_settings)
class
TechcrunchSpider
(
scrapy
.
Spider
):
class
TechcrunchSpider
(
scrapy
.
Spider
):
name
=
"tc"
name
=
"tc"
allowed_domains
=
[
"techcrunch.com"
]
allowed_domains
=
[
"techcrunch.com"
]
start_urls
=
[
'https://techcrunch.com/tag/Ericsson/'
]
#
start_urls = ['https://techcrunch.com/tag/Ericsson/']
def
__init__
(
self
,
*
args
,
**
kwargs
):
def
__init__
(
self
,
*
args
,
**
kwargs
):
self
.
condition
=
kwargs
.
get
(
'query'
)
self
.
condition
=
kwargs
.
get
(
'query'
)
...
@@ -41,19 +41,49 @@ class TechcrunchSpider(scrapy.Spider):
...
@@ -41,19 +41,49 @@ class TechcrunchSpider(scrapy.Spider):
item
[
'description'
]
=
i
.
xpath
(
"./div/p//text()"
)
.
extract_first
()
item
[
'description'
]
=
i
.
xpath
(
"./div/p//text()"
)
.
extract_first
()
item
[
'url'
]
=
i
.
xpath
(
"./div/h2/a/@href"
)
.
extract_first
()
item
[
'url'
]
=
i
.
xpath
(
"./div/h2/a/@href"
)
.
extract_first
()
item
[
'region_id'
]
=
company
.
region_id
item
.
update
(
self
.
get_common_items
(
company
))
item
[
'type_id'
]
=
company
.
type_id
item
[
'media_id'
]
=
company
.
media_id
item
[
'company_id'
]
=
company
.
id
item
[
'post_id'
]
=
response
.
meta
[
'post_id'
]
item
[
'post_id'
]
=
response
.
meta
[
'post_id'
]
# print(item)
#
yield item
yield
item
has_next
=
response
.
xpath
(
"//div[contains(@class, 'pagination-container')]//li[contains(@class, 'next')]/a/@href"
)
.
extract_first
()
has_next
=
response
.
xpath
(
"//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href"
)
.
extract_first
()
next_url
=
'https://techcrunch.com'
+
has_next
next_url
=
'https://techcrunch.com'
+
has_next
if
has_next
:
if
has_next
:
yield
scrapy
.
Request
(
next_url
,
callback
=
self
.
parse
,
meta
=
{
'company'
:
response
.
meta
[
'company'
],
'post_id'
:
0
})
yield
scrapy
.
Request
(
next_url
,
callback
=
self
.
parse
,
meta
=
{
'company'
:
response
.
meta
[
'company'
],
'post_id'
:
0
})
except
BaseException
as
e
:
except
BaseException
as
e
:
print
(
'We had error'
)
print
(
'We had error'
)
traceback
.
print_exc
()
traceback
.
print_exc
()
\ No newline at end of file
def get_common_items(self, company):
    """Return the item fields shared by every article scraped for *company*.

    The mapping mirrors the ``company`` record: ``region_id``, ``type_id``
    and ``media_id`` are copied through, and the company's primary key is
    exposed under the ``company_id`` key.
    """
    shared = {}
    shared['region_id'] = company.region_id
    shared['type_id'] = company.type_id
    shared['media_id'] = company.media_id
    shared['company_id'] = company.id
    return shared
def parse_tag(self, response):
    """Parse a TechCrunch tag listing page.

    Builds an ``ExaItem`` for each article block on the page (date, title,
    description, url, plus the company-wide fields from
    ``get_common_items``) and follows the "next" pagination link when one
    exists.

    :param response: Scrapy response; ``response.meta`` must carry
        ``'company'`` and ``'post_id'`` (set by the requesting callback).

    NOTE(review): items are printed but the ``yield item`` is commented out
    in the original — preserved as-is; confirm whether items should be
    emitted to the pipeline.
    """
    news_list = response.xpath("..//div[contains(@class, 'block block-thumb ')]")
    company = response.meta['company']
    for i in news_list:
        item = ExaItem()
        item['date'] = i.xpath("./div/div/time/@datetime").extract_first()
        item['title'] = i.xpath("./div/h2/a/text()").extract_first()
        item['description'] = i.xpath("./div/p//text()").extract_first()
        item['url'] = i.xpath("./div/h2/a/@href").extract_first()
        # Shared company-level fields (region/type/media/company ids).
        item.update(self.get_common_items(company))
        item['post_id'] = response.meta['post_id']
        print(item)
        # yield item

    # Relative href of the "next page" link, or None on the last page.
    has_next = response.xpath(
        "//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href"
    ).extract_first()
    if has_next:
        # Concatenate only inside the guard: on the last page has_next is
        # None and 'str + None' would raise TypeError.
        next_url = 'https://techcrunch.com' + has_next
        # NOTE(review): callback is self.parse, not self.parse_tag — confirm
        # this is intentional for pagination of tag pages.
        yield scrapy.Request(next_url, callback=self.parse,
                             meta={'company': response.meta['company'], 'post_id': 0})
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment