Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
E
esi-table-data
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
esi-data-scrapping
esi-table-data
Commits
293b7a26
Commit
293b7a26
authored
Jun 09, 2017
by
Vasyl Bodnaruk
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add functionality for scraping TechCrunch
First step: add the company name as a tag in the URL
parent
227cd025
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
68 additions
and
21 deletions
+68
-21
company_maker.py
exa/exa/helpers/company_maker.py
+13
-12
decorators.py
exa/exa/helpers/decorators.py
+0
-0
middlewares.py
exa/exa/middlewares.py
+2
-0
pipelines.py
exa/exa/pipelines.py
+4
-4
mobihealthnews.py
exa/exa/spiders/mobihealthnews.py
+2
-2
techcrunch.py
exa/exa/spiders/techcrunch.py
+47
-3
No files found.
exa/exa/helpers/company_maker.py
View file @
293b7a26
from collections import namedtuple

# Lightweight record describing one scrape target built for a media site.
Company = namedtuple('Company', 'id, url, media_id, type_id, region_id, name')
# Example tag URLs produced for the 'mhn' media:
# 'http://www.mobihealthnews.com/tag/apple'
# 'http://www.mobihealthnews.com/tag/clover-health'
# 'http://www.mobihealthnews.com/tag/MedTronic'
# One DB entity row: (id, name, country).
Entity = namedtuple('Entity', 'id, name, country')


class CompanyMaker:
    """Turn DB entity rows into per-media ``Company`` scrape targets."""

    def __init__(self, companies=None):
        """``companies``: iterable of (id, name, country) rows (e.g. DB result)."""
        self.in_site = list()
        if companies:
            for i in companies:
                self.in_site.append(Entity(i[0], i[1], i[2]))
        self.companies = list()

    def make_companies(self, media):
        """Build the target list for the given media short name ('mhn' or 'tc').

        Unknown media names leave the target list untouched.
        """
        if media == 'mhn':
            self._make_list_for_mhn()
        elif media == 'tc':
            self._make_list_for_tc()

    def get_companies(self):
        """Return the list of ``Company`` targets built so far."""
        return self.companies

    def _make_list_for_mhn(self):
        # mobihealthnews tag slugs: spaces become dashes, dots are dropped.
        for i in self.in_site:
            tag = i.name
            # Bug fix: the original used `if tag.find(' '):`, which is truthy
            # when the space is ABSENT (find returns -1) and falsy when the
            # space is at index 0 — use membership tests instead.
            if ' ' in tag:
                tag = tag.replace(' ', '-')
            if '.' in tag:
                tag = tag.replace('.', '')
            self.companies.append(
                Company(i.id, 'http://www.mobihealthnews.com/tag/' + tag, 43, 2, 2, i.name))

    def _make_list_for_tc(self):
        # techcrunch uses the raw entity name as the tag.
        for i in self.in_site:
            self.companies.append(
                Company(i.id, 'https://techcrunch.com/tag/' + i.name, 81, 2, 2, i.name))
\ No newline at end of file
exa/exa/helpers/decorators.py
0 → 100644
View file @
293b7a26
exa/exa/middlewares.py
View file @
293b7a26
...
@@ -53,6 +53,8 @@ class SeleniumDownloadMiddleware(object):
...
@@ -53,6 +53,8 @@ class SeleniumDownloadMiddleware(object):
from
pyvirtualdisplay
import
Display
from
pyvirtualdisplay
import
Display
self
.
display
=
Display
()
self
.
display
=
Display
()
self
.
display
.
start
()
self
.
display
.
start
()
else
:
self
.
display
=
None
if
middleware
[
'driver'
]
==
'Chrome'
:
if
middleware
[
'driver'
]
==
'Chrome'
:
from
selenium.webdriver.chrome.options
import
Options
from
selenium.webdriver.chrome.options
import
Options
...
...
exa/exa/pipelines.py
View file @
293b7a26
...
@@ -15,21 +15,21 @@ class ExaPipeline(object):
...
@@ -15,21 +15,21 @@ class ExaPipeline(object):
def __init__(self):
    """Open the DB connection and cache every already-stored article URL.

    The cached set lets process_item() skip duplicates without a query
    per item.
    """
    self.db = Database(**db)
    known = self.db.select('select url from wp_esi_news_accept')
    self.urls = {row[0] for row in known}
    print(self.urls)
    super(ExaPipeline, self).__init__()
def process_item(self, item, spider):
    """Normalize an extracted item, skip already-seen URLs, insert new rows.

    Returns the item so later pipelines receive it (Scrapy pipeline
    contract); the original implicitly returned None, which would feed
    None to any downstream pipeline.
    """
    # Collapse the list of extracted text fragments and strip newlines.
    item['title'] = ''.join(item['title']).replace('\n', ' ')
    item['description'] = ''.join(item['description']).replace('\n', ' ')
    data = (item['title'], item['description'], item['url'], item['media_id'], item['type_id'],
            item['region_id'], item['post_id'], item['date'], datetime.now().date(),
            item['company_id'], 0)
    query = """INSERT INTO wp_esi_news_accept (title, description, URL, media_id, type_id, region_id, post_id,
               publish_date, record_date, company_id, is_accepted) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);\n"""
    print(item)
    if item['url'] in self.urls:
        print("DUPLICATE", item)
    else:
        print("UNIQUE", item)
        self.db.insert(query, data)
        # Remember the URL so later items in this run are de-duplicated too.
        self.urls.add(item['url'])
    # self._insert_news_entiry(news, item['company_id'])
    # self.out.write(query)
    # Bug fix: return the item instead of None (Scrapy requires pipelines
    # to return the item or raise DropItem).
    return item
...
...
exa/exa/spiders/mobihealthnews.py
View file @
293b7a26
...
@@ -12,7 +12,7 @@ db = Database(**db_settings)
...
@@ -12,7 +12,7 @@ db = Database(**db_settings)
class
MobiHealthNewsSpider
(
scrapy
.
Spider
):
class
MobiHealthNewsSpider
(
scrapy
.
Spider
):
name
=
"m
obihealthnews
"
name
=
"m
hn
"
allowed_domains
=
[
"www.mobihealthnews.com"
]
allowed_domains
=
[
"www.mobihealthnews.com"
]
def
__init__
(
self
,
*
args
,
**
kwargs
):
def
__init__
(
self
,
*
args
,
**
kwargs
):
...
@@ -26,7 +26,7 @@ class MobiHealthNewsSpider(scrapy.Spider):
...
@@ -26,7 +26,7 @@ class MobiHealthNewsSpider(scrapy.Spider):
def start_requests(self):
    """Seed the crawl with one tag-page request per tracked company."""
    maker = CompanyMaker(self.comp)
    maker.make_companies(self.name)
    for company in maker.get_companies():
        meta = {'company': company, 'post_id': 0}
        yield scrapy.Request(company.url, callback=self.parse, meta=meta)
...
...
exa/exa/spiders/techcrunch.py
View file @
293b7a26
# -*- coding: utf-8 -*-
import scrapy
from scrapy.utils.project import get_project_settings
from ..helpers import CompanyMaker, Database
from ..items import ExaItem

# Module-level DB handle: connection settings are read from the scrapy
# project settings ('DB' key) so the spider can query the entity table
# before the crawl starts. Mirrors the setup in mobihealthnews.py.
db_settings = get_project_settings().get('DB')
db = Database(**db_settings)
class TechcrunchSpider(scrapy.Spider):
    """Spider that scrapes TechCrunch tag pages for tracked companies."""

    name = "tc"
    allowed_domains = ["techcrunch.com"]
    # NOTE(review): start_urls is effectively unused because start_requests()
    # is overridden below; kept for parity with the other spiders.
    start_urls = ['https://techcrunch.com/tag/Ericsson/']

    def __init__(self, *args, **kwargs):
        # Optional extra SQL condition passed on the command line,
        # e.g. `scrapy crawl tc -a query="id=31"`.
        self.condition = kwargs.get('query')
        self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=26"
        if self.condition:
            print(self.condition)
            self.query += ' or {}'.format(self.condition)
        self.comp = db.select(self.query)
        super(TechcrunchSpider, self).__init__()

    def start_requests(self):
        """Build one request per company tag page (media short name 'tc')."""
        companies = CompanyMaker(self.comp)
        companies.make_companies(self.name)
        for i in companies.get_companies():
            yield scrapy.Request(i.url, callback=self.parse,
                                 meta={'company': i, 'post_id': 0})

    def parse(self, response):
        """Extract news blocks from a tag page and follow pagination."""
        news_list = response.xpath("..//div[contains(@class, 'block block-thumb ')]")
        company = response.meta['company']
        for i in news_list:
            item = ExaItem()
            item['date'] = i.xpath("./div/div/time/@datetime").extract_first()
            item['title'] = i.xpath("./div/h2/a/text()").extract_first()
            item['description'] = i.xpath("./div/p//text()").extract_first()
            item['url'] = i.xpath("./div/h2/a/@href").extract_first()
            item['region_id'] = company.region_id
            item['type_id'] = company.type_id
            item['media_id'] = company.media_id
            item['company_id'] = company.id
            item['post_id'] = response.meta['post_id']
            # NOTE(review): items are built but never emitted — the yield was
            # commented out in the original; left as-is to preserve behavior.
            # yield item
        has_next = response.xpath("//div[contains(@class, 'pagination-container')]//li[contains(@class, 'next')]/a/@href").extract_first()
        # Bug fix: only build next_url once we know a next link exists; the
        # original concatenated 'https://techcrunch.com' + has_next BEFORE the
        # None check, raising TypeError on the last page of results.
        if has_next:
            next_url = 'https://techcrunch.com' + has_next
            yield scrapy.Request(next_url, callback=self.parse,
                                 meta={'company': response.meta['company'], 'post_id': 0})
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment