esi-data-scrapping / esi-table-data

Commit 44e27193, authored May 16, 2017 by Vasyl Bodnaruk, committed May 16, 2017 by Andrii Marynets

writed new scraper

Parent: a1df6c29

Showing 5 changed files with 88 additions and 42 deletions (+88 / -42)
exa/exa/items.py             +1  -0
exa/exa/middlewares.py       +2  -1
exa/exa/pipelines.py         +12 -3
exa/exa/settings.py          +50 -28
exa/exa/spiders/exa_news.py  +23 -10
exa/exa/items.py
@@ -10,5 +10,6 @@ import scrapy

class ExaItem(scrapy.Item):
    date = scrapy.Field()
    media = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
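The hunk above grows ExaItem by one field (apparently media, which the pipeline and spider below start using). A minimal usage sketch, not part of the commit, with hypothetical values, assuming the exa package layout shown in this diff is importable:

from exa.items import ExaItem  # assumes the exa/exa/items.py module above is on the path

item = ExaItem(
    date='2017-05-16',                                    # hypothetical values, for illustration only
    media='mobihealthnews',
    title='Example headline',
    url='http://www.mobihealthnews.com/example-article',
)
print(dict(item))  # a scrapy.Item behaves like a dict of its declared Fields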
exa/exa/middlewares.py
@@ -38,7 +38,8 @@ class SeleniumDownloadMiddleware(object):

        self.driver.set_page_load_timeout(60)
        try:
            self.driver.get(request.url)
            time.sleep(4)
        except BaseException as e:
            print('Exception in process loading page')
            return None
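Only a fragment of SeleniumDownloadMiddleware is touched here; the driver setup and the value returned on success are outside the hunk. A hedged sketch of how such a Selenium downloader middleware is commonly completed (the PhantomJS driver and the HtmlResponse return are assumptions, not part of this diff):

import time

from scrapy.http import HtmlResponse
from selenium import webdriver


class SeleniumDownloadMiddleware(object):

    def __init__(self):
        # assumption: the real driver construction is not shown in this commit
        self.driver = webdriver.PhantomJS()

    def process_request(self, request, spider):
        self.driver.set_page_load_timeout(60)
        try:
            self.driver.get(request.url)
            time.sleep(4)  # crude fixed wait for JavaScript-rendered content
        except BaseException:
            print('Exception in process loading page')
            return None
        # hand the rendered page back to Scrapy as a normal response
        return HtmlResponse(self.driver.current_url,
                            body=self.driver.page_source,
                            encoding='utf-8',
                            request=request)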
exa/exa/pipelines.py
@@ -8,9 +8,18 @@ import csv

class ExaPipeline(object):

    def __init__(self):
        self.out = open('out.csv', 'w', newline='\n')
        super(ExaPipeline, self).__init__()

    def __del__(self):
        self.out.close()

    def process_item(self, item, spider):
        with open('out.csv', 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=' ')
            writer.writerow(str(item))
        s = """INSERT INTO wp_esi_news (title, URL, media_id, type_id, region_id, publish_date)
        VALUES('{0}', '{1}', '{2}', {3}, {4}, '{5}')\n""".format(item['title'], item['url'], item['media'], 1, 3, item['date'])
        self.out.write(s)
        return item
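Two details of process_item above are easy to miss: writer.writerow(str(item)) iterates over the item's string representation, so every character lands in its own CSV column, and out.csv is reopened in 'w' mode (truncated) for each item. For comparison, a minimal sketch, not part of the commit and with hypothetical class and file names, that keeps one handle open for the whole crawl and writes one row per item:

import csv


class ExaCsvPipeline(object):  # hypothetical name, for illustration only

    def open_spider(self, spider):
        self.csvfile = open('out_rows.csv', 'w', newline='')  # hypothetical file name
        self.writer = csv.writer(self.csvfile, delimiter=' ')

    def close_spider(self, spider):
        self.csvfile.close()

    def process_item(self, item, spider):
        # one column per field instead of one column per character
        self.writer.writerow([item.get('date'), item.get('media'),
                              item.get('title'), item.get('url')])
        return item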
exa/exa/settings.py
@@ -14,80 +14,102 @@ BOT_NAME = 'exa'

SPIDER_MODULES = ['exa.spiders']
NEWSPIDER_MODULE = 'exa.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'exa (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS = 1

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
-    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-    'Accept-Language': 'en',
-    'Accept-Encoding': 'gzip, deflate, sdch',
-    'Connection': 'keep-alive',
-    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+    'Accept-Language': 'uk,ru;q=0.8,en-US;q=0.6,en;q=0.4,de;q=0.2',
+    'Accept-Encoding': 'gzip, deflate, sdch, br',
+    'Connection': 'keep-alive',
+    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
+    'Upgrade-Insecure-Requests': 1,
+    'Cookie': '_vdl=1; __uvt=; __qca=P0-1643001621-1493898328387; D_SID=31.134.92.67:3/rJdf3hJG6+TR13YMxSvUiTJ8h57So0kyLM43rDTdg; __cfduid=dadeec73c1d1ff4bd071afb9fb374f3211493899807; uvts=5yfEPqhJPZPFQxyd; multivariate_bot=false; s_sq=%5B%5BB%5D%5D; user_intent_path=%2Faccount%2Fsignup%3Fredirect_to%3D%2Forganization%2Fsnappr-2%2Fpress%2Fedit; user_origin_path=%2Forganization%2Fsnappr-2; jaco_uid=4ec53bb9-8854-42eb-a433-3ee13728d283; jaco_referer=; _oklv=1494501188795%2Cg8jiSlQryYuVaqCz3F6pZ0M0P0REorPO; _okdetect=%7B%22token%22%3A%2214945011896690%22%2C%22proto%22%3A%22https%3A%22%2C%22host%22%3A%22www.crunchbase.com%22%7D; olfsk=olfsk5754888747300402; _okbk=cd4%3Dtrue%2Cvi5%3D0%2Cvi4%3D1494501191139%2Cvi3%3Dactive%2Cvi2%3Dfalse%2Cvi1%3Dfalse%2Ccd8%3Dchat%2Ccd6%3D0%2Ccd5%3Daway%2Ccd3%3Dfalse%2Ccd2%3D0%2Ccd1%3D0%2C; _ok=1554-355-10-6773; wcsid=g8jiSlQryYuVaqCz3F6pZ0M0P0REorPO; hblid=fK32w02XYxOB0upN3F6pZ0M0P0REO2B6; AMCV_6B25357E519160E40A490D44%40AdobeOrg=1256414278%7CMCMID%7C86901182656108813444510944813131305330%7CMCAAMLH-1495108161%7C6%7CMCAAMB-1495108161%7CNRX38WO0n5BH8Th-nqAG_A%7CMCAID%7CNONE; _site_session=927d74172f306e57a17c1d078aed0328; _ga=GA1.2.2032591643.1493898326; _gid=GA1.2.154162834.1494583376; _hp2_props.973801186=%7B%22Logged%20In%22%3A%22false%22%2C%22Pro%22%3Afalse%7D; s_pers=%20s_getnr%3D1494583377399-Repeat%7C1557655377399%3B%20s_nrgvo%3DRepeat%7C1557655377402%3B; s_cc=true; _hp2_ses_props.973801186=%7B%22ts%22%3A1494583361786%2C%22d%22%3A%22www.crunchbase.com%22%2C%22h%22%3A%22%2Forganization%2Fsense-ly%2Fpress%22%7D; _hp2_id.973801186=%7B%22userId%22%3A%221725770103252256%22%2C%22pageviewId%22%3A%223361758318735220%22%2C%22sessionId%22%3A%228966052576718307%22%2C%22identity%22%3Anull%2C%22trackerVersion%22%3A%223.0%22%7D; D_PID=579CC756-6031-3F9A-8537-A12264CBC935; D_IID=4FDC617B-AEC3-339A-8ED0-AD1AE00E2167; D_UID=2FE95751-354F-3057-A1E7-15B142B136FE; D_HID=6ixGrm4H6tT/P1hvQqpycrUcm9v3AYCJ7RG1XjcwPR0; D_ZID=7C440876-9267-3387-8333-D2425576FA59; D_ZUID=390BF5F5-7D40-32EE-A617-F3F2C5669811; _px=eyJzIjp7ImEiOjAsImIiOjB9LCJ0IjoxNDk0NTg0NDM4MDg5LCJoIjoiYzE3YWNiMDFiZjMzNjE0NDA0NGJkZDJjNzY4OWRjNmZlODllZmU2ODY3N2ExZjVjY2U3MjljMWUxOWM0YWQzMCJ9'
}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'exa.middlewares.ExaSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'exa.middlewares.SeleniumDownloadMiddleware': 543,
    # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
    # 'scrapy_proxies.RandomProxy': 100,
    # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'exa.pipelines.ExaPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

RETRY_TIMES = 10
RETRY_HTTP_CODES = [500, 503, 504, 416, 400, 403, 404, 408]

# Proxy list containing entries like
# http://host1:port
# http://username:password@host2:port
# http://host3:port
# ...
PROXY_LIST = '/home/andrii/work/exa/proxy_1000.txt'

# Proxy mode
# 0 = Every requests have different proxy
# 1 = Take only one proxy from the list and assign it to every requests
# 2 = Put a custom proxy to use in the settings
PROXY_MODE = 0

# If proxy mode is 2 uncomment this sentence :
#CUSTOM_PROXY = "http://host1:port"
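The settings above define RETRY_TIMES, RETRY_HTTP_CODES, PROXY_LIST and PROXY_MODE, but the retry/proxy middlewares they are meant to feed are still commented out in DOWNLOADER_MIDDLEWARES. A hedged sketch of the dict that would activate them, taken from the commented lines themselves (assumes the scrapy_proxies package is installed; priorities are the ones already written in the file):

DOWNLOADER_MIDDLEWARES = {
    'exa.middlewares.SeleniumDownloadMiddleware': 543,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
    'scrapy_proxies.RandomProxy': 100,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
}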
exa/exa/spiders/exa_news.py

# -*- coding: utf-8 -*-
import scrapy
import dateparser

from ..items import ExaItem


class ExaNewsSpider(scrapy.Spider):
-    name = "exa_news"
-    allowed_domains = ["https://www.crunchbase.com/organization/sense-ly/press/"]
-    start_urls = ['https://www.crunchbase.com/organization/sense-ly/press/']
+    name = "mobihealthnews"
+    allowed_domains = ["www.mobihealthnews.com"]
+    start_urls = ['http://www.mobihealthnews.com/tag/MedTronic']

    def parse(self, response):
-        rows = response.xpath("..//table/tbody/tr")
-        for i in rows:
-            item = ExaItem()
-            item['date'] = i.xpath("//td[contains(@class, 'date')]/text()").extract_first()
-            item['title'] = i.xpath("//td/a/text()").extract_first()
-            item['url'] = i.xpath("//td/a/@href").extract_first()
-            yield item
+        try:
+            rows = response.xpath("..//div[contains(@class, 'group-left')]//div[contains(@class, 'views-row')]")
+            for i in rows:
+                item = ExaItem()
+                item['date'] = dateparser.parse(i.xpath(".//span/span[contains(@class, 'day_list')]/text()").extract_first()).date()
+                item['media'] = 'mobihealthnews'
+                item['title'] = i.xpath("..//span/a/text()").extract_first()
+                item['url'] = 'http://www.mobihealthnews.com' + i.xpath(".//span/a/@href").extract_first()
+                yield item
+
+            has_next = response.xpath("..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/text()").extract_first()
+            next_url = 'http://www.mobihealthnews.com' + response.xpath("..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/@href").extract_first()
+            print(has_next, next_url)
+            if has_next:
+                pass
+                # yield scrapy.Request(next_url, callback=self.parse)
+        except BaseException:
+            print('We had error')
\ No newline at end of file
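In the new parse() the next-page link is located but the request is stubbed out with pass and a commented yield. A hedged sketch, not part of the commit, of what that pagination step could look like if the block inside the try were enabled, guarding against a missing href:

# drop-in replacement for the `if has_next:` block inside ExaNewsSpider.parse()
next_href = response.xpath(
    "..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/@href"
).extract_first()
if next_href:
    next_url = 'http://www.mobihealthnews.com' + next_href
    yield scrapy.Request(next_url, callback=self.parse)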