esi-data-scrapping / esi-table-data · Commits

Commit a1df6c29, authored May 11, 2017 by Vasyl Bodnaruk, committed by Andrii Marynets on May 11, 2017
Parent: 61045b1d

    Wrote spider

Showing 6 changed files with 64 additions and 55 deletions (+64 / -55):
    items.py          (exa/exa/items.py)            +3  -3
    middlewares.py    (exa/exa/middlewares.py)      +29 -38
    pipelines.py      (exa/exa/pipelines.py)        +5  -0
    settings.py       (exa/exa/settings.py)         +15 -12
    exa_news.py       (exa/exa/spiders/exa_news.py) +9  -2
    requirements.txt                                +3  -0
exa/exa/items.py
@@ -9,6 +9,6 @@ import scrapy
 class ExaItem(scrapy.Item):
-    # define the fields for your item here like:
-    # name = scrapy.Field()
-    pass
+    date = scrapy.Field()
+    title = scrapy.Field()
+    url = scrapy.Field()
exa/exa/middlewares.py
@@ -4,53 +4,44 @@
 #
 # See documentation in:
 # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
+import time
+
 from scrapy import signals
+from scrapy.http import HtmlResponse
+from selenium import webdriver
 
 
-class ExaSpiderMiddleware(object):
+class SeleniumDownloadMiddleware(object):
     # Not all methods need to be defined. If a method is not defined,
     # scrapy acts as if the spider middleware does not modify the
     # passed objects.
 
     @classmethod
     def from_crawler(cls, crawler):
         # This method is used by Scrapy to create your spiders.
-        s = cls()
-        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
-        return s
-
-    def process_spider_input(self, response, spider):
-        # Called for each response that goes through the spider
-        # middleware and into the spider.
-        # Should return None or raise an exception.
-        return None
-
-    def process_spider_output(self, response, result, spider):
-        # Called with the results returned from the Spider, after
-        # it has processed the response.
-        # Must return an iterable of Request, dict or Item objects.
-        for i in result:
-            yield i
-
-    def process_spider_exception(self, response, exception, spider):
-        # Called when a spider or process_spider_input() method
-        # (from other spider middleware) raises an exception.
-        # Should return either None or an iterable of Response, dict
-        # or Item objects.
-        pass
-
-    def process_start_requests(self, start_requests, spider):
-        # Called with the start requests of the spider, and works
-        # similarly to the process_spider_output() method, except
-        # that it doesn't have a response associated.
-        # Must return only requests (not items).
-        for r in start_requests:
-            yield r
+        middleware = cls()
+        crawler.signals.connect(middleware.spider_opened, signals.spider_opened)
+        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
+        return middleware
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+        self.driver = webdriver.Chrome()
+        self.driver.maximize_window()
+
+    def spider_closed(self, spider):
+        if self.driver:
+            self.driver.quit()
+        else:
+            print('Driver closed by exception or error')
+
+    def process_request(self, request, spider):
+        self.driver.set_page_load_timeout(60)
+        try:
+            self.driver.get(request.url)
+            time.sleep(4)
+        except BaseException as e:
+            print('Exception in process loading page')
+            return None
+        body = str.encode(self.driver.page_source)
+        return HtmlResponse(self.driver.current_url, body=body, encoding='utf-8', request=request)
\ No newline at end of file
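Because process_request() returns an HtmlResponse, Scrapy skips its own downloader for that request and the spider receives the page exactly as Chrome rendered it. One caveat: webdriver.Chrome() as written opens a visible browser window. A minimal sketch of a display-less variant (my assumption, not part of this commit; ChromeOptions is selenium's public API, and the --headless flag needs Chrome 59+):

    # Sketch only: the same driver setup, but with Chrome started headless.
    from selenium import webdriver

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--window-size=1920,1080')   # stand-in for maximize_window()
    driver = webdriver.Chrome(chrome_options=options)  # selenium 3.x keyword argument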
exa/exa/pipelines.py
@@ -4,8 +4,13 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import csv
 
 
 class ExaPipeline(object):
     def process_item(self, item, spider):
+        with open('out.csv', 'w', newline='') as csvfile:
+            writer = csv.writer(csvfile, delimiter=' ')
+            writer.writerow(str(item))
         return item
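Note that process_item() reopens out.csv in 'w' mode for every item, so each write truncates the file and only the last scraped item survives; writerow(str(item)) also splits the string into one column per character. A sketch of the usual fix (my correction, not what this commit contains), using Scrapy's open_spider/close_spider pipeline hooks to keep one file handle for the whole crawl:

    import csv

    class ExaPipeline(object):
        def open_spider(self, spider):
            # Open once per crawl instead of once per item.
            self.csvfile = open('out.csv', 'w', newline='')
            self.writer = csv.writer(self.csvfile)

        def process_item(self, item, spider):
            # Write the fields explicitly, one row per item.
            self.writer.writerow([item.get('date'), item.get('title'), item.get('url')])
            return item

        def close_spider(self, spider):
            self.csvfile.close()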
exa/exa/settings.py
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'exa.spiders'
 #USER_AGENT = 'exa (+http://www.yourdomain.com)'
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 3
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16
@@ -39,10 +39,13 @@ ROBOTSTXT_OBEY = True
 #TELNETCONSOLE_ENABLED = False
 
 # Override the default request headers:
-#DEFAULT_REQUEST_HEADERS = {
-#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-#   'Accept-Language': 'en',
-#}
+DEFAULT_REQUEST_HEADERS = {
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+    'Accept-Language': 'en',
+    'Accept-Encoding': 'gzip, deflate, sdch',
+    'Connection': 'keep-alive',
+    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
+}
 
 # Enable or disable spider middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
@@ -52,9 +55,9 @@ ROBOTSTXT_OBEY = True
 # Enable or disable downloader middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-#DOWNLOADER_MIDDLEWARES = {
-#    'exa.middlewares.MyCustomDownloaderMiddleware': 543,
-#}
+DOWNLOADER_MIDDLEWARES = {
+    'exa.middlewares.SeleniumDownloadMiddleware': 543,
+}
 
 # Enable or disable extensions
 # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
@@ -64,9 +67,9 @@ ROBOTSTXT_OBEY = True
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'exa.pipelines.ExaPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'exa.pipelines.ExaPipeline': 300,
+}
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
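With priority 543, SeleniumDownloadMiddleware simply takes the slot the commented-out template reserved for a custom downloader middleware. In Scrapy, lower DOWNLOADER_MIDDLEWARES values sit closer to the engine, and because this middleware's process_request() returns a response itself, middlewares with higher numbers never see the request at all.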
exa/exa/spiders/exa_news.py
 # -*- coding: utf-8 -*-
 import scrapy
 
+from ..items import ExaItem
 
 
 class ExaNewsSpider(scrapy.Spider):
     name = "exa_news"
     allowed_domains = ["https://www.crunchbase.com/organization/sense-ly/press/"]
-    start_urls = ['http://https://www.crunchbase.com/organization/sense-ly/press//']
+    start_urls = ['https://www.crunchbase.com/organization/sense-ly/press/']
 
     def parse(self, response):
-        pass
+        rows = response.xpath("..//table/tbody/tr")
+        for i in rows:
+            item = ExaItem()
+            item['date'] = i.xpath("//td[contains(@class, 'date')]/text()").extract_first()
+            item['title'] = i.xpath("//td/a/text()").extract_first()
+            item['url'] = i.xpath("//td/a/@href").extract_first()
+            yield item
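A detail that would bite here: inside the loop, the "//td…" expressions are absolute, so each i.xpath("//td…") searches the whole document rather than the current row, and extract_first() returns the first matching cell for every iteration; all yielded items come out identical. A sketch of the relative-path form (my correction, not what this commit does):

    def parse(self, response):
        # ".//" keeps each query scoped to the current <tr> row.
        for row in response.xpath("//table/tbody/tr"):
            item = ExaItem()
            item['date'] = row.xpath(".//td[contains(@class, 'date')]/text()").extract_first()
            item['title'] = row.xpath(".//td/a/text()").extract_first()
            item['url'] = row.xpath(".//td/a/@href").extract_first()
            yield item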
requirements.txt
+Scrapy==1.3.3
+selenium==3.4.1
+dateparser==0.6.0
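Two notes on the pins: selenium==3.4.1 alone is not enough to drive the browser, since webdriver.Chrome() also needs a chromedriver binary on PATH that matches the installed Chrome; and dateparser==0.6.0 is added here without being imported by any of the changed files, presumably in anticipation of parsing the scraped date strings.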