Commit 9e878ace
authored May 18, 2017 by Vasyl Bodnaruk, committed May 18, 2017 by Andrii Marynets

added database wrapper

parent 44e27193

Showing 11 changed files with 117 additions and 35 deletions (+117 −35)
exa/__init__.py                      +0   −0
exa/exa/__init__.py                  +2   −0
exa/exa/helpers/__init__.py          +2   −0
exa/exa/helpers/company_maker.py     +21  −0
exa/exa/helpers/db.py                +41  −0
exa/exa/items.py                     +3   −1
exa/exa/middlewares.py               +0   −1
exa/exa/pipelines.py                 +10  −6
exa/exa/settings.py                  +6   −17
exa/exa/spiders/mobihealthnews.py    +21  −10
exa/exa/spiders/techcrunch.py        +11  −0
exa/__init__.py  0 → 100644  (new, empty file)
exa/exa/__init__.py

+from .helpers.company_maker import CompanyMaker
+from .items import ExaItem
\ No newline at end of file
exa/exa/helpers/__init__.py  0 → 100644  (new file)

from .company_maker import CompanyMaker
from .db import Database
\ No newline at end of file
exa/exa/helpers/company_maker.py  0 → 100644  (new file)

from collections import namedtuple

Company = namedtuple('Company', 'url, media_id, type_id, region_id')

# 'http://www.mobihealthnews.com/tag/apple',
# 'http://www.mobihealthnews.com/tag/clover-health'


class CompanyMaker:
    def __init__(self, companies=None):
        # The `companies` argument is accepted but currently unused;
        # the source list is hard-coded to a single tag page.
        self.in_site = ['http://www.mobihealthnews.com/tag/MedTronic']
        self.companies = list()

    def make_companies(self):
        self._make_list()

    def get_companies(self):
        return self.companies

    def _make_list(self):
        # Wrap each tag URL in a Company tuple with fixed
        # media_id=43, type_id=1, region_id=2.
        for i in self.in_site:
            self.companies.append(Company(i, 43, 1, 2))
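For reference, a minimal usage sketch of the new helper (illustrative, not part of the commit; it assumes the project root exa/ is on the import path):

from exa.helpers import CompanyMaker

maker = CompanyMaker()
maker.make_companies()
for company in maker.get_companies():
    # Each entry is a Company namedtuple: url, media_id, type_id, region_id
    print(company.url, company.media_id, company.type_id, company.region_id)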
exa/exa/helpers/db.py  0 → 100644  (new file)

from collections import namedtuple

import MySQLdb

Entity = namedtuple('Entity', 'id, name, country')


class Database:
    """Thin wrapper around a MySQLdb connection with autocommit enabled."""

    def __init__(self, host, user, pwd, database):
        self.host = host
        self.user = user
        self.pwd = pwd
        self.database = database
        self.db = MySQLdb.connect(self.host, self.user, self.pwd, self.database)
        self.db.autocommit(True)
        self.cursor = self.db.cursor()

    def __del__(self):
        self.db.close()

    def insert(self, item):
        try:
            self.cursor.execute(item)
            return self.cursor.lastrowid
        except MySQLdb.Error:
            # roll back on the connection; cursors have no rollback()
            self.db.rollback()

    def select(self, item):
        try:
            self.cursor.execute(item)
            result = list()
            for i in self.cursor.fetchall():
                result.append(Entity(i[0], i[1], i[2]))
            return result
        except MySQLdb.Error:
            self.db.rollback()


if __name__ == '__main__':
    select = 'SELECT id, name, country FROM wp_esi_entity WHERE id < 10'
    db = Database('localhost', 'root', 'andrew', 'esi')
    rows = db.select(select)
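Both methods execute a fully formed SQL string, so every caller must escape values itself. A hedged sketch (a hypothetical addition, not in this commit) could delegate quoting to MySQLdb's parameterized execute():

    # Hypothetical Database method: MySQLdb substitutes and escapes each value.
    def insert_params(self, query, params=None):
        try:
            self.cursor.execute(query, params)  # query uses %s placeholders
            return self.cursor.lastrowid
        except MySQLdb.Error:
            self.db.rollback()

Usage would look like db.insert_params("INSERT INTO wp_esi_entity (name, country) VALUES (%s, %s)", ('Acme', 'US')) — table and columns here are illustrative, taken from the select above.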
exa/exa/items.py

@@ -10,6 +10,8 @@ import scrapy
 class ExaItem(scrapy.Item):
     date = scrapy.Field()
-    media = scrapy.Field()
+    media_id = scrapy.Field()
     title = scrapy.Field()
     url = scrapy.Field()
+    region_id = scrapy.Field()
+    type_id = scrapy.Field()
exa/exa/middlewares.py

@@ -39,7 +39,6 @@ class SeleniumDownloadMiddleware(object):
         try:
             self.driver.get(request.url)
         except BaseException as e:
             print('Exception in process loading page')
-            return None
exa/exa/pipelines.py

@@ -4,22 +4,26 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 import csv
 from datetime import datetime
+from .helpers.db import Database
+from scrapy.utils.project import get_project_settings
+
+db = get_project_settings().get('DB')


 class ExaPipeline(object):

     def __init__(self):
-        self.out = open('out.csv', 'w', newline='\n')
+        self.out = open('out{}.txt'.format(datetime.now()), 'w', newline='\n')
+        self.db = Database(**db)
         super(ExaPipeline, self).__init__()

     def __del__(self):
         self.out.close()

     def process_item(self, item, spider):
-        s = """INSERT INTO wp_esi_news (title, URL, media_id, type_id, region_id, publish_date)
-        VALUES('{0}', '{1}', '{2}', {3}, {4}, '{5}')\n""".format(item['title'], item['url'], item['media'], 1, 3, item['date'])
+        s = "INSERT INTO wp_esi_news (title, URL, media_id, type_id, region_id, publish_date) VALUES('{0}', '{1}', {2}, {3}, {4}, '{5}')\n".format(item['title'], item['url'], item['media_id'], item['type_id'], item['region_id'], item['date'])
         self.out.write(s)
         return item
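Because the INSERT is assembled with str.format, a title containing a single quote breaks the statement and invites SQL injection. A parameterized variant of the write, assuming the hypothetical insert_params sketched for db.py above, might look like:

        # Sketch only: %s placeholders let MySQLdb quote each value safely.
        query = ("INSERT INTO wp_esi_news "
                 "(title, URL, media_id, type_id, region_id, publish_date) "
                 "VALUES (%s, %s, %s, %s, %s, %s)")
        params = (item['title'], item['url'], item['media_id'],
                  item['type_id'], item['region_id'], item['date'])
        self.db.insert_params(query, params)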
exa/exa/settings.py

@@ -96,20 +96,9 @@ ITEM_PIPELINES = {
 # HTTPCACHE_IGNORE_HTTP_CODES = []
 # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
 RETRY_TIMES = 10
 RETRY_HTTP_CODES = [500, 503, 504, 416, 400, 403, 404, 408]
-# Proxy list containing entries like
-# http://host1:port
-# http://username:password@host2:port
-# http://host3:port
-# ...
-PROXY_LIST = '/home/andrii/work/exa/proxy_1000.txt'
-# Proxy mode
-# 0 = Every requests have different proxy
-# 1 = Take only one proxy from the list and assign it to every requests
-# 2 = Put a custom proxy to use in the settings
-PROXY_MODE = 0
-# If proxy mode is 2 uncomment this sentence :
-#CUSTOM_PROXY = "http://host1:port"
+DB = {'host': 'localhost', 'user': 'root', 'pwd': 'andrew', 'database': 'esi'}
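The DB dict is unpacked straight into the wrapper's constructor (Database(**db) in pipelines.py), so its keys must match the parameter names host, user, pwd, database. Committing root credentials is risky once the repo is shared; a hedged alternative (not in this commit) reads them from the environment, falling back to defaults:

import os

# Hypothetical: configure credentials via DB_HOST/DB_USER/DB_PWD/DB_NAME.
DB = {
    'host': os.environ.get('DB_HOST', 'localhost'),
    'user': os.environ.get('DB_USER', 'root'),
    'pwd': os.environ.get('DB_PWD', ''),
    'database': os.environ.get('DB_NAME', 'esi'),
}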
exa/exa/spiders/exa_news.py → exa/exa/spiders/mobihealthnews.py  (renamed)
 # -*- coding: utf-8 -*-
-import scrapy
+import dateparser
+import scrapy

+from ..helpers import CompanyMaker
 from ..items import ExaItem


-class ExaNewsSpider(scrapy.Spider):
+class MobiHealthNewsSpider(scrapy.Spider):
     name = "mobihealthnews"
     allowed_domains = ["www.mobihealthnews.com"]
     start_urls = ['http://www.mobihealthnews.com/tag/MedTronic']

+    def start_requests(self):
+        companies = CompanyMaker()
+        companies.make_companies()
+        for i in companies.get_companies():
+            yield scrapy.Request(i.url, callback=self.parse,
+                                 meta={'type_id': i.type_id, 'region_id': i.region_id, 'media_id': i.media_id})

     def parse(self, response):
-        # try:
+        try:
             rows = response.xpath("..//div[contains(@class, 'group-left')]//div[contains(@class, 'views-row')]")
             for i in rows:
                 item = ExaItem()
                 item['date'] = dateparser.parse(i.xpath(".//span/span[contains(@class, 'day_list')]/text()").extract_first()).date()
-                item['media'] = 'mobihealthnews'
-                item['title'] = i.xpath("..//span/a/text()").extract_first()
+                item['media_id'] = response.meta['media_id']
+                item['title'] = i.xpath(".//span/a/text()").extract_first()
                 item['url'] = 'http://www.mobihealthnews.com' + i.xpath(".//span/a/@href").extract_first()
+                item['region_id'] = response.meta['region_id']
+                item['type_id'] = response.meta['type_id']
                 yield item

             has_next = response.xpath("..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/text()").extract_first()
             next_url = 'http://www.mobihealthnews.com' + response.xpath("..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/@href").extract_first()
             print(has_next, next_url)
             if has_next:
-                pass
-                # yield scrapy.Request(next_url, callback=self.parse)
+                yield scrapy.Request(next_url, callback=self.parse,
+                                     meta={'type_id': 1, 'region_id': 2, 'media_id': 43})
-        # except BaseException:
-        #     print('We had error')
+        except BaseException:
+            print('We had error')
\ No newline at end of file
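The spider leans on dateparser to turn the free-form day string from the tag listing into a date object; note that if extract_first() finds no match, dateparser.parse(None) raises, and the broad except swallows it silently. An illustrative call (example values, not scraped output):

import dateparser

parsed = dateparser.parse('May 18, 2017')  # returns a datetime.datetime
print(parsed.date())                       # 2017-05-18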
exa/exa/spiders/techcrunch.py  0 → 100644  (new file)

# -*- coding: utf-8 -*-
import scrapy


class TechcrunchSpider(scrapy.Spider):
    name = "techcrunch"
    allowed_domains = ["techcrunch.com"]
    start_urls = ['https://techcrunch.com/search/Behold.ai#stq=Behold.ai/']

    def parse(self, response):
        # Stub: parsing logic is not implemented in this commit.
        pass