Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
E
esi-table-data
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
esi-data-scrapping
esi-table-data
Commits
50b27c03
Commit
50b27c03
authored
Jul 10, 2017
by
Vasyl Bodnaruk
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add mixin to spider
parent
5a897b66
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
8 additions
and
40 deletions
+8
-40
aitop.py
exa/exa/spiders/aitop.py
+4
-14
mobihealthnews.py
exa/exa/spiders/mobihealthnews.py
+2
-11
techcrunch.py
exa/exa/spiders/techcrunch.py
+2
-15
No files found.
exa/exa/spiders/aitop.py
View file @
50b27c03
...
...
@@ -2,31 +2,23 @@
import
scrapy
import
dateparser
from
scrapy.utils.project
import
get_project_settings
from
..helpers
import
CompanyMaker
,
Database
from
..helpers
import
CompanyMaker
,
Database
,
QueryMixin
from
..items
import
ExaItem
db_settings
=
get_project_settings
()
.
get
(
'DB'
)
db
=
Database
(
**
db_settings
)
class
AitopSpider
(
scrapy
.
Spider
):
class
AitopSpider
(
QueryMixin
,
scrapy
.
Spider
):
name
=
"aitop"
allowed_domains
=
[
"aitopics.org"
]
def
__init__
(
self
,
*
args
,
**
kwargs
):
self
.
condition
=
kwargs
.
get
(
'query'
)
self
.
query
=
"SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=13"
if
self
.
condition
:
print
(
self
.
condition
)
self
.
query
+=
' or {}'
.
format
(
self
.
condition
)
super
(
AitopSpider
,
self
)
.
__init__
()
def
start_requests
(
self
):
companies
=
CompanyMaker
(
db
.
select
(
self
.
query
))
companies
.
make_companies
(
self
.
name
)
for
i
in
companies
.
get_companies
():
try
:
yield
scrapy
.
Request
(
i
.
url
+
'/'
,
callback
=
self
.
parse
,
meta
=
{
'company'
:
i
,
'post_id'
:
0
})
yield
scrapy
.
Request
(
i
.
url
,
callback
=
self
.
parse
,
meta
=
{
'company'
:
i
,
'post_id'
:
0
})
except
:
pass
...
...
@@ -102,6 +94,4 @@ class AitopSpider(scrapy.Spider):
else
:
return
None
def
get_common_items
(
self
,
company
):
return
{
'region_id'
:
company
.
region_id
,
'type_id'
:
company
.
type_id
,
'media_id'
:
company
.
media_id
,
'company_id'
:
company
.
id
}
\ No newline at end of file
exa/exa/spiders/mobihealthnews.py
View file @
50b27c03
...
...
@@ -3,7 +3,7 @@ import dateparser
import
scrapy
import
traceback
from
scrapy.utils.project
import
get_project_settings
from
..helpers
import
CompanyMaker
,
Database
from
..helpers
import
CompanyMaker
,
Database
,
QueryMixin
from
..items
import
ExaItem
...
...
@@ -11,19 +11,10 @@ db_settings = get_project_settings().get('DB')
db
=
Database
(
**
db_settings
)
class
MobiHealthNewsSpider
(
scrapy
.
Spider
):
class
MobiHealthNewsSpider
(
QueryMixin
,
scrapy
.
Spider
):
name
=
"mhn"
allowed_domains
=
[
"www.mobihealthnews.com"
]
def
__init__
(
self
,
*
args
,
**
kwargs
):
self
.
condition
=
kwargs
.
get
(
'query'
)
self
.
query
=
"SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=3"
if
self
.
condition
:
print
(
self
.
condition
)
self
.
query
+=
' or {}'
.
format
(
self
.
condition
)
print
(
self
.
query
)
super
(
MobiHealthNewsSpider
,
self
)
.
__init__
()
def
start_requests
(
self
):
companies
=
CompanyMaker
(
db
.
select
(
self
.
query
))
companies
.
make_companies
(
self
.
name
)
...
...
exa/exa/spiders/techcrunch.py
View file @
50b27c03
...
...
@@ -2,7 +2,7 @@
import
scrapy
import
traceback
from
scrapy.utils.project
import
get_project_settings
from
..helpers
import
CompanyMaker
,
Database
from
..helpers
import
CompanyMaker
,
Database
,
QueryMixin
from
..items
import
ExaItem
...
...
@@ -10,18 +10,10 @@ db_settings = get_project_settings().get('DB')
db
=
Database
(
**
db_settings
)
class
TechcrunchSpider
(
scrapy
.
Spider
):
class
TechcrunchSpider
(
QueryMixin
,
scrapy
.
Spider
):
name
=
"tc"
allowed_domains
=
[
"techcrunch.com"
]
def
__init__
(
self
,
*
args
,
**
kwargs
):
self
.
condition
=
kwargs
.
get
(
'query'
)
self
.
query
=
"SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=13"
if
self
.
condition
:
print
(
self
.
condition
)
self
.
query
+=
' or {}'
.
format
(
self
.
condition
)
super
(
TechcrunchSpider
,
self
)
.
__init__
()
def
start_requests
(
self
):
companies
=
CompanyMaker
(
db
.
select
(
self
.
query
))
companies
.
make_companies
(
self
.
name
)
...
...
@@ -32,16 +24,11 @@ class TechcrunchSpider(scrapy.Spider):
pass
def
parse
(
self
,
response
):
print
(
response
.
request
.
headers
)
if
'tag'
in
response
.
url
:
return
self
.
parse_tag
(
response
)
if
'company'
in
response
.
url
:
return
self
.
parse_company
(
response
)
def
get_common_items
(
self
,
company
):
return
{
'region_id'
:
company
.
region_id
,
'type_id'
:
company
.
type_id
,
'media_id'
:
company
.
media_id
,
'company_id'
:
company
.
id
}
def
parse_tag
(
self
,
response
):
try
:
news_list
=
response
.
xpath
(
"..//div[contains(@class, 'block block-thumb ')]"
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment