Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
E
esi-table-data
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
esi-data-scrapping
esi-table-data
Commits
7dad524c
Commit
7dad524c
authored
Aug 01, 2017
by
Vasyl Bodnaruk
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add custom settings to CB spider
parent
dd3fcca4
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
60 additions
and
0 deletions
+60
-0
cb.py
exa/exa/spiders/cb.py
+11
-0
update.py
exa/update.py
+49
-0
No files found.
exa/exa/spiders/cb.py
View file @
7dad524c
# -*- coding: utf-8 -*-
import
random
import
scrapy
from
.base
import
BaseSpider
...
...
@@ -11,11 +12,19 @@ class CbSpider(BaseSpider):
name = "cb"
allowed_domains = ["www.crunchbase.com"]
# start_urls = ['http://www.crunchbase.com/organization/sense-ly/press/']

# Per-spider Scrapy settings: heavy throttling, presumably because
# crunchbase.com rate-limits aggressively.
custom_settings = {
    'DOWNLOAD_DELAY': 15,
    'CONCURRENT_REQUESTS': 2,
    'CONCURRENT_REQUESTS_PER_DOMAIN': 2,
    # BUG FIX: the original placed the middleware path directly into
    # custom_settings ('exa.middlewares.SeleniumDownloadMiddleware': 543),
    # where Scrapy silently ignores it — a middleware must be registered
    # under the DOWNLOADER_MIDDLEWARES setting to be enabled.
    'DOWNLOADER_MIDDLEWARES': {
        'exa.middlewares.SeleniumDownloadMiddleware': 543,
    },
}

# Running counter of parsed items (incremented and printed in parse()).
co = 0
def start_requests(self):
    """Yield one crawl request per company registered for this spider.

    Iterates ``self.companies(self.name)``; each company's URL is requested
    with ``parse`` as the callback, and the company object plus a post_id
    counter travel in the request meta.
    """
    for company in self.companies(self.name):
        print(company)  # progress trace
        try:
            # NOTE(review): Scrapy snapshots settings when the crawler is
            # initialised, so mutating custom_settings at runtime most
            # likely does NOT change the live download delay — confirm;
            # RANDOMIZE_DOWNLOAD_DELAY is the supported way to jitter it.
            self.custom_settings['DOWNLOAD_DELAY'] = (
                random.random() * random.randint(1, 15)
            )
            yield scrapy.Request(
                company.url,
                callback=self.parse,
                meta={'company': company, 'post_id': 0},
            )
        except Exception as exc:
            # BUG FIX: was a bare `except: pass`, which also swallowed
            # KeyboardInterrupt/SystemExit and hid every failure. Keep the
            # best-effort skip behaviour but surface the error.
            print('start_requests: skipped {!r}: {!r}'.format(company, exc))
...
...
@@ -31,6 +40,8 @@ class CbSpider(BaseSpider):
item
.
update
(
self
.
get_common_items
(
company
))
item
[
'media_id'
]
=
self
.
_get_media
(
i
)
print
(
item
)
self
.
co
+=
1
print
(
self
.
co
)
if
len
(
rows
)
!=
0
:
yield
scrapy
.
Request
(
self
.
_next_url
(
response
.
url
),
callback
=
self
.
parse
,
meta
=
response
.
meta
)
...
...
exa/update.py
0 → 100644
View file @
7dad524c
import json
import traceback

from newspaper import Article
from scrapy.utils.project import get_project_settings

from exa.esi_news_classification.news_classify_tag import Classifier
from exa.helpers import Database
# Database credentials read from the Scrapy project settings ('DB' key);
# unpacked into Database(**db_cred) by NewsUpdater below.
db_cred = get_project_settings().get('DB')
class NewsUpdater:
    """Re-classifies stored news articles and writes the tags back to the DB.

    Pipeline per article: select (id, url) rows, download and parse the
    article text, classify it, then UPDATE the row's ``tags_id`` column
    with a JSON-encoded list of tag ids.
    """

    def __init__(self):
        # Train the classifier once up front; it is reused for every article.
        self.classifier = Classifier()
        self.classifier.teach_model()
        self.db = Database(**db_cred)

    def select_news(self, query):
        """Run *query* (a SELECT) and return the resulting rows."""
        return self.db.select(query)

    def load_text(self, url):
        """Download the article at *url* and return its extracted plain text."""
        article = Article(url)
        article.download()
        article.parse()
        return article.text

    def get_tags(self, text):
        """Classify *text* and return matched tag ids as a JSON array string.

        After ``classify()`` runs, the classifier exposes its matches on
        ``teg_accordance`` (sic); each entry's first element is the tag id.
        """
        self.classifier.classify(text)
        tags = [match[0] for match in self.classifier.teg_accordance]
        return json.dumps(tags)

    def update_news(self, query):
        """Execute *query* (an UPDATE) against the database."""
        self.db.update(query)

    def update_all(self,
                   query='select id, url from wp_esi_news_accept where id> 80 and id<100'):
        """Tag every article returned by *query* (rows shaped (id, url, ...)).

        The default query preserves the original hard-coded sample range
        (ids 81..99); pass a different SELECT to process other rows.
        Failures on individual articles are logged and skipped.
        """
        for row in self.select_news(query):
            news_id, url = row[0], row[1]
            try:
                text = self.load_text(url)
                tags = self.get_tags(text)
                # WARNING: string-built SQL — switch to parameterized
                # queries if Database supports them. `tags` comes from our
                # own classifier, so the risk is low but non-zero.
                self.update_news(
                    'update wp_esi_news_accept set tags_id="{}" where id={}'
                    .format(tags, news_id))
                print('News id={} was updated'.format(news_id))
            except Exception:
                # BUG FIX: the original called e.with_traceback() with no
                # arguments, which itself raises TypeError and hid the real
                # error. Also narrowed BaseException -> Exception so Ctrl-C
                # still interrupts the run.
                traceback.print_exc()
# Script entry point: build the updater (trains the classifier, connects to
# the DB) and re-tag the default sample range of news rows.
if __name__ == '__main__':
    NewsUpdater().update_all()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment