Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
E
esi-table-data
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
esi-data-scrapping
esi-table-data
Commits
98bf7249
Commit
98bf7249
authored
Jul 20, 2017
by
Vasyl Bodnaruk
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Change saving news to DB
parent
76f763cb
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
20 additions
and
7 deletions
+20
-7
pipelines.py
exa/exa/pipelines.py
+17
-7
base.py
exa/exa/spiders/base.py
+3
-0
No files found.
exa/exa/pipelines.py
View file @
98bf7249
...
...
@@ -17,25 +17,35 @@ class ExaPipeline(object):
self
.
urls
=
{
i
[
0
]
for
i
in
self
.
db
.
select
(
'select url from wp_esi_news_accept'
)}
super
(
ExaPipeline
,
self
)
.
__init__
()
def open_spider(self, spider):
    """Store a reference to this pipeline on the spider as ``spider.pipeline``.

    :param spider: object that receives the ``pipeline`` attribute
    """
    setattr(spider, 'pipeline', self)
def process_item(self, item, spider):
    """Flatten the scraped text fields of ``item`` into single strings.

    * ``title`` is always joined and has newlines replaced with spaces.
    * ``description`` gets the same treatment, but only when it is truthy.
    * ``tags`` (when truthy) are joined with commas, then newlines and tabs
      are stripped out.

    :param item: mapping with ``title``, ``description`` and ``tags`` keys
    :param spider: unused here
    :return: the same ``item``, modified in place
    """
    title_parts = item['title']
    item['title'] = ''.join(title_parts).replace('\n', ' ')

    description_parts = item['description']
    if description_parts:
        item['description'] = ''.join(description_parts).replace('\n', ' ')

    tag_parts = item['tags']
    if tag_parts:
        joined_tags = ','.join(tag_parts)
        item['tags'] = joined_tags.replace('\n', '').replace('\t', '')

    return item
def insert_news(self, item):
    """Insert ``item`` into ``wp_esi_news_accept`` unless its URL is already known.

    The URL is checked with ``self.check_url``. A known URL is only reported
    (printed as ``DUPLICATE``); otherwise the item is printed as ``UNIQUE``,
    the row is written through ``self.db.insert`` and the URL is added to
    ``self.urls`` so later items with the same URL are treated as duplicates.

    :param item: mapping with ``title``, ``description``, ``url``,
        ``media_id``, ``type_id``, ``region_id``, ``post_id``, ``date``,
        ``company_id`` and ``tags`` keys
    :return: the same ``item``
    :raises KeyError: if a plain-dict ``item`` lacks one of the keys above
        (the row tuple is built before the duplicate check)
    """
    # Values are listed in the same order as the column list in the query.
    data = (
        item['title'],
        item['description'],
        item['url'],
        item['media_id'],
        item['type_id'],
        item['region_id'],
        item['post_id'],
        item['date'],             # publish_date
        datetime.now().date(),    # record_date
        item['company_id'],
        0,                        # is_accepted
        item['tags'],             # temp_tags
    )
    query = """INSERT INTO wp_esi_news_accept (title, description, URL, media_id, type_id, region_id, post_id,
                  publish_date, record_date, company_id, is_accepted, temp_tags) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);\n"""

    if self.check_url(item['url']):
        print("DUPLICATE", item)
    else:
        print("UNIQUE", item)
        self.db.insert(query, data)
        self.urls.add(item['url'])
    return item
def check_url(self, url):
    """Return ``True`` when ``url`` is already present in ``self.urls``.

    :param url: URL to look up
    :return: ``True`` if the URL is in ``self.urls``, otherwise ``False``
    """
    # The membership test already yields a bool; no if/else wrapper needed.
    return url in self.urls
def _insert_news_entiry(self, news, entity):
    """Insert one ``(news_id, entity_id)`` row into ``wp_esi_news_entity``.

    :param news: value bound to the ``news_id`` column
    :param entity: value bound to the ``entity_id`` column
    """
    row = (news, entity)
    self.db.insert(
        'INSERT INTO wp_esi_news_entity (news_id, entity_id) VALUES(%s, %s)',
        row,
    )
exa/exa/spiders/base.py
View file @
98bf7249
...
...
@@ -23,3 +23,6 @@ class BaseSpider(scrapy.Spider):
companies
=
CompanyMaker
(
db
.
select
(
self
.
query
))
companies
.
make_companies
(
name
)
return
companies
.
get_companies
()
def check_buffer(self, url):
    """Placeholder: takes a URL, does nothing and returns ``None``.

    :param url: currently ignored
    """
    return None
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment