Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
E
esi-table-data
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
esi-data-scrapping
esi-table-data
Commits
c3c8d3c2
Commit
c3c8d3c2
authored
Aug 01, 2017
by
Vasyl Bodnaruk
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add text file for exists
parent
280e6936
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
22 additions
and
9 deletions
+22
-9
pipelines.py
exa/exa/pipelines.py
+10
-6
update.py
exa/update.py
+12
-3
No files found.
exa/exa/pipelines.py
View file @
c3c8d3c2
...
...
@@ -45,13 +45,14 @@ class ExaPipeline(object):
return
item
def
insert_news
(
self
,
item
):
tags
=
self
.
get_tags
(
item
[
'url'
])
article
=
self
.
get_article
(
item
[
'url'
])
tags
=
self
.
get_tags
(
article
)
data
=
(
item
[
'title'
],
item
[
'description'
],
item
[
'url'
],
item
[
'media_id'
],
item
[
'type_id'
],
item
[
'region_id'
],
item
[
'post_id'
],
item
[
'date'
],
datetime
.
now
()
.
date
(),
item
[
'company_id'
],
0
,
item
[
'tags'
],
tags
)
item
[
'tags'
],
tags
,
article
)
query
=
"""INSERT INTO wp_esi_news_accept (title, description, URL, media_id, type_id, region_id, post_id,
publish_date, record_date, company_id, is_accepted, temp_tags, tags_id)
VALUES(
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s);
\n
"""
publish_date, record_date, company_id, is_accepted, temp_tags, tags_id
, text
)
VALUES(
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s
,
%
s
);
\n
"""
self
.
db
.
insert
(
query
,
data
)
self
.
add_url_to_block
(
item
[
'url'
])
...
...
@@ -66,11 +67,14 @@ class ExaPipeline(object):
def add_url_to_block(self, url):
    """Store *url* in ``self.buffer`` with the value ``True``.

    NOTE(review): judging by the name this marks the URL as already
    handled so it is skipped later -- confirm against the readers of
    ``self.buffer``.
    """
    seen_urls = self.buffer
    seen_urls.set(url, True)
def
get_
tags
(
self
,
url
):
def
get_
article
(
self
,
url
):
article
=
Article
(
url
)
article
.
download
()
article
.
parse
()
self
.
classifier
.
classify
(
article
.
text
)
return
article
.
text
def
get_tags
(
self
,
text
):
self
.
classifier
.
classify
(
text
)
tags
=
list
()
for
i
in
self
.
classifier
.
teg_accordance
:
tags
.
append
(
i
[
0
])
...
...
exa/update.py
View file @
c3c8d3c2
...
...
@@ -33,8 +33,8 @@ class NewsUpdater:
self
.
db
.
update
(
query
)
# this is a bad way
def
update_all
(
self
):
for
i
in
self
.
select_news
(
'select id, url from wp_esi_news_accept where
id> 80 and id<100
'
):
def
update_all
_tags
(
self
):
for
i
in
self
.
select_news
(
'select id, url from wp_esi_news_accept where
1
'
):
try
:
text
=
self
.
load_text
(
i
[
1
])
tags
=
self
.
get_tags
(
text
)
...
...
@@ -43,7 +43,16 @@ class NewsUpdater:
except
BaseException
as
e
:
print
(
e
.
with_traceback
())
def update_all_text(self, min_id=26500):
    """Reload article text and store it in ``wp_esi_news_accept.text``.

    Selects ``id, url`` for every row whose id is greater than *min_id*,
    loads the text for each URL with ``self.load_text`` and writes it back
    through ``self.update_news``, printing one progress line per row.

    Args:
        min_id: Only rows with ``id > min_id`` are processed. Defaults to
            26500, the threshold that used to be hard-coded in the query.

    Any exception raised while loading or updating a row propagates and
    stops the run.
    """
    select_query = 'select id, url from wp_esi_news_accept where id>{}'.format(
        int(min_id))
    for row in self.select_news(select_query):
        news_id, url = row[0], row[1]
        text = self.load_text(url)

        # Drop non-ASCII characters, then decode back to text. Calling
        # str() on the encoded bytes would store the repr ("b'...'")
        # instead of the article body under Python 3.
        ascii_text = text.encode('ascii', 'ignore').decode('ascii')

        # The text is scraped from the web and ends up inside a
        # double-quoted SQL literal, so escape backslashes first and then
        # double quotes.
        # NOTE(review): if update_news / the db layer accepts bound
        # parameters, pass the text as a parameter instead of escaping here.
        escaped_text = ascii_text.replace('\\', '\\\\').replace('"', '\\"')

        self.update_news(
            'update wp_esi_news_accept set text="{}" where id={}'.format(
                escaped_text, news_id))
        print('News id={} was updated'.format(news_id))
if
__name__
==
'__main__'
:
ml
=
NewsUpdater
()
ml
.
update_all
()
ml
.
update_all
_text
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment