Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
E
esi-table-data
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
esi-data-scrapping
esi-table-data
Commits
eed7af2e
Commit
eed7af2e
authored
Jun 09, 2017
by
Vasyl Bodnaruk
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
changed module DB for reuse
parent
41acec84
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
15 additions
and
12 deletions
+15
-12
company_maker.py
exa/exa/helpers/company_maker.py
+6
-1
db.py
exa/exa/helpers/db.py
+1
-5
pipelines.py
exa/exa/pipelines.py
+6
-5
mobihealthnews.py
exa/exa/spiders/mobihealthnews.py
+2
-1
No files found.
exa/exa/helpers/company_maker.py
View file @
eed7af2e
...
@@ -6,12 +6,17 @@ Company = namedtuple('Company', 'id, url, media_id, type_id, region_id')
...
@@ -6,12 +6,17 @@ Company = namedtuple('Company', 'id, url, media_id, type_id, region_id')
# 'http://www.mobihealthnews.com/tag/MedTronic'
# 'http://www.mobihealthnews.com/tag/MedTronic'
Entity
=
namedtuple
(
'Entity'
,
'id, name, country'
)
class
CompanyMaker
:
class
CompanyMaker
:
def
__init__
(
self
,
companies
=
None
):
def
__init__
(
self
,
companies
=
None
):
# self.in_site = ['http://www.mobihealthnews.com/tag/Twitter', 'http://www.mobihealthnews.com/tag/intel',
# self.in_site = ['http://www.mobihealthnews.com/tag/Twitter', 'http://www.mobihealthnews.com/tag/intel',
# 'http://www.mobihealthnews.com/tag/ibm', 'http://www.mobihealthnews.com/tag/Salesforce',
# 'http://www.mobihealthnews.com/tag/ibm', 'http://www.mobihealthnews.com/tag/Salesforce',
# 'http://www.mobihealthnews.com/tag/google']
# 'http://www.mobihealthnews.com/tag/google']
self
.
in_site
=
companies
self
.
in_site
=
list
()
if
companies
:
for
i
in
companies
:
self
.
in_site
.
append
(
Entity
(
i
[
0
],
i
[
1
],
i
[
2
]))
self
.
companies
=
list
()
self
.
companies
=
list
()
def
make_companies
(
self
):
def
make_companies
(
self
):
...
...
exa/exa/helpers/db.py
View file @
eed7af2e
...
@@ -2,7 +2,6 @@ from collections import namedtuple
...
@@ -2,7 +2,6 @@ from collections import namedtuple
import
traceback
import
traceback
import
MySQLdb
import
MySQLdb
Entity
=
namedtuple
(
'Entity'
,
'id, name, country'
)
class
Database
:
class
Database
:
...
@@ -36,10 +35,7 @@ class Database:
...
@@ -36,10 +35,7 @@ class Database:
def
select
(
self
,
item
):
def
select
(
self
,
item
):
try
:
try
:
self
.
cursor
.
execute
(
item
)
self
.
cursor
.
execute
(
item
)
result
=
list
()
return
self
.
cursor
.
fetchall
()
for
i
in
self
.
cursor
.
fetchall
():
result
.
append
(
Entity
(
i
[
0
],
i
[
1
],
i
[
2
]))
return
result
except
:
except
:
self
.
cursor
.
rollback
()
self
.
cursor
.
rollback
()
...
...
exa/exa/pipelines.py
View file @
eed7af2e
...
@@ -15,12 +15,9 @@ class ExaPipeline(object):
...
@@ -15,12 +15,9 @@ class ExaPipeline(object):
def
__init__
(
self
):
def
__init__
(
self
):
# self.out = open('out/out{}.txt'.format(datetime.now()), 'w', newline='\n')
# self.out = open('out/out{}.txt'.format(datetime.now()), 'w', newline='\n')
self
.
db
=
Database
(
**
db
)
self
.
db
=
Database
(
**
db
)
self
.
urls
=
set
(
self
.
db
.
select
(
'select url from wp_esi_accept'
))
super
(
ExaPipeline
,
self
)
.
__init__
()
super
(
ExaPipeline
,
self
)
.
__init__
()
def
__del__
(
self
):
pass
# self.out.close()
def
process_item
(
self
,
item
,
spider
):
def
process_item
(
self
,
item
,
spider
):
print
(
item
)
print
(
item
)
item
[
'description'
]
=
''
.
join
(
item
[
'description'
])
.
replace
(
'
\n
'
,
' '
)
item
[
'description'
]
=
''
.
join
(
item
[
'description'
])
.
replace
(
'
\n
'
,
' '
)
...
@@ -29,7 +26,11 @@ class ExaPipeline(object):
...
@@ -29,7 +26,11 @@ class ExaPipeline(object):
query
=
"""INSERT INTO wp_esi_news_accept (title, description, URL, media_id, type_id, region_id, post_id,
query
=
"""INSERT INTO wp_esi_news_accept (title, description, URL, media_id, type_id, region_id, post_id,
publish_date, record_date, company_id, is_accepted) VALUES(
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s);
\n
"""
publish_date, record_date, company_id, is_accepted) VALUES(
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s);
\n
"""
print
(
item
)
print
(
item
)
news
=
self
.
db
.
insert
(
query
,
data
)
n
=
self
.
db
.
select
(
"select url from wp_esi_news_accept where url={}"
.
format
(
item
[
'url'
]))
if
len
(
n
)
==
0
:
news
=
self
.
db
.
insert
(
query
,
data
)
else
:
print
(
'Duplicate'
)
# self._insert_news_entiry(news, item['company_id'])
# self._insert_news_entiry(news, item['company_id'])
# self.out.write(query)
# self.out.write(query)
...
...
exa/exa/spiders/mobihealthnews.py
View file @
eed7af2e
...
@@ -19,7 +19,8 @@ class MobiHealthNewsSpider(scrapy.Spider):
...
@@ -19,7 +19,8 @@ class MobiHealthNewsSpider(scrapy.Spider):
self
.
condition
=
kwargs
.
get
(
'query'
)
self
.
condition
=
kwargs
.
get
(
'query'
)
self
.
query
=
"SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=3"
self
.
query
=
"SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=3"
if
self
.
condition
:
if
self
.
condition
:
self
.
query
+=
' and {}'
.
format
(
self
.
condition
)
print
(
self
.
condition
)
self
.
query
+=
' or {}'
.
format
(
self
.
condition
)
self
.
comp
=
db
.
select
(
self
.
query
)
self
.
comp
=
db
.
select
(
self
.
query
)
super
(
MobiHealthNewsSpider
,
self
)
.
__init__
()
super
(
MobiHealthNewsSpider
,
self
)
.
__init__
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment