Commit e2f1c612 authored by Vasyl Bodnaruk's avatar Vasyl Bodnaruk Committed by Andrii Marynets

Implement saving scraped items to the database

parent 9e878ace
from collections import namedtuple

# One scrape target: the DB entity id, its mobihealthnews tag URL, and the
# media/type/region ids stamped onto every news row inserted for it.
# (The older 4-field variant without `id` was diff residue and is removed.)
Company = namedtuple('Company', 'id, url, media_id, type_id, region_id')
class CompanyMaker:
def __init__(self, companies=None):
    """Collect entity rows and prepare to turn them into Company tuples.

    Args:
        companies: iterable of entity rows (each needs ``.id`` and
            ``.name`` — presumably rows from wp_esi_entity; confirm
            against the spider). May be None for an empty run.
    """
    # The old hard-coded tag-URL list and the duplicate assignment were
    # dead code left over from a diff; assign the parameter exactly once.
    self.in_site = companies
    self.companies = list()
def make_companies(self):
......@@ -18,4 +22,9 @@ class CompanyMaker:
def _make_list(self):
    """Build a Company per entity in ``self.in_site``.

    The tag slug is the entity name with spaces turned into hyphens and
    dots dropped, matching mobihealthnews.com tag URLs.
    """
    for entity in self.in_site:
        # str.replace is a no-op when the character is absent, so the old
        # `if tag.find(' '):` guards were both unnecessary and wrong —
        # find() returns -1 (truthy) on a miss and 0 (falsy) on a hit at
        # index 0.
        tag = entity.name.replace(' ', '-').replace('.', '')
        # Stale pre-refactor append (4-field Company) removed: it no
        # longer matched the 5-field namedtuple and would raise TypeError.
        self.companies.append(
            Company(entity.id, 'http://www.mobihealthnews.com/tag/' + tag, 43, 2, 2))
from collections import namedtuple
import traceback
# Third-party MySQL driver (mysqlclient / MySQL-python).
import MySQLdb
# Row shape for entities read from wp_esi_entity: (id, name, country).
Entity = namedtuple('Entity', 'id, name, country')
class Database:
def __init__(self, host, user, pwd, database):
......@@ -17,12 +19,15 @@ class Database:
def __del__(self):
    # Close the MySQL connection when the wrapper is garbage-collected.
    # NOTE(review): __del__ is not guaranteed to run at interpreter exit;
    # an explicit close() or context manager would be safer — TODO confirm.
    self.db.close()
def insert(self, query, data):
    """Execute a parameterized INSERT and return the new row's id.

    Args:
        query: SQL string with %s placeholders.
        data: tuple of values bound to the placeholders (never
            interpolated into the SQL string).

    Returns:
        The AUTO_INCREMENT id of the inserted row, or None on failure
        (errors are printed, preserving the best-effort style).
    """
    try:
        self.cursor.execute(query, data)
        # MySQLdb does not autocommit by default; without this the insert
        # may never be persisted. (Harmless if autocommit is enabled.)
        self.db.commit()
        return self.cursor.lastrowid
    except MySQLdb.Error:
        # Narrowed from a bare `except:`; the old code also called
        # rollback() on the cursor, which has no such method — rollback
        # belongs to the connection.
        print(query, data)
        traceback.print_exc()
        return None
def select(self, item):
try:
......@@ -36,6 +41,13 @@ class Database:
if __name__ == '__main__':
    # Smoke test: insert one dummy news row and print its new row id.
    # NOTE(review): credentials are hard-coded — move to settings/config.
    from datetime import datetime

    query = """INSERT INTO wp_esi_news (title, description, URL, media_id, type_id, region_id, post_id, publish_date)
    VALUES(%s, %s, %s, %s, %s, %s, %s, %s);"""
    data = ('asdsdasd', 'sadsadsad', 'sadsadsad', 43, 2, 2, 0, datetime.now().date())
    db = Database('localhost', 'root', 'andrew', 'esi')
    # Dead SELECT string and its commented-out call (diff residue) removed;
    # renamed `id` -> `row_id` to stop shadowing the builtin.
    row_id = db.insert(query, data)
    print(row_id)
......@@ -12,6 +12,9 @@ class ExaItem(scrapy.Item):
# Fields for one scraped news article (maps onto a wp_esi_news row).
date = scrapy.Field()  # publish date parsed from the listing page
media_id = scrapy.Field()  # media source id (43 for mobihealthnews — see helpers)
title = scrapy.Field()  # article headline text
description = scrapy.Field()  # teaser/body snippet from the listing
url = scrapy.Field()  # absolute article URL
region_id = scrapy.Field()  # region id copied from the Company tuple
type_id = scrapy.Field()  # news type id copied from the Company tuple
post_id = scrapy.Field()  # related post id (spider always sets 0)
company_id = scrapy.Field()  # wp_esi_entity id the article is tagged with
\ No newline at end of file
......@@ -24,8 +24,12 @@ class SeleniumDownloadMiddleware(object):
def spider_opened(self, spider):
    """Start a single Chrome driver with a custom user agent."""
    spider.logger.info('Spider opened: %s' % spider.name)
    # Local import keeps selenium optional until the middleware is used.
    from selenium.webdriver.chrome.options import Options
    opts = Options()
    opts.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) coc_coc_browser/45.0 Chrome/45.0 Chrome/39.0.2171.98 Safari/537.36".replace(" Chrome/45.0", ""))
    # The old code launched a second, unconfigured Chrome instance and
    # leaked it; create exactly one, with the options applied.
    self.driver = webdriver.Chrome(chrome_options=opts)
    self.driver.maximize_window()
def spider_closed(self, spider):
......@@ -39,6 +43,7 @@ class SeleniumDownloadMiddleware(object):
try:
self.driver.get(request.url)
except BaseException as e:
print('Exception in process loading page')
return None
......
......@@ -13,7 +13,7 @@ db = get_project_settings().get('DB')
class ExaPipeline(object):
def __init__(self):
    # Debug log of the executed INSERT statements; the DB is the real sink.
    # The old code opened two files (out{} and out/out{}) and leaked the
    # first handle — open exactly one.
    # NOTE(review): datetime.now() renders with ':' which is invalid in
    # Windows filenames — confirm the target platform.
    self.out = open('out/out{}.txt'.format(datetime.now()), 'w', newline='\n')
    self.db = Database(**db)
    super(ExaPipeline, self).__init__()
......@@ -21,9 +21,19 @@ class ExaPipeline(object):
self.out.close()
def process_item(self, item, spider):
    """Persist a scraped news item plus its entity link; return the item.

    The old string-interpolated INSERT (''.format into SQL — injection
    prone) was diff residue and is gone; everything goes through the
    parameterized Database.insert().
    """
    # description may arrive as a list of text fragments; flatten it and
    # collapse newlines so it stores as a single line.
    item['description'] = ''.join(item['description']).replace('\n', ' ')
    data = (item['title'], item['description'], item['url'], item['media_id'], item['type_id'],
            item['region_id'], item['post_id'], item['date'], datetime.now().date(),)
    query = """INSERT INTO wp_esi_news (title, description, URL, media_id, type_id, region_id, post_id, publish_date, record_date)
VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s);\n"""
    news = self.db.insert(query, data)
    self._insert_news_entiry(news, item['company_id'])
    self.out.write(query)
    return item
def _insert_news_entiry(self, news, entity):
    """Link a saved news row to its entity (company) in the join table.

    NOTE(review): method name has a typo ("entiry") — kept because
    callers use it.
    """
    link_sql = 'INSERT INTO wp_esi_news_entity (news_id, entity_id) VALUES(%s, %s)'
    self.db.insert(link_sql, (news, entity))
# -*- coding: utf-8 -*-
import dateparser
import scrapy
from ..helpers import CompanyMaker
import traceback
from scrapy.utils.project import get_project_settings
from ..helpers import CompanyMaker, Database
from ..items import ExaItem
db_settings = get_project_settings().get('DB')
db = Database(**db_settings)
comp = db.select("SELECT id, name, country FROM wp_esi_entity WHERE id < 300;")
class MobiHealthNewsSpider(scrapy.Spider):
    """Scrape news articles from mobihealthnews.com tag pages.

    One request is issued per company tag URL (companies come from the
    module-level DB query ``comp``); pagination follows the "next" link.
    """

    name = "mobihealthnews"
    allowed_domains = ["www.mobihealthnews.com"]

    def __init__(self):
        # Accumulator for scraped news; currently only initialized here.
        self.news = list()
        super(MobiHealthNewsSpider, self).__init__()

    def start_requests(self):
        companies = CompanyMaker(comp)
        companies.make_companies()
        for company in companies.get_companies():
            # Carry the whole Company tuple in meta so parse() can stamp
            # its ids onto every item.
            yield scrapy.Request(company.url, callback=self.parse,
                                 meta={'company': company, 'post_id': 0})

    def parse(self, response):
        try:
            company = response.meta['company']
            rows = response.xpath("..//div[contains(@class, 'group-left')]//div[contains(@class, 'views-row')]")
            for row in rows:
                item = ExaItem()
                item['date'] = dateparser.parse(row.xpath(".//span/span[contains(@class, 'day_list')]/text()").extract_first()).date()
                item['title'] = row.xpath(".//span/a/text()").extract_first()
                item['description'] = row.xpath(".//div[contains(@class, 'views-field views-field-body')]/span/text()").extract_first()
                item['url'] = 'http://www.mobihealthnews.com' + row.xpath(".//span/a/@href").extract_first()
                item['region_id'] = company.region_id
                item['type_id'] = company.type_id
                item['media_id'] = company.media_id
                item['company_id'] = company.id
                item['post_id'] = response.meta['post_id']
                yield item

            # Only build the next-page URL when a "next" link exists: the
            # old code concatenated unconditionally and raised TypeError
            # (None + str) on the last page, masked by the broad except.
            has_next = response.xpath(
                "..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/text()").extract_first()
            if has_next:
                next_url = 'http://www.mobihealthnews.com' + response.xpath(
                    "..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/@href").extract_first()
                yield scrapy.Request(next_url, callback=self.parse,
                                     meta={'company': company, 'post_id': 0})
        except Exception:
            # Narrowed from BaseException (which also swallowed
            # KeyboardInterrupt); keep crawling other pages but log this one.
            print('We had error')
            traceback.print_exc()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment