Commit 9e878ace authored by Vasyl Bodnaruk, committed by Andrii Marynets

added database wrapper

parent 44e27193
from .helpers.company_maker import CompanyMaker
from .items import ExaItem
\ No newline at end of file
from .company_maker import CompanyMaker
from .db import Database
\ No newline at end of file
from collections import namedtuple
Company = namedtuple('Company', 'url, media_id, type_id, region_id')
# 'http://www.mobihealthnews.com/tag/apple',
# 'http://www.mobihealthnews.com/tag/clover-health'
class CompanyMaker:
def __init__(self, companies=None):
self.in_site = ['http://www.mobihealthnews.com/tag/MedTronic']
self.companies = list()
def make_companies(self):
self._make_list()
def get_companies(self):
return self.companies
def _make_list(self):
for i in self.in_site:
self.companies.append(Company(i, 43, 1, 2))
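For reference, the spider below consumes this helper through make_companies() and get_companies(); a minimal usage sketch (field names follow the Company namedtuple above):

# Usage sketch of CompanyMaker, mirroring start_requests() in the spider.
maker = CompanyMaker()
maker.make_companies()
for company in maker.get_companies():
    # Each entry is Company(url, media_id, type_id, region_id); the single seed uses 43, 1, 2.
    print(company.url, company.media_id, company.type_id, company.region_id)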
from collections import namedtuple
import MySQLdb
Entity = namedtuple('Entity', 'id, name, country')
class Database:
def __init__(self, host, user, pwd, database):
self.host = host
self.user = user
self.pwd = pwd
self.database = database
self.db = MySQLdb.connect(self.host, self.user, self.pwd, self.database)
self.db.autocommit(True)
self.cursor = self.db.cursor()
def __del__(self):
self.db.close()
def insert(self, item):
try:
self.cursor.execute(item)
return self.cursor.lastrowid
        except MySQLdb.Error:
            # Roll back on the connection; MySQLdb cursors have no rollback() method.
            self.db.rollback()
def select(self, item):
try:
self.cursor.execute(item)
result = list()
for i in self.cursor.fetchall():
result.append(Entity(i[0], i[1], i[2]))
return result
        except MySQLdb.Error:
            self.db.rollback()
if __name__ == '__main__':
select = 'SELECT id, name, country FROM wp_esi_entity WHERE id < 10'
db = Database('localhost', 'root', 'andrew', 'esi')
rows = db.select(select)
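A short usage sketch of the wrapper (same local credentials as the __main__ block above; only id, name and country are confirmed columns, so the INSERT column list is an assumption):

# Usage sketch: select() returns Entity namedtuples, insert() returns the new row id.
db = Database('localhost', 'root', 'andrew', 'esi')
for entity in db.select('SELECT id, name, country FROM wp_esi_entity WHERE id < 10'):
    print(entity.id, entity.name, entity.country)
new_id = db.insert("INSERT INTO wp_esi_entity (name, country) VALUES ('Example Co', 'US')")  # column list assumed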
@@ -10,6 +10,8 @@ import scrapy
class ExaItem(scrapy.Item):
date = scrapy.Field()
media = scrapy.Field()
media_id = scrapy.Field()
title = scrapy.Field()
url = scrapy.Field()
region_id = scrapy.Field()
type_id = scrapy.Field()
@@ -39,7 +39,6 @@ class SeleniumDownloadMiddleware(object):
try:
self.driver.get(request.url)
except BaseException as e:
            print('Exception while loading page:', e)
return None
@@ -4,22 +4,26 @@
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import csv
from datetime import datetime
from .helpers.db import Database
from scrapy.utils.project import get_project_settings
db = get_project_settings().get('DB')
class ExaPipeline(object):
def __init__(self):
self.out = open('out.csv', 'w', newline='\n')
self.out = open('out{}.txt'.format(datetime.now()), 'w', newline='\n')
self.db = Database(**db)
super(ExaPipeline, self).__init__()
def __del__(self):
self.out.close()
def process_item(self, item, spider):
s = """INSERT INTO wp_esi_news (title, URL, media_id, type_id, region_id, publish_date)
VALUES('{0}', '{1}', '{2}', {3}, {4}, '{5}')
\n""".format(item['title'], item['url'], item['media'], 1, 3, item['date'])
s = "INSERT INTO wp_esi_news (title, URL, media_id, type_id, region_id, publish_date) VALUES('{0}', '{1}', {2}, {3}, {4}, '{5}')\n".format(
item['title'], item['url'], item['media_id'], item['type_id'],
item['region_id'], item['date'])
self.out.write(s)
return item
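Because the INSERT above is assembled with str.format, a title or URL containing a quote will break the statement. A hedged alternative sketch using MySQLdb's parameter substitution (it goes through the wrapper's underlying cursor, since Database.insert() currently accepts only a fully formed SQL string):

# Sketch: parameterized insert via the MySQLdb cursor held by the Database wrapper.
sql = ("INSERT INTO wp_esi_news (title, URL, media_id, type_id, region_id, publish_date) "
       "VALUES (%s, %s, %s, %s, %s, %s)")
params = (item['title'], item['url'], item['media_id'], item['type_id'],
          item['region_id'], item['date'])
self.db.cursor.execute(sql, params)  # MySQLdb escapes each value itself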
@@ -96,20 +96,9 @@ ITEM_PIPELINES = {
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
RETRY_TIMES = 10
RETRY_HTTP_CODES = [500, 503, 504, 416, 400, 403, 404, 408]
# Proxy list containing entries like
# http://host1:port
# http://username:password@host2:port
# http://host3:port
# ...
PROXY_LIST = '/home/andrii/work/exa/proxy_1000.txt'
# Proxy mode
# 0 = Every request gets a different proxy
# 1 = Take only one proxy from the list and assign it to every request
# 2 = Put a custom proxy to use in the settings
PROXY_MODE = 0
# If proxy mode is 2, uncomment this line:
#CUSTOM_PROXY = "http://host1:port"
DB = {
'host': 'localhost',
'user': 'root',
'pwd': 'andrew',
'database': 'esi'
}
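The keys of this dict match the Database constructor's parameters (host, user, pwd, database), which is what lets the pipeline unpack it directly:

# In pipelines.py the settings entry is unpacked into the wrapper; a sketch of the equivalence:
db_settings = get_project_settings().get('DB')
db = Database(**db_settings)  # same as Database(host='localhost', user='root', pwd='andrew', database='esi')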
# -*- coding: utf-8 -*-
import scrapy
import dateparser
import scrapy
from ..helpers import CompanyMaker
from ..items import ExaItem
class ExaNewsSpider(scrapy.Spider):
class MobiHealthNewsSpider(scrapy.Spider):
name = "mobihealthnews"
allowed_domains = ["www.mobihealthnews.com"]
start_urls = ['http://www.mobihealthnews.com/tag/MedTronic']
def start_requests(self):
companies = CompanyMaker()
companies.make_companies()
for i in companies.get_companies():
yield scrapy.Request(i.url, callback=self.parse, meta={'type_id': i.type_id,
'region_id': i.region_id,
'media_id': i.media_id})
def parse(self, response):
try:
# try:
rows = response.xpath("..//div[contains(@class, 'group-left')]//div[contains(@class, 'views-row')]")
for i in rows:
item = ExaItem()
item['date'] = dateparser.parse(i.xpath(".//span/span[contains(@class, 'day_list')]/text()").extract_first()).date()
item['media'] = 'mobihealthnews'
item['title'] = i.xpath("..//span/a/text()").extract_first()
item['media_id'] = response.meta['media_id']
item['title'] = i.xpath(".//span/a/text()").extract_first()
item['url'] = 'http://www.mobihealthnews.com' + i.xpath(".//span/a/@href").extract_first()
item['region_id'] = response.meta['region_id']
item['type_id'] = response.meta['type_id']
yield item
has_next = response.xpath("..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/text()").extract_first()
next_url = 'http://www.mobihealthnews.com' + response.xpath("..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/@href").extract_first()
print(has_next, next_url)
if has_next:
pass
# yield scrapy.Request(next_url, callback=self.parse)
yield scrapy.Request(next_url, callback=self.parse, meta={'type_id': 1, 'region_id': 2, 'media_id': 43})
except BaseException:
print('We had an error')
\ No newline at end of file
# except BaseException:
# print('We had an error')
\ No newline at end of file
# -*- coding: utf-8 -*-
import scrapy
class TechcrunchSpider(scrapy.Spider):
name = "techcrunch"
allowed_domains = ["techcrunch.com"]
start_urls = ['https://techcrunch.com/search/Behold.ai#stq=Behold.ai/']
def parse(self, response):
pass