Commit 9e878ace
authored May 18, 2017 by Vasyl Bodnaruk, committed May 18, 2017 by Andrii Marynets

added database wrapper

parent 44e27193

Showing 11 changed files with 117 additions and 35 deletions (+117 −35)
exa/__init__.py                      +0   −0
exa/exa/__init__.py                  +2   −0
exa/exa/helpers/__init__.py          +2   −0
exa/exa/helpers/company_maker.py     +21  −0
exa/exa/helpers/db.py                +41  −0
exa/exa/items.py                     +3   −1
exa/exa/middlewares.py               +0   −1
exa/exa/pipelines.py                 +10  −6
exa/exa/settings.py                  +6   −17
exa/exa/spiders/mobihealthnews.py    +21  −10
exa/exa/spiders/techcrunch.py        +11  −0
exa/__init__.py  0 → 100644  (new, empty file)
exa/exa/__init__.py

+from .helpers.company_maker import CompanyMaker
+from .items import ExaItem
\ No newline at end of file
exa/exa/helpers/__init__.py  0 → 100644  (new file)

from .company_maker import CompanyMaker
from .db import Database
\ No newline at end of file
exa/exa/helpers/company_maker.py  0 → 100644  (new file)

from collections import namedtuple

Company = namedtuple('Company', 'url, media_id, type_id, region_id')

# 'http://www.mobihealthnews.com/tag/apple',
# 'http://www.mobihealthnews.com/tag/clover-health'


class CompanyMaker:
    def __init__(self, companies=None):
        # The `companies` argument is accepted but currently unused;
        # the source list is hard-coded to a single tag page.
        self.in_site = ['http://www.mobihealthnews.com/tag/MedTronic']
        self.companies = list()

    def make_companies(self):
        self._make_list()

    def get_companies(self):
        return self.companies

    def _make_list(self):
        # Wrap each tag URL in a Company tuple with fixed
        # media_id=43, type_id=1, region_id=2.
        for i in self.in_site:
            self.companies.append(Company(i, 43, 1, 2))
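For reference, a minimal usage sketch of the new helper (illustrative, not part of the commit; it assumes the project root exa/ is on the import path):

from exa.helpers import CompanyMaker

maker = CompanyMaker()
maker.make_companies()
for company in maker.get_companies():
    # Each entry is a Company namedtuple: url, media_id, type_id, region_id
    print(company.url, company.media_id, company.type_id, company.region_id)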
exa/exa/helpers/db.py  0 → 100644  (new file)

from collections import namedtuple

import MySQLdb

Entity = namedtuple('Entity', 'id, name, country')


class Database:
    """Thin wrapper around a MySQLdb connection with autocommit enabled."""

    def __init__(self, host, user, pwd, database):
        self.host = host
        self.user = user
        self.pwd = pwd
        self.database = database
        self.db = MySQLdb.connect(self.host, self.user, self.pwd, self.database)
        self.db.autocommit(True)
        self.cursor = self.db.cursor()

    def __del__(self):
        self.db.close()

    def insert(self, item):
        try:
            self.cursor.execute(item)
            return self.cursor.lastrowid
        except MySQLdb.Error:
            # roll back on the connection; cursors have no rollback()
            self.db.rollback()

    def select(self, item):
        try:
            self.cursor.execute(item)
            result = list()
            for i in self.cursor.fetchall():
                result.append(Entity(i[0], i[1], i[2]))
            return result
        except MySQLdb.Error:
            self.db.rollback()


if __name__ == '__main__':
    select = 'SELECT id, name, country FROM wp_esi_entity WHERE id < 10'
    db = Database('localhost', 'root', 'andrew', 'esi')
    rows = db.select(select)
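Both methods execute a fully formed SQL string, so every caller must escape values itself. A hedged sketch (a hypothetical addition, not in this commit) could delegate quoting to MySQLdb's parameterized execute():

    # Hypothetical Database method: MySQLdb substitutes and escapes each value.
    def insert_params(self, query, params=None):
        try:
            self.cursor.execute(query, params)  # query uses %s placeholders
            return self.cursor.lastrowid
        except MySQLdb.Error:
            self.db.rollback()

Usage would look like db.insert_params("INSERT INTO wp_esi_entity (name, country) VALUES (%s, %s)", ('Acme', 'US')) — table and columns here are illustrative, taken from the select above.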
exa/exa/items.py

@@ -10,6 +10,8 @@ import scrapy
 class ExaItem(scrapy.Item):
     date = scrapy.Field()
-    media = scrapy.Field()
+    media_id = scrapy.Field()
     title = scrapy.Field()
     url = scrapy.Field()
+    region_id = scrapy.Field()
+    type_id = scrapy.Field()
exa/exa/middlewares.py

@@ -39,7 +39,6 @@ class SeleniumDownloadMiddleware(object):
         try:
             self.driver.get(request.url)
         except BaseException as e:
             print('Exception in process loading page')
-            return None
exa/exa/pipelines.py

@@ -4,22 +4,26 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 import csv
 from datetime import datetime
+from .helpers.db import Database
+from scrapy.utils.project import get_project_settings
+
+db = get_project_settings().get('DB')


 class ExaPipeline(object):

     def __init__(self):
-        self.out = open('out.csv', 'w', newline='\n')
+        self.out = open('out{}.txt'.format(datetime.now()), 'w', newline='\n')
+        self.db = Database(**db)
         super(ExaPipeline, self).__init__()

     def __del__(self):
         self.out.close()

     def process_item(self, item, spider):
-        s = """INSERT INTO wp_esi_news (title, URL, media_id, type_id, region_id, publish_date)
-        VALUES('{0}', '{1}', '{2}', {3}, {4}, '{5}')\n""".format(item['title'], item['url'], item['media'], 1, 3, item['date'])
+        s = "INSERT INTO wp_esi_news (title, URL, media_id, type_id, region_id, publish_date) VALUES('{0}', '{1}', {2}, {3}, {4}, '{5}')\n".format(item['title'], item['url'], item['media_id'], item['type_id'], item['region_id'], item['date'])
         self.out.write(s)
         return item
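Because the INSERT is assembled with str.format, a title containing a single quote breaks the statement and invites SQL injection. A parameterized variant of the write, assuming the hypothetical insert_params sketched for db.py above, might look like:

        # Sketch only: %s placeholders let MySQLdb quote each value safely.
        query = ("INSERT INTO wp_esi_news "
                 "(title, URL, media_id, type_id, region_id, publish_date) "
                 "VALUES (%s, %s, %s, %s, %s, %s)")
        params = (item['title'], item['url'], item['media_id'],
                  item['type_id'], item['region_id'], item['date'])
        self.db.insert_params(query, params)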
exa/exa/settings.py

@@ -96,20 +96,9 @@ ITEM_PIPELINES = {
 # HTTPCACHE_IGNORE_HTTP_CODES = []
 # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
 RETRY_TIMES = 10
 RETRY_HTTP_CODES = [500, 503, 504, 416, 400, 403, 404, 408]
-# Proxy list containing entries like
-# http://host1:port
-# http://username:password@host2:port
-# http://host3:port
-# ...
-PROXY_LIST = '/home/andrii/work/exa/proxy_1000.txt'
-# Proxy mode
-# 0 = Every requests have different proxy
-# 1 = Take only one proxy from the list and assign it to every requests
-# 2 = Put a custom proxy to use in the settings
-PROXY_MODE = 0
-# If proxy mode is 2 uncomment this sentence :
-#CUSTOM_PROXY = "http://host1:port"
+DB = {'host': 'localhost', 'user': 'root', 'pwd': 'andrew', 'database': 'esi'}
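The DB dict is unpacked straight into the wrapper's constructor (Database(**db) in pipelines.py), so its keys must match the parameter names host, user, pwd, database. Committing root credentials is risky once the repo is shared; a hedged alternative (not in this commit) reads them from the environment, falling back to defaults:

import os

# Hypothetical: configure credentials via DB_HOST/DB_USER/DB_PWD/DB_NAME.
DB = {
    'host': os.environ.get('DB_HOST', 'localhost'),
    'user': os.environ.get('DB_USER', 'root'),
    'pwd': os.environ.get('DB_PWD', ''),
    'database': os.environ.get('DB_NAME', 'esi'),
}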
exa/exa/spiders/exa_news.py → exa/exa/spiders/mobihealthnews.py  (renamed)
 # -*- coding: utf-8 -*-
-import scrapy
+import dateparser
+import scrapy

+from ..helpers import CompanyMaker
 from ..items import ExaItem


-class ExaNewsSpider(scrapy.Spider):
+class MobiHealthNewsSpider(scrapy.Spider):
     name = "mobihealthnews"
     allowed_domains = ["www.mobihealthnews.com"]
     start_urls = ['http://www.mobihealthnews.com/tag/MedTronic']

+    def start_requests(self):
+        companies = CompanyMaker()
+        companies.make_companies()
+        for i in companies.get_companies():
+            yield scrapy.Request(i.url, callback=self.parse,
+                                 meta={'type_id': i.type_id, 'region_id': i.region_id, 'media_id': i.media_id})

     def parse(self, response):
-        # try:
+        try:
             rows = response.xpath("..//div[contains(@class, 'group-left')]//div[contains(@class, 'views-row')]")
             for i in rows:
                 item = ExaItem()
                 item['date'] = dateparser.parse(i.xpath(".//span/span[contains(@class, 'day_list')]/text()").extract_first()).date()
-                item['media'] = 'mobihealthnews'
-                item['title'] = i.xpath("..//span/a/text()").extract_first()
+                item['media_id'] = response.meta['media_id']
+                item['title'] = i.xpath(".//span/a/text()").extract_first()
                 item['url'] = 'http://www.mobihealthnews.com' + i.xpath(".//span/a/@href").extract_first()
+                item['region_id'] = response.meta['region_id']
+                item['type_id'] = response.meta['type_id']
                 yield item

             has_next = response.xpath("..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/text()").extract_first()
             next_url = 'http://www.mobihealthnews.com' + response.xpath("..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/@href").extract_first()
             print(has_next, next_url)
             if has_next:
-                pass
-                # yield scrapy.Request(next_url, callback=self.parse)
+                yield scrapy.Request(next_url, callback=self.parse,
+                                     meta={'type_id': 1, 'region_id': 2, 'media_id': 43})
-        # except BaseException:
-        #     print('We had error')
+        except BaseException:
+            print('We had error')
\ No newline at end of file
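The spider leans on dateparser to turn the free-form day string from the tag listing into a date object; note that if extract_first() finds no match, dateparser.parse(None) raises, and the broad except swallows it silently. An illustrative call (example values, not scraped output):

import dateparser

parsed = dateparser.parse('May 18, 2017')  # returns a datetime.datetime
print(parsed.date())                       # 2017-05-18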
exa/exa/spiders/techcrunch.py  0 → 100644  (new file)

# -*- coding: utf-8 -*-
import scrapy


class TechcrunchSpider(scrapy.Spider):
    name = "techcrunch"
    allowed_domains = ["techcrunch.com"]
    start_urls = ['https://techcrunch.com/search/Behold.ai#stq=Behold.ai/']

    def parse(self, response):
        # Stub: parsing logic is not implemented in this commit.
        pass