Add saving of scraped items to the database

e2f1c612 · Vasyl Bodnaruk · Andrii Marynets · 9e878ace · e2f1c612 · e2f1c612
Commit e2f1c612, authored May 22, 2017 by Vasyl Bodnaruk; committed by Andrii Marynets on May 22, 2017
6 changed files
--- a/exa/exa/helpers/company_maker.py
+++ b/exa/exa/helpers/company_maker.py
 from collections import namedtuple

-Company = namedtuple('Company', 'url, media_id, type_id, region_id')
-# 'http://www.mobihealthnews.com/tag/apple',
+Company = namedtuple('Company', 'id, url, media_id, type_id, region_id')
+# 'http://www.mobihealthnews.com/tag/apple'
 # 'http://www.mobihealthnews.com/tag/clover-health'
+# 'http://www.mobihealthnews.com/tag/MedTronic'


 class CompanyMaker:
    def __init__(self, companies=None):
-        self.in_site = ['http://www.mobihealthnews.com/tag/MedTronic']
+        # self.in_site = ['http://www.mobihealthnews.com/tag/Twitter', 'http://www.mobihealthnews.com/tag/intel',
+        #                 'http://www.mobihealthnews.com/tag/ibm', 'http://www.mobihealthnews.com/tag/Salesforce',
+        #                 'http://www.mobihealthnews.com/tag/google']
+        self.in_site = companies
        self.companies = list()

    def make_companies(self):
@@ -18,4 +22,9 @@ class CompanyMaker:

    def _make_list(self):
        for i in self.in_site:
-            self.companies.append(Company(i, 43, 1, 2))
+            tag = i.name
+            if tag.find(' '):
+                tag = tag.replace(' ', '-')
+            if tag.find('.'):
+                tag = tag.replace('.', '')
+            self.companies.append(Company(i.id, 'http://www.mobihealthnews.com/tag/' + tag, 43, 2, 2))
--- a/exa/exa/helpers/db.py
+++ b/exa/exa/helpers/db.py
 from collections import namedtuple
+import traceback
 import MySQLdb

 Entity = namedtuple('Entity', 'id, name, country')

+
 class Database:

    def __init__(self, host, user, pwd, database):
@@ -17,12 +19,15 @@ class Database:
    def __del__(self):
        self.db.close()

-    def insert(self, item):
+    def insert(self, query, data):
        try:
-            self.cursor.execute(item)
+            self.cursor.execute(query, (data))
+            # self.cursor.commit()
            return self.cursor.lastrowid
        except:
-            self.cursor.rollback()
+            print(query, data)
+            traceback.print_exc()
+        #     self.cursor.rollback()

    def select(self, item):
        try:
@@ -36,6 +41,13 @@ class Database:


 if __name__ == '__main__':
+    from datetime import datetime
+    date = datetime.now()
    select = 'SELECT id, name, country FROM wp_esi_entity WHERE id < 10'
+    q = """INSERT INTO wp_esi_news (title, description, URL, media_id, type_id, region_id, post_id, publish_date)
+                VALUES(%s, %s, %s, %s, %s, %s, %s, %s);"""
+    data = ('asdsdasd', 'sadsadsad', 'sadsadsad', 43, 2, 2, 0, date.date())
    db = Database('localhost', 'root', 'andrew', 'esi')
-    rows = db.select(select)
+    # rows = db.select(select)
+    id = db.insert(q, data)
+    print(id)
--- a/exa/exa/items.py
+++ b/exa/exa/items.py
@@ -12,6 +12,9 @@ class ExaItem(scrapy.Item):
    date = scrapy.Field()
    media_id = scrapy.Field()
    title = scrapy.Field()
+    description = scrapy.Field()
    url = scrapy.Field()
    region_id = scrapy.Field()
    type_id = scrapy.Field()
+    post_id = scrapy.Field()
+    company_id = scrapy.Field()
\ No newline at end of file
--- a/exa/exa/middlewares.py
+++ b/exa/exa/middlewares.py
@@ -24,8 +24,12 @@ class SeleniumDownloadMiddleware(object):

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
+        from selenium.webdriver.chrome.options import Options
+        opts = Options()
+        opts.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) coc_coc_browser/45.0 Chrome/39.0.2171.98 Safari/537.36")

-        self.driver = webdriver.Chrome()
+
+        self.driver = webdriver.Chrome(chrome_options=opts)
        self.driver.maximize_window()

    def spider_closed(self, spider):
@@ -39,6 +43,7 @@ class SeleniumDownloadMiddleware(object):
        try:
            self.driver.get(request.url)

+
        except BaseException as e:
            print('Exception in process loading page')
            return None

--- a/exa/exa/pipelines.py
+++ b/exa/exa/pipelines.py
@@ -13,7 +13,7 @@ db = get_project_settings().get('DB')

 class ExaPipeline(object):
    def __init__(self):
-        self.out = open('out{}.txt'.format(datetime.now()), 'w', newline='\n')
+        self.out = open('out/out{}.txt'.format(datetime.now()), 'w', newline='\n')
        self.db = Database(**db)
        super(ExaPipeline, self).__init__()

@@ -21,9 +21,19 @@ class ExaPipeline(object):
        self.out.close()

    def process_item(self, item, spider):
-        s = "INSERT INTO wp_esi_news (title, URL, media_id, type_id, region_id, publish_date) VALUES('{0}', '{1}', {2}, {3}, {4}, '{5}')\n".format(
-            item['title'], item['url'], item['media_id'], item['type_id'],
-            item['region_id'], item['date'])
-        self.out.write(s)
+        print(item)
+        item['description'] = ''.join(item['description']).replace('\n', ' ')
+        data = (item['title'], item['description'], item['url'], item['media_id'], item['type_id'],
+                item['region_id'], item['post_id'], item['date'], datetime.now().date(),)
+        query = """INSERT INTO wp_esi_news (title, description, URL, media_id, type_id, region_id, post_id, publish_date, record_date)
+                VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s);\n"""
+        print(item)
+        news = self.db.insert(query, data)
+        self._insert_news_entiry(news, item['company_id'])
+        self.out.write(query)

        return item
+
+    def _insert_news_entiry(self, news, entity):
+        query = 'INSERT INTO wp_esi_news_entity (news_id, entity_id) VALUES(%s, %s)'
+        self.db.insert(query, (news, entity))
--- a/exa/exa/spiders/mobihealthnews.py
+++ b/exa/exa/spiders/mobihealthnews.py
 # -*- coding: utf-8 -*-
 import dateparser
 import scrapy
-
-from ..helpers import CompanyMaker
+import traceback
+from scrapy.utils.project import get_project_settings
+from ..helpers import CompanyMaker, Database
 from ..items import ExaItem


+db_settings = get_project_settings().get('DB')
+db = Database(**db_settings)
+comp = db.select("SELECT id, name, country FROM wp_esi_entity WHERE id < 300;")
+
+
 class MobiHealthNewsSpider(scrapy.Spider):
    name = "mobihealthnews"
    allowed_domains = ["www.mobihealthnews.com"]

+    def __init__(self):
+        self.news = list()
+        super(MobiHealthNewsSpider, self).__init__()
+
    def start_requests(self):
-        companies = CompanyMaker()
+        companies = CompanyMaker(comp)
        companies.make_companies()
        for i in companies.get_companies():
-            yield scrapy.Request(i.url, callback=self.parse, meta={'type_id': i.type_id,
-                                                                   'region_id': i.region_id,
-                                                                   'media_id': i.media_id})
+            yield scrapy.Request(i.url, callback=self.parse, meta={'company': i,
+                                                                   'post_id': 0})

    def parse(self, response):
-        # try:
+        try:
            rows = response.xpath("..//div[contains(@class, 'group-left')]//div[contains(@class, 'views-row')]")
+            company = response.meta['company']
            for i in rows:
                item = ExaItem()
                item['date'] = dateparser.parse(i.xpath(".//span/span[contains(@class, 'day_list')]/text()").extract_first()).date()
-                item['media_id'] = response.meta['media_id']
                item['title'] = i.xpath(".//span/a/text()").extract_first()
+                item['description'] = i.xpath(".//div[contains(@class, 'views-field views-field-body')]/span/text()").extract_first()
                item['url'] = 'http://www.mobihealthnews.com' + i.xpath(".//span/a/@href").extract_first()
-                item['region_id'] = response.meta['region_id']
-                item['type_id'] = response.meta['type_id']
+
+                item['region_id'] = company.region_id
+                item['type_id'] = company.type_id
+                item['media_id'] = company.media_id
+                item['company_id'] = company.id
+
+                item['post_id'] = response.meta['post_id']

                yield item
-            has_next = response.xpath("..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/text()").extract_first()
-            next_url = 'http://www.mobihealthnews.com' + response.xpath("..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/@href").extract_first()
+            has_next = response.xpath(
+                "..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/text()").extract_first()
+            next_url = 'http://www.mobihealthnews.com' + response.xpath(
+                "..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/@href").extract_first()
            print(has_next, next_url)

            if has_next:
-                yield scrapy.Request(next_url, callback=self.parse, meta={'type_id': 1, 'region_id': 2, 'media_id': 43})
+                # pass
+                yield scrapy.Request(next_url, callback=self.parse, meta={'company': response.meta['company'], 'post_id': 0})

-        # except BaseException:
-        #     print('We had error')
\ No newline at end of file
+        except BaseException as e:
+            print('We had error')
+            traceback.print_exc()
\ No newline at end of file