Commit 293b7a26 authored by Vasyl Bodnaruk's avatar Vasyl Bodnaruk

Add functionality for scraping TechCrunch

First step add name of company as tag in url
parent 227cd025
from collections import namedtuple

# Example tag URLs this module builds:
# 'http://www.mobihealthnews.com/tag/apple'
# 'http://www.mobihealthnews.com/tag/clover-health'
# 'http://www.mobihealthnews.com/tag/MedTronic'

# A company ready for scraping: the tag-page URL plus the DB metadata the
# pipeline needs (media/type/region ids and the original company name).
Company = namedtuple('Company', 'id, url, media_id, type_id, region_id, name')
# A raw DB entity row: (id, name, country).
Entity = namedtuple('Entity', 'id, name, country')


class CompanyMaker:
    """Turns DB entity rows into per-media Company records with tag URLs."""

    def __init__(self, companies=None):
        """companies -- iterable of (id, name, country) rows, or None."""
        # Entities loaded from the DB (empty when no rows are given).
        self.in_site = []
        if companies:
            for row in companies:
                self.in_site.append(Entity(row[0], row[1], row[2]))
        # Company records produced by make_companies().
        self.companies = []

    def make_companies(self, media):
        """Populate self.companies for the given media key.

        media -- spider name: 'mhn' (MobiHealthNews) or 'tc' (TechCrunch).
        Unknown keys are silently ignored (self.companies stays empty).
        """
        if media == 'mhn':
            self._make_list_for_mhn()
        elif media == 'tc':
            self._make_list_for_tc()

    def get_companies(self):
        """Return the list built by make_companies()."""
        return self.companies

    def _make_list_for_mhn(self):
        # MobiHealthNews tags are slugs: spaces become hyphens, dots dropped.
        for entity in self.in_site:
            # BUG FIX: str.find() returns 0 (falsy) for a match at index 0
            # and -1 (truthy) when absent, so the old truthiness guards were
            # inverted; unconditional replace() is a no-op when not found.
            tag = entity.name.replace(' ', '-').replace('.', '')
            self.companies.append(
                Company(entity.id, 'http://www.mobihealthnews.com/tag/' + tag,
                        43, 2, 2, entity.name))

    def _make_list_for_tc(self):
        # TechCrunch tag pages use the raw company name in the URL.
        for entity in self.in_site:
            self.companies.append(
                Company(entity.id, 'https://techcrunch.com/tag/' + entity.name,
                        81, 2, 2, entity.name))
\ No newline at end of file
......@@ -53,6 +53,8 @@ class SeleniumDownloadMiddleware(object):
from pyvirtualdisplay import Display
self.display = Display()
self.display.start()
else:
self.display = None
if middleware['driver'] == 'Chrome':
from selenium.webdriver.chrome.options import Options
......
......@@ -15,21 +15,21 @@ class ExaPipeline(object):
def __init__(self):
    # Open the project DB and cache every already-accepted URL so that
    # process_item() can detect duplicates without a per-item query.
    self.db = Database(**db)
    self.urls = {i[0] for i in self.db.select('select url from wp_esi_news_accept')}
    print(self.urls)
    super(ExaPipeline, self).__init__()
def process_item(self, item, spider):
print(item)
item['title'] = ''.join(item['title']).replace('\n', ' ')
item['description'] = ''.join(item['description']).replace('\n', ' ')
data = (item['title'], item['description'], item['url'], item['media_id'], item['type_id'],
item['region_id'], item['post_id'], item['date'], datetime.now().date(), item['company_id'], 0)
query = """INSERT INTO wp_esi_news_accept (title, description, URL, media_id, type_id, region_id, post_id,
publish_date, record_date, company_id, is_accepted) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);\n"""
print(item)
if item['url'] in self.urls:
print("DUPLICATE")
print("DUPLICATE", item)
else:
print("UNIQUE", item)
self.db.insert(query, data)
self.urls.add(item['url'])
# self._insert_news_entiry(news, item['company_id'])
# self.out.write(query)
......
......@@ -12,7 +12,7 @@ db = Database(**db_settings)
class MobiHealthNewsSpider(scrapy.Spider):
    # Short CLI name; also the media key passed to
    # CompanyMaker.make_companies() in start_requests(). The dead duplicate
    # assignment ("mobihealthnews") left over from the merge is removed.
    name = "mhn"
    allowed_domains = ["www.mobihealthnews.com"]
def __init__(self, *args, **kwargs):
......@@ -26,7 +26,7 @@ class MobiHealthNewsSpider(scrapy.Spider):
def start_requests(self):
    """Seed one request per company tag page for this spider's media."""
    companies = CompanyMaker(self.comp)
    # BUG FIX: the merge left a stale no-argument call alongside this one;
    # make_companies() requires the media key (the spider name), so the
    # old call would raise TypeError.
    companies.make_companies(self.name)
    for company in companies.get_companies():
        yield scrapy.Request(company.url, callback=self.parse,
                             meta={'company': company, 'post_id': 0})
......
# -*- coding: utf-8 -*-
import scrapy
from scrapy.utils.project import get_project_settings
from ..helpers import CompanyMaker, Database
from ..items import ExaItem

# Module-level DB handle built from the project's 'DB' settings; the spider's
# __init__ uses it to load the companies to crawl.
db_settings = get_project_settings().get('DB')
db = Database(**db_settings)
class TechcrunchSpider(scrapy.Spider):
    # Short CLI name; also the media key for CompanyMaker.make_companies().
    # Dead duplicates from the merge ("techcrunch" and the old search-based
    # start URL) are removed.
    name = "tc"
    allowed_domains = ["techcrunch.com"]
    # Fallback seed; start_requests() normally generates the real tag URLs.
    start_urls = ['https://techcrunch.com/tag/Ericsson/']
def __init__(self, *args, **kwargs):
    # Optional extra SQL condition passed on the CLI, e.g. -a query="id=42".
    self.condition = kwargs.get('query')
    self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=26"
    if self.condition:
        print(self.condition)
        # NOTE(review): the CLI-supplied fragment is concatenated into the
        # SQL unescaped -- acceptable only for trusted operators; confirm
        # this spider is never driven by untrusted input.
        self.query += ' or {}'.format(self.condition)
    # Rows of (id, name, country) consumed by start_requests().
    self.comp = db.select(self.query)
    super(TechcrunchSpider, self).__init__()
def start_requests(self):
    """Yield one request per company tag page built by CompanyMaker."""
    maker = CompanyMaker(self.comp)
    maker.make_companies(self.name)
    for company in maker.get_companies():
        meta = {'company': company, 'post_id': 0}
        yield scrapy.Request(company.url, callback=self.parse, meta=meta)
def parse(self, response):
    """Extract news items from a TechCrunch tag page and follow pagination.

    The dead `pass` left over from the merge is removed.
    """
    company = response.meta['company']
    news_list = response.xpath("..//div[contains(@class, 'block block-thumb ')]")
    for block in news_list:
        item = ExaItem()
        item['date'] = block.xpath("./div/div/time/@datetime").extract_first()
        item['title'] = block.xpath("./div/h2/a/text()").extract_first()
        item['description'] = block.xpath("./div/p//text()").extract_first()
        item['url'] = block.xpath("./div/h2/a/@href").extract_first()
        item['region_id'] = company.region_id
        item['type_id'] = company.type_id
        item['media_id'] = company.media_id
        item['company_id'] = company.id
        item['post_id'] = response.meta['post_id']
        # TODO(review): items are built but never yielded -- left disabled
        # exactly as in the original; re-enable when the pipeline is ready.
        # yield item
    has_next = response.xpath("//div[contains(@class, 'pagination-container')]//li[contains(@class, 'next')]/a/@href").extract_first()
    if has_next:
        # BUG FIX: build the URL only after the None check; the original
        # concatenated before checking and raised TypeError on the last page.
        next_url = 'https://techcrunch.com' + has_next
        yield scrapy.Request(next_url, callback=self.parse,
                             meta={'company': company, 'post_id': 0})
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment