Commit 293b7a26 authored by Vasyl Bodnaruk

add functionality for scraping TechCrunch

First step: add the company name as a tag in the URL
parent 227cd025
 from collections import namedtuple

-Company = namedtuple('Company', 'id, url, media_id, type_id, region_id')
+Company = namedtuple('Company', 'id, url, media_id, type_id, region_id, name')
 # 'http://www.mobihealthnews.com/tag/apple'
 # 'http://www.mobihealthnews.com/tag/clover-health'
 # 'http://www.mobihealthnews.com/tag/MedTronic'

 Entity = namedtuple('Entity', 'id, name, country')


 class CompanyMaker:
     def __init__(self, companies=None):
         # self.in_site = ['http://www.mobihealthnews.com/tag/Twitter', 'http://www.mobihealthnews.com/tag/intel',
         #                 'http://www.mobihealthnews.com/tag/ibm', 'http://www.mobihealthnews.com/tag/Salesforce',
         #                 'http://www.mobihealthnews.com/tag/google']
         self.in_site = list()
         if companies:
             for i in companies:
                 self.in_site.append(Entity(i[0], i[1], i[2]))
         self.companies = list()

-    def make_companies(self):
-        self._make_list()
+    def make_companies(self, media):
+        if media == 'mhn':
+            self._make_list_for_mhn()
+        elif media == 'tc':
+            self._make_list_for_tc()

     def get_companies(self):
         return self.companies

-    def _make_list(self):
+    def _make_list_for_mhn(self):
         for i in self.in_site:
             tag = i.name
             if ' ' in tag:
                 tag = tag.replace(' ', '-')
             if '.' in tag:
                 tag = tag.replace('.', '')
-            self.companies.append(Company(i.id, 'http://www.mobihealthnews.com/tag/' + tag, 43, 2, 2))
+            self.companies.append(Company(i.id, 'http://www.mobihealthnews.com/tag/' + tag, 43, 2, 2, i.name))
+
+    def _make_list_for_tc(self):
+        for i in self.in_site:
+            self.companies.append(Company(i.id, 'https://techcrunch.com/tag/' + i.name, 81, 2, 2, i.name))
\ No newline at end of file
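For orientation, here is a minimal usage sketch of the new per-media dispatch. It assumes helpers.py is importable directly as `helpers` (the project itself uses a relative import), and the sample rows are invented, shaped like the `SELECT id, name, country FROM wp_esi_entity` result:

```python
from helpers import CompanyMaker  # assumed import path

# Invented rows shaped like the wp_esi_entity SELECT result.
rows = [(26, 'Clover Health', 'USA'), (27, 'Behold.ai', 'GBR')]

maker = CompanyMaker(rows)
maker.make_companies('tc')      # 'tc' routes to _make_list_for_tc()
for c in maker.get_companies():
    print(c.name, '->', c.url)  # Clover Health -> https://techcrunch.com/tag/Clover Health
```

Note the asymmetry: only the 'mhn' branch slugs the name (spaces become '-', dots are dropped), while the 'tc' branch appends i.name verbatim, so a name containing spaces lands unescaped in the tag URL.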
@@ -53,6 +53,8 @@ class SeleniumDownloadMiddleware(object):
             from pyvirtualdisplay import Display
             self.display = Display()
             self.display.start()
+        else:
+            self.display = None
         if middleware['driver'] == 'Chrome':
             from selenium.webdriver.chrome.options import Options
...
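The new else branch guarantees self.display exists whether or not a virtual display was started, so teardown code can test the attribute instead of raising AttributeError. A sketch of that contract; the class and method names here are illustrative, not taken from the diff:

```python
class DisplayContractSketch:
    """Mirrors the attribute contract the middleware now follows."""

    def __init__(self, use_display=False):
        if use_display:
            from pyvirtualdisplay import Display  # imported only when needed
            self.display = Display()
            self.display.start()
        else:
            self.display = None  # attribute always defined

    def close(self):
        if self.display is not None:  # safe without hasattr()
            self.display.stop()
```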
@@ -15,21 +15,21 @@ class ExaPipeline(object):
     def __init__(self):
         self.db = Database(**db)
         self.urls = {i[0] for i in self.db.select('select url from wp_esi_news_accept')}
-        print(self.urls)
         super(ExaPipeline, self).__init__()

     def process_item(self, item, spider):
-        print(item)
+        item['title'] = ''.join(item['title']).replace('\n', ' ')
         item['description'] = ''.join(item['description']).replace('\n', ' ')
         data = (item['title'], item['description'], item['url'], item['media_id'], item['type_id'],
                 item['region_id'], item['post_id'], item['date'], datetime.now().date(), item['company_id'], 0)
         query = """INSERT INTO wp_esi_news_accept (title, description, URL, media_id, type_id, region_id, post_id,
                    publish_date, record_date, company_id, is_accepted) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);\n"""
-        print(item)
         if item['url'] in self.urls:
-            print("DUPLICATE")
+            print("DUPLICATE", item)
         else:
+            print("UNIQUE", item)
             self.db.insert(query, data)
+            self.urls.add(item['url'])
         # self._insert_news_entiry(news, item['company_id'])
         # self.out.write(query)
...
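The pipeline now deduplicates in two layers: the set is seeded from wp_esi_news_accept at startup, and self.urls.add(item['url']) extends it after every insert, so the same URL cannot be written twice within a single run either. The rule in isolation, with invented URLs:

```python
seen = {'https://techcrunch.com/2017/old-post/'}  # preloaded from the table

def should_insert(url):
    """Mirror of the pipeline's check: skip known URLs, remember new ones."""
    if url in seen:
        return False
    seen.add(url)  # counterpart of self.urls.add(item['url'])
    return True

assert should_insert('https://techcrunch.com/2017/new-post/')
assert not should_insert('https://techcrunch.com/2017/new-post/')
```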
@@ -12,7 +12,7 @@ db = Database(**db_settings)

 class MobiHealthNewsSpider(scrapy.Spider):
-    name = "mobihealthnews"
+    name = "mhn"
     allowed_domains = ["www.mobihealthnews.com"]

     def __init__(self, *args, **kwargs):
@@ -26,7 +26,7 @@ class MobiHealthNewsSpider(scrapy.Spider):
     def start_requests(self):
         companies = CompanyMaker(self.comp)
-        companies.make_companies()
+        companies.make_companies(self.name)
         for i in companies.get_companies():
             yield scrapy.Request(i.url, callback=self.parse, meta={'company': i,
                                                                    'post_id': 0})
...
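Since the spider names now double as the media key passed to make_companies(), 'mhn' and 'tc' must stay in sync with the branches in CompanyMaker. A sketch of launching both spiders from a script; it assumes a standard Scrapy project layout so get_project_settings() resolves, and the query value is only an example condition:

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('mhn')                # spiders are looked up by their new names
process.crawl('tc', query='id=31')  # `query` reaches __init__ via kwargs
process.start()
```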
 # -*- coding: utf-8 -*-
 import scrapy
+from scrapy.utils.project import get_project_settings
+
+from ..helpers import CompanyMaker, Database
+from ..items import ExaItem
+
+db_settings = get_project_settings().get('DB')
+db = Database(**db_settings)


 class TechcrunchSpider(scrapy.Spider):
-    name = "techcrunch"
+    name = "tc"
     allowed_domains = ["techcrunch.com"]
-    start_urls = ['https://techcrunch.com/search/Behold.ai#stq=Behold.ai/']
+    start_urls = ['https://techcrunch.com/tag/Ericsson/']
+
+    def __init__(self, *args, **kwargs):
+        self.condition = kwargs.get('query')
+        self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=26"
+        if self.condition:
+            print(self.condition)
+            self.query += ' or {}'.format(self.condition)
+        self.comp = db.select(self.query)
+        super(TechcrunchSpider, self).__init__()
+
+    def start_requests(self):
+        companies = CompanyMaker(self.comp)
+        companies.make_companies(self.name)
+        for i in companies.get_companies():
+            yield scrapy.Request(i.url, callback=self.parse, meta={'company': i,
+                                                                   'post_id': 0})

     def parse(self, response):
-        pass
+        news_list = response.xpath("..//div[contains(@class, 'block block-thumb ')]")
+        company = response.meta['company']
+        for i in news_list:
+            item = ExaItem()
+            item['date'] = i.xpath("./div/div/time/@datetime").extract_first()
+            item['title'] = i.xpath("./div/h2/a/text()").extract_first()
+            item['description'] = i.xpath("./div/p//text()").extract_first()
+            item['url'] = i.xpath("./div/h2/a/@href").extract_first()
+            item['region_id'] = company.region_id
+            item['type_id'] = company.type_id
+            item['media_id'] = company.media_id
+            item['company_id'] = company.id
+            item['post_id'] = response.meta['post_id']
+            # yield item
+
+        has_next = response.xpath("//div[contains(@class, 'pagination-container')]//li[contains(@class, 'next')]/a/@href").extract_first()
+        if has_next:
+            next_url = 'https://techcrunch.com' + has_next
+            yield scrapy.Request(next_url, callback=self.parse, meta={'company': response.meta['company'], 'post_id': 0})
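As committed, parse() fills an ExaItem per result but leaves yield item commented out, so the spider only emits pagination requests for now. On that pagination: the next-page href is relative, which is why the host is prefixed by hand. If the project's Scrapy is 1.4 or newer, response.follow resolves relative URLs itself; a sketch of that variant, not what this commit does:

```python
# Inside parse(), as an alternative to prefixing 'https://techcrunch.com':
has_next = response.xpath(
    "//div[contains(@class, 'pagination-container')]"
    "//li[contains(@class, 'next')]/a/@href").extract_first()
if has_next:
    yield response.follow(has_next, callback=self.parse,
                          meta={'company': response.meta['company'], 'post_id': 0})
```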