Commit 293b7a26 authored by Vasyl Bodnaruk's avatar Vasyl Bodnaruk

Add functionality for scraping TechCrunch

First step add name of company as tag in url
parent 227cd025
from collections import namedtuple

# Example tag URLs this module builds:
# 'http://www.mobihealthnews.com/tag/apple'
# 'http://www.mobihealthnews.com/tag/clover-health'
# 'http://www.mobihealthnews.com/tag/MedTronic'

# A company ready for scraping: the tag-page URL plus the DB metadata the
# pipeline needs (media/type/region ids and the original company name).
Company = namedtuple('Company', 'id, url, media_id, type_id, region_id, name')
# A raw DB entity row: (id, name, country).
Entity = namedtuple('Entity', 'id, name, country')


class CompanyMaker:
    """Turns DB entity rows into per-media Company records with tag URLs."""

    def __init__(self, companies=None):
        """companies -- iterable of (id, name, country) rows, or None."""
        # Entities loaded from the DB (empty when no rows are given).
        self.in_site = []
        if companies:
            for row in companies:
                self.in_site.append(Entity(row[0], row[1], row[2]))
        # Company records produced by make_companies().
        self.companies = []

    def make_companies(self, media):
        """Populate self.companies for the given media key.

        media -- spider name: 'mhn' (MobiHealthNews) or 'tc' (TechCrunch).
        Unknown keys are silently ignored (self.companies stays empty).
        """
        if media == 'mhn':
            self._make_list_for_mhn()
        elif media == 'tc':
            self._make_list_for_tc()

    def get_companies(self):
        """Return the list built by make_companies()."""
        return self.companies

    def _make_list_for_mhn(self):
        # MobiHealthNews tags are slugs: spaces become hyphens, dots dropped.
        for entity in self.in_site:
            # BUG FIX: str.find() returns 0 (falsy) for a match at index 0
            # and -1 (truthy) when absent, so the old truthiness guards were
            # inverted; unconditional replace() is a no-op when not found.
            tag = entity.name.replace(' ', '-').replace('.', '')
            self.companies.append(
                Company(entity.id, 'http://www.mobihealthnews.com/tag/' + tag,
                        43, 2, 2, entity.name))

    def _make_list_for_tc(self):
        # TechCrunch tag pages use the raw company name in the URL.
        for entity in self.in_site:
            self.companies.append(
                Company(entity.id, 'https://techcrunch.com/tag/' + entity.name,
                        81, 2, 2, entity.name))
\ No newline at end of file
......@@ -53,6 +53,8 @@ class SeleniumDownloadMiddleware(object):
from pyvirtualdisplay import Display
self.display = Display()
self.display.start()
else:
self.display = None
if middleware['driver'] == 'Chrome':
from selenium.webdriver.chrome.options import Options
......
......@@ -15,21 +15,21 @@ class ExaPipeline(object):
def __init__(self):
    # Open the project DB and cache every already-accepted URL so that
    # process_item() can detect duplicates without a per-item query.
    self.db = Database(**db)
    self.urls = {i[0] for i in self.db.select('select url from wp_esi_news_accept')}
    print(self.urls)
    super(ExaPipeline, self).__init__()
def process_item(self, item, spider):
print(item)
item['title'] = ''.join(item['title']).replace('\n', ' ')
item['description'] = ''.join(item['description']).replace('\n', ' ')
data = (item['title'], item['description'], item['url'], item['media_id'], item['type_id'],
item['region_id'], item['post_id'], item['date'], datetime.now().date(), item['company_id'], 0)
query = """INSERT INTO wp_esi_news_accept (title, description, URL, media_id, type_id, region_id, post_id,
publish_date, record_date, company_id, is_accepted) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);\n"""
print(item)
if item['url'] in self.urls:
print("DUPLICATE")
print("DUPLICATE", item)
else:
print("UNIQUE", item)
self.db.insert(query, data)
self.urls.add(item['url'])
# self._insert_news_entiry(news, item['company_id'])
# self.out.write(query)
......
......@@ -12,7 +12,7 @@ db = Database(**db_settings)
class MobiHealthNewsSpider(scrapy.Spider):
    # Short CLI name; also the media key passed to
    # CompanyMaker.make_companies() in start_requests(). The dead duplicate
    # assignment ("mobihealthnews") left over from the merge is removed.
    name = "mhn"
    allowed_domains = ["www.mobihealthnews.com"]
def __init__(self, *args, **kwargs):
......@@ -26,7 +26,7 @@ class MobiHealthNewsSpider(scrapy.Spider):
def start_requests(self):
    """Seed one request per company tag page for this spider's media."""
    companies = CompanyMaker(self.comp)
    # BUG FIX: the merge left a stale no-argument call alongside this one;
    # make_companies() requires the media key (the spider name), so the
    # old call would raise TypeError.
    companies.make_companies(self.name)
    for company in companies.get_companies():
        yield scrapy.Request(company.url, callback=self.parse,
                             meta={'company': company, 'post_id': 0})
......
# -*- coding: utf-8 -*-
import scrapy
from scrapy.utils.project import get_project_settings
from ..helpers import CompanyMaker, Database
from ..items import ExaItem

# Module-level DB handle built from the project's 'DB' settings; the spider's
# __init__ uses it to load the companies to crawl.
db_settings = get_project_settings().get('DB')
db = Database(**db_settings)
class TechcrunchSpider(scrapy.Spider):
    # Short CLI name; also the media key for CompanyMaker.make_companies().
    # Dead duplicates from the merge ("techcrunch" and the old search-based
    # start URL) are removed.
    name = "tc"
    allowed_domains = ["techcrunch.com"]
    # Fallback seed; start_requests() normally generates the real tag URLs.
    start_urls = ['https://techcrunch.com/tag/Ericsson/']
def __init__(self, *args, **kwargs):
    # Optional extra SQL condition passed on the CLI, e.g. -a query="id=42".
    self.condition = kwargs.get('query')
    self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=26"
    if self.condition:
        print(self.condition)
        # NOTE(review): the CLI-supplied fragment is concatenated into the
        # SQL unescaped -- acceptable only for trusted operators; confirm
        # this spider is never driven by untrusted input.
        self.query += ' or {}'.format(self.condition)
    # Rows of (id, name, country) consumed by start_requests().
    self.comp = db.select(self.query)
    super(TechcrunchSpider, self).__init__()
def start_requests(self):
    """Yield one request per company tag page built by CompanyMaker."""
    maker = CompanyMaker(self.comp)
    maker.make_companies(self.name)
    for company in maker.get_companies():
        meta = {'company': company, 'post_id': 0}
        yield scrapy.Request(company.url, callback=self.parse, meta=meta)
def parse(self, response):
    """Extract news items from a TechCrunch tag page and follow pagination.

    The dead `pass` left over from the merge is removed.
    """
    company = response.meta['company']
    news_list = response.xpath("..//div[contains(@class, 'block block-thumb ')]")
    for block in news_list:
        item = ExaItem()
        item['date'] = block.xpath("./div/div/time/@datetime").extract_first()
        item['title'] = block.xpath("./div/h2/a/text()").extract_first()
        item['description'] = block.xpath("./div/p//text()").extract_first()
        item['url'] = block.xpath("./div/h2/a/@href").extract_first()
        item['region_id'] = company.region_id
        item['type_id'] = company.type_id
        item['media_id'] = company.media_id
        item['company_id'] = company.id
        item['post_id'] = response.meta['post_id']
        # TODO(review): items are built but never yielded -- left disabled
        # exactly as in the original; re-enable when the pipeline is ready.
        # yield item
    has_next = response.xpath("//div[contains(@class, 'pagination-container')]//li[contains(@class, 'next')]/a/@href").extract_first()
    if has_next:
        # BUG FIX: build the URL only after the None check; the original
        # concatenated before checking and raised TypeError on the last page.
        next_url = 'https://techcrunch.com' + has_next
        yield scrapy.Request(next_url, callback=self.parse,
                             meta={'company': company, 'post_id': 0})
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment