Commit 44e27193 authored by Vasyl Bodnaruk's avatar Vasyl Bodnaruk Committed by Andrii Marynets

wrote new scraper

parent a1df6c29
@@ -10,5 +10,6 @@ import scrapy
class ExaItem(scrapy.Item):
    date = scrapy.Field()
    media = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
@@ -38,7 +38,8 @@ class SeleniumDownloadMiddleware(object):
        self.driver.set_page_load_timeout(60)
        try:
            self.driver.get(request.url)
            time.sleep(4)
        except BaseException as e:
            print('Exception while loading page:', e)
            return None
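Note: this hunk shows only the page-load step of the middleware. A minimal sketch of how a Selenium download middleware of this kind is typically completed is given below, assuming the usual Scrapy pattern of returning an HtmlResponse built from driver.page_source; the class name, driver setup and return value are illustrative assumptions, not code from this commit.

from scrapy.http import HtmlResponse
from selenium import webdriver
import time


class SeleniumDownloadMiddlewareSketch(object):
    # illustrative sketch only; the real middleware lives in exa/middlewares.py
    def __init__(self):
        self.driver = webdriver.Chrome()  # assumes chromedriver is on PATH

    def process_request(self, request, spider):
        self.driver.set_page_load_timeout(60)
        try:
            self.driver.get(request.url)
            time.sleep(4)  # give JavaScript-rendered content time to settle
        except BaseException as e:
            print('Exception while loading page:', e)
            return None  # fall back to Scrapy's normal downloader
        # hand the rendered page back to Scrapy instead of downloading it again
        return HtmlResponse(self.driver.current_url, body=self.driver.page_source,
                            encoding='utf-8', request=request)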
@@ -8,9 +8,18 @@ import csv
class ExaPipeline(object):
    def __init__(self):
        # one shared output file for the whole crawl
        self.out = open('out.csv', 'w', newline='\n')
        super(ExaPipeline, self).__init__()

    def __del__(self):
        self.out.close()

    def process_item(self, item, spider):
        # write one INSERT statement per scraped item
        s = """INSERT INTO wp_esi_news (title, URL, media_id, type_id, region_id, publish_date)
VALUES('{0}', '{1}', '{2}', {3}, {4}, '{5}')
\n""".format(item['title'], item['url'], item['media'], 1, 3, item['date'])
        self.out.write(s)
        return item
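Because the pipeline builds the INSERT statements by string formatting, a title or URL containing a quote will produce a broken statement. A hedged alternative sketch, assuming a MySQL database reachable via pymysql (the connection parameters below are placeholders, not values from this project), would execute the same insert with parameterized queries:

import pymysql


class ExaDbPipeline(object):
    # sketch only: writes items straight to the database instead of to out.csv
    def open_spider(self, spider):
        self.conn = pymysql.connect(host='localhost', user='user',
                                    password='secret', db='wordpress')
        self.cur = self.conn.cursor()

    def close_spider(self, spider):
        self.conn.commit()
        self.conn.close()

    def process_item(self, item, spider):
        # parameterized query: the driver escapes quotes in title/url safely
        self.cur.execute(
            "INSERT INTO wp_esi_news (title, URL, media_id, type_id, region_id, publish_date) "
            "VALUES (%s, %s, %s, %s, %s, %s)",
            (item['title'], item['url'], item['media'], 1, 3, item['date']))
        return item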
@@ -14,80 +14,102 @@ BOT_NAME = 'exa'
SPIDER_MODULES = ['exa.spiders']
NEWSPIDER_MODULE = 'exa.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'exa (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 1
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'uk,ru;q=0.8,en-US;q=0.6,en;q=0.4,de;q=0.2',
    'Accept-Encoding': 'gzip, deflate, sdch, br',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Upgrade-Insecure-Requests': 1,
'Cookie': '_vdl=1; __uvt=; __qca=P0-1643001621-1493898328387; D_SID=31.134.92.67:3/rJdf3hJG6+TR13YMxSvUiTJ8h57So0kyLM43rDTdg; __cfduid=dadeec73c1d1ff4bd071afb9fb374f3211493899807; uvts=5yfEPqhJPZPFQxyd; multivariate_bot=false; s_sq=%5B%5BB%5D%5D; user_intent_path=%2Faccount%2Fsignup%3Fredirect_to%3D%2Forganization%2Fsnappr-2%2Fpress%2Fedit; user_origin_path=%2Forganization%2Fsnappr-2; jaco_uid=4ec53bb9-8854-42eb-a433-3ee13728d283; jaco_referer=; _oklv=1494501188795%2Cg8jiSlQryYuVaqCz3F6pZ0M0P0REorPO; _okdetect=%7B%22token%22%3A%2214945011896690%22%2C%22proto%22%3A%22https%3A%22%2C%22host%22%3A%22www.crunchbase.com%22%7D; olfsk=olfsk5754888747300402; _okbk=cd4%3Dtrue%2Cvi5%3D0%2Cvi4%3D1494501191139%2Cvi3%3Dactive%2Cvi2%3Dfalse%2Cvi1%3Dfalse%2Ccd8%3Dchat%2Ccd6%3D0%2Ccd5%3Daway%2Ccd3%3Dfalse%2Ccd2%3D0%2Ccd1%3D0%2C; _ok=1554-355-10-6773; wcsid=g8jiSlQryYuVaqCz3F6pZ0M0P0REorPO; hblid=fK32w02XYxOB0upN3F6pZ0M0P0REO2B6; AMCV_6B25357E519160E40A490D44%40AdobeOrg=1256414278%7CMCMID%7C86901182656108813444510944813131305330%7CMCAAMLH-1495108161%7C6%7CMCAAMB-1495108161%7CNRX38WO0n5BH8Th-nqAG_A%7CMCAID%7CNONE; _site_session=927d74172f306e57a17c1d078aed0328; _ga=GA1.2.2032591643.1493898326; _gid=GA1.2.154162834.1494583376; _hp2_props.973801186=%7B%22Logged%20In%22%3A%22false%22%2C%22Pro%22%3Afalse%7D; s_pers=%20s_getnr%3D1494583377399-Repeat%7C1557655377399%3B%20s_nrgvo%3DRepeat%7C1557655377402%3B; s_cc=true; _hp2_ses_props.973801186=%7B%22ts%22%3A1494583361786%2C%22d%22%3A%22www.crunchbase.com%22%2C%22h%22%3A%22%2Forganization%2Fsense-ly%2Fpress%22%7D; _hp2_id.973801186=%7B%22userId%22%3A%221725770103252256%22%2C%22pageviewId%22%3A%223361758318735220%22%2C%22sessionId%22%3A%228966052576718307%22%2C%22identity%22%3Anull%2C%22trackerVersion%22%3A%223.0%22%7D; D_PID=579CC756-6031-3F9A-8537-A12264CBC935; D_IID=4FDC617B-AEC3-339A-8ED0-AD1AE00E2167; D_UID=2FE95751-354F-3057-A1E7-15B142B136FE; D_HID=6ixGrm4H6tT/P1hvQqpycrUcm9v3AYCJ7RG1XjcwPR0; D_ZID=7C440876-9267-3387-8333-D2425576FA59; D_ZUID=390BF5F5-7D40-32EE-A617-F3F2C5669811; _px=eyJzIjp7ImEiOjAsImIiOjB9LCJ0IjoxNDk0NTg0NDM4MDg5LCJoIjoiYzE3YWNiMDFiZjMzNjE0NDA0NGJkZDJjNzY4OWRjNmZlODllZmU2ODY3N2ExZjVjY2U3MjljMWUxOWM0YWQzMCJ9'
}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'exa.middlewares.ExaSpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'exa.middlewares.SeleniumDownloadMiddleware': 543,
    # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
    # 'scrapy_proxies.RandomProxy': 100,
    # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'exa.pipelines.ExaPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
RETRY_TIMES = 10
RETRY_HTTP_CODES = [500, 503, 504, 416, 400, 403, 404, 408]
# Proxy list containing entries like
# http://host1:port
# http://username:password@host2:port
# http://host3:port
# ...
PROXY_LIST = '/home/andrii/work/exa/proxy_1000.txt'
# Proxy mode
# 0 = Every request gets a different proxy
# 1 = Take only one proxy from the list and assign it to every request
# 2 = Use the custom proxy set in the settings
PROXY_MODE = 0
# If proxy mode is 2, uncomment this line:
# CUSTOM_PROXY = "http://host1:port"
# -*- coding: utf-8 -*-
import scrapy
import dateparser
from ..items import ExaItem
class ExaNewsSpider(scrapy.Spider):
name = "exa_news"
allowed_domains = ["https://www.crunchbase.com/organization/sense-ly/press/"]
start_urls = ['https://www.crunchbase.com/organization/sense-ly/press/']
name = "mobihealthnews"
allowed_domains = ["www.mobihealthnews.com"]
start_urls = ['http://www.mobihealthnews.com/tag/MedTronic']
def parse(self, response):
rows = response.xpath("..//table/tbody/tr")
for i in rows:
item = ExaItem()
item['date'] = i.xpath("//td[contains(@class, 'date')]/text()").extract_first()
item['title'] = i.xpath("//td/a/text()").extract_first()
item['url'] = i.xpath("//td/a/@href").extract_first()
yield item
        try:
            rows = response.xpath("..//div[contains(@class, 'group-left')]//div[contains(@class, 'views-row')]")
            for i in rows:
                item = ExaItem()
                item['date'] = dateparser.parse(i.xpath(".//span/span[contains(@class, 'day_list')]/text()").extract_first()).date()
                item['media'] = 'mobihealthnews'
                item['title'] = i.xpath(".//span/a/text()").extract_first()
                item['url'] = 'http://www.mobihealthnews.com' + i.xpath(".//span/a/@href").extract_first()
                yield item

            has_next = response.xpath("..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/text()").extract_first()
            next_url = 'http://www.mobihealthnews.com' + response.xpath("..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/@href").extract_first()
            print(has_next, next_url)
            if has_next:
                pass
                # yield scrapy.Request(next_url, callback=self.parse)
        except BaseException as e:
            print('Error while parsing page:', e)
\ No newline at end of file
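The pagination request above is left commented out. A minimal sketch of enabling it, reusing the spider's own next_url value and parse callback, is shown below together with an illustrative crawl command; the output file name is an assumption, not part of this commit.

            if has_next:
                yield scrapy.Request(next_url, callback=self.parse)

# run from the project root:
#   scrapy crawl mobihealthnews -o items.csv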