Commit 44e27193 authored by Vasyl Bodnaruk's avatar Vasyl Bodnaruk Committed by Andrii Marynets

wrote new scraper

parent a1df6c29
@@ -10,5 +10,6 @@ import scrapy
class ExaItem(scrapy.Item):
    date = scrapy.Field()
    media = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
@@ -38,7 +38,8 @@ class SeleniumDownloadMiddleware(object):
        self.driver.set_page_load_timeout(60)
        try:
            self.driver.get(request.url)
            time.sleep(4)
        except BaseException as e:
            print('Exception while loading page:', e)
            return None
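Note: this hunk shows only the page-load step of the middleware. A minimal sketch of how a Selenium download middleware of this kind is typically completed is given below, assuming the usual Scrapy pattern of returning an HtmlResponse built from driver.page_source; the class name, driver setup and return value are illustrative assumptions, not code from this commit.

from scrapy.http import HtmlResponse
from selenium import webdriver
import time


class SeleniumDownloadMiddlewareSketch(object):
    # illustrative sketch only; the real middleware lives in exa/middlewares.py
    def __init__(self):
        self.driver = webdriver.Chrome()  # assumes chromedriver is on PATH

    def process_request(self, request, spider):
        self.driver.set_page_load_timeout(60)
        try:
            self.driver.get(request.url)
            time.sleep(4)  # give JavaScript-rendered content time to settle
        except BaseException as e:
            print('Exception while loading page:', e)
            return None  # fall back to Scrapy's normal downloader
        # hand the rendered page back to Scrapy instead of downloading it again
        return HtmlResponse(self.driver.current_url, body=self.driver.page_source,
                            encoding='utf-8', request=request)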
@@ -8,9 +8,18 @@ import csv
class ExaPipeline(object):
    def __init__(self):
        # one shared output file for the whole crawl
        self.out = open('out.csv', 'w', newline='\n')
        super(ExaPipeline, self).__init__()

    def __del__(self):
        self.out.close()

    def process_item(self, item, spider):
        # write one INSERT statement per scraped item
        s = """INSERT INTO wp_esi_news (title, URL, media_id, type_id, region_id, publish_date)
VALUES('{0}', '{1}', '{2}', {3}, {4}, '{5}')
\n""".format(item['title'], item['url'], item['media'], 1, 3, item['date'])
        self.out.write(s)
        return item
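Because the pipeline builds the INSERT statements by string formatting, a title or URL containing a quote will produce a broken statement. A hedged alternative sketch, assuming a MySQL database reachable via pymysql (the connection parameters below are placeholders, not values from this project), would execute the same insert with parameterized queries:

import pymysql


class ExaDbPipeline(object):
    # sketch only: writes items straight to the database instead of to out.csv
    def open_spider(self, spider):
        self.conn = pymysql.connect(host='localhost', user='user',
                                    password='secret', db='wordpress')
        self.cur = self.conn.cursor()

    def close_spider(self, spider):
        self.conn.commit()
        self.conn.close()

    def process_item(self, item, spider):
        # parameterized query: the driver escapes quotes in title/url safely
        self.cur.execute(
            "INSERT INTO wp_esi_news (title, URL, media_id, type_id, region_id, publish_date) "
            "VALUES (%s, %s, %s, %s, %s, %s)",
            (item['title'], item['url'], item['media'], 1, 3, item['date']))
        return item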
@@ -14,80 +14,102 @@ BOT_NAME = 'exa'
SPIDER_MODULES = ['exa.spiders']
NEWSPIDER_MODULE = 'exa.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'exa (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 1
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'uk,ru;q=0.8,en-US;q=0.6,en;q=0.4,de;q=0.2',
    'Accept-Encoding': 'gzip, deflate, sdch, br',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Upgrade-Insecure-Requests': 1,
'Cookie': '_vdl=1; __uvt=; __qca=P0-1643001621-1493898328387; D_SID=31.134.92.67:3/rJdf3hJG6+TR13YMxSvUiTJ8h57So0kyLM43rDTdg; __cfduid=dadeec73c1d1ff4bd071afb9fb374f3211493899807; uvts=5yfEPqhJPZPFQxyd; multivariate_bot=false; s_sq=%5B%5BB%5D%5D; user_intent_path=%2Faccount%2Fsignup%3Fredirect_to%3D%2Forganization%2Fsnappr-2%2Fpress%2Fedit; user_origin_path=%2Forganization%2Fsnappr-2; jaco_uid=4ec53bb9-8854-42eb-a433-3ee13728d283; jaco_referer=; _oklv=1494501188795%2Cg8jiSlQryYuVaqCz3F6pZ0M0P0REorPO; _okdetect=%7B%22token%22%3A%2214945011896690%22%2C%22proto%22%3A%22https%3A%22%2C%22host%22%3A%22www.crunchbase.com%22%7D; olfsk=olfsk5754888747300402; _okbk=cd4%3Dtrue%2Cvi5%3D0%2Cvi4%3D1494501191139%2Cvi3%3Dactive%2Cvi2%3Dfalse%2Cvi1%3Dfalse%2Ccd8%3Dchat%2Ccd6%3D0%2Ccd5%3Daway%2Ccd3%3Dfalse%2Ccd2%3D0%2Ccd1%3D0%2C; _ok=1554-355-10-6773; wcsid=g8jiSlQryYuVaqCz3F6pZ0M0P0REorPO; hblid=fK32w02XYxOB0upN3F6pZ0M0P0REO2B6; AMCV_6B25357E519160E40A490D44%40AdobeOrg=1256414278%7CMCMID%7C86901182656108813444510944813131305330%7CMCAAMLH-1495108161%7C6%7CMCAAMB-1495108161%7CNRX38WO0n5BH8Th-nqAG_A%7CMCAID%7CNONE; _site_session=927d74172f306e57a17c1d078aed0328; _ga=GA1.2.2032591643.1493898326; _gid=GA1.2.154162834.1494583376; _hp2_props.973801186=%7B%22Logged%20In%22%3A%22false%22%2C%22Pro%22%3Afalse%7D; s_pers=%20s_getnr%3D1494583377399-Repeat%7C1557655377399%3B%20s_nrgvo%3DRepeat%7C1557655377402%3B; s_cc=true; _hp2_ses_props.973801186=%7B%22ts%22%3A1494583361786%2C%22d%22%3A%22www.crunchbase.com%22%2C%22h%22%3A%22%2Forganization%2Fsense-ly%2Fpress%22%7D; _hp2_id.973801186=%7B%22userId%22%3A%221725770103252256%22%2C%22pageviewId%22%3A%223361758318735220%22%2C%22sessionId%22%3A%228966052576718307%22%2C%22identity%22%3Anull%2C%22trackerVersion%22%3A%223.0%22%7D; D_PID=579CC756-6031-3F9A-8537-A12264CBC935; D_IID=4FDC617B-AEC3-339A-8ED0-AD1AE00E2167; D_UID=2FE95751-354F-3057-A1E7-15B142B136FE; D_HID=6ixGrm4H6tT/P1hvQqpycrUcm9v3AYCJ7RG1XjcwPR0; D_ZID=7C440876-9267-3387-8333-D2425576FA59; D_ZUID=390BF5F5-7D40-32EE-A617-F3F2C5669811; _px=eyJzIjp7ImEiOjAsImIiOjB9LCJ0IjoxNDk0NTg0NDM4MDg5LCJoIjoiYzE3YWNiMDFiZjMzNjE0NDA0NGJkZDJjNzY4OWRjNmZlODllZmU2ODY3N2ExZjVjY2U3MjljMWUxOWM0YWQzMCJ9'
}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'exa.middlewares.ExaSpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'exa.middlewares.SeleniumDownloadMiddleware': 543,
    # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
    # 'scrapy_proxies.RandomProxy': 100,
    # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'exa.pipelines.ExaPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
RETRY_TIMES = 10
RETRY_HTTP_CODES = [500, 503, 504, 416, 400, 403, 404, 408]
# Proxy list containing entries like
# http://host1:port
# http://username:password@host2:port
# http://host3:port
# ...
PROXY_LIST = '/home/andrii/work/exa/proxy_1000.txt'
# Proxy mode
# 0 = Every request gets a different proxy
# 1 = Take only one proxy from the list and assign it to every request
# 2 = Use the custom proxy set in the settings
PROXY_MODE = 0
# If proxy mode is 2, uncomment this line:
# CUSTOM_PROXY = "http://host1:port"
# -*- coding: utf-8 -*-
import scrapy
import dateparser
from ..items import ExaItem
class ExaNewsSpider(scrapy.Spider):
name = "exa_news"
allowed_domains = ["https://www.crunchbase.com/organization/sense-ly/press/"]
start_urls = ['https://www.crunchbase.com/organization/sense-ly/press/']
name = "mobihealthnews"
allowed_domains = ["www.mobihealthnews.com"]
start_urls = ['http://www.mobihealthnews.com/tag/MedTronic']
def parse(self, response):
rows = response.xpath("..//table/tbody/tr")
for i in rows:
item = ExaItem()
item['date'] = i.xpath("//td[contains(@class, 'date')]/text()").extract_first()
item['title'] = i.xpath("//td/a/text()").extract_first()
item['url'] = i.xpath("//td/a/@href").extract_first()
yield item
        try:
            rows = response.xpath("..//div[contains(@class, 'group-left')]//div[contains(@class, 'views-row')]")
            for i in rows:
                item = ExaItem()
                item['date'] = dateparser.parse(i.xpath(".//span/span[contains(@class, 'day_list')]/text()").extract_first()).date()
                item['media'] = 'mobihealthnews'
                item['title'] = i.xpath(".//span/a/text()").extract_first()
                item['url'] = 'http://www.mobihealthnews.com' + i.xpath(".//span/a/@href").extract_first()
                yield item

            has_next = response.xpath("..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/text()").extract_first()
            next_url = 'http://www.mobihealthnews.com' + response.xpath("..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/@href").extract_first()
            print(has_next, next_url)
            if has_next:
                pass
                # yield scrapy.Request(next_url, callback=self.parse)
        except BaseException as e:
            print('Error while parsing page:', e)
\ No newline at end of file
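The pagination request above is left commented out. A minimal sketch of enabling it, reusing the spider's own next_url value and parse callback, is shown below together with an illustrative crawl command; the output file name is an assumption, not part of this commit.

            if has_next:
                yield scrapy.Request(next_url, callback=self.parse)

# run from the project root:
#   scrapy crawl mobihealthnews -o items.csv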