Commit 03681fe7 authored by Andrii Marynets's avatar Andrii Marynets

Add splash to crawler

parent 94e4c2e3
......@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 1
SPLASH_URL = 'http://127.0.0.1:8050'
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
......@@ -54,20 +54,20 @@ DEFAULT_REQUEST_HEADERS = {
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'exa.middlewares.ExaSpiderMiddleware': 543,
# }
SPIDER_MIDDLEWARES = {
    # Required by scrapy-splash whenever requests use ``cache_args``
    # (the cb spider caches ``lua_source``): it de-duplicates large
    # Splash arguments so they are not serialized with every request.
    # Leaving this commented out breaks ``cache_args`` handling.
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
# Splash-aware request fingerprinting so a Splash-rendered request is
# not collapsed with (or shadowed by) its plain-HTTP counterpart.
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# Downloader middleware chain: Splash for JS rendering (cookies kept in
# sync at 723/725 per scrapy-splash docs), HTTP compression re-enabled
# after Splash, random User-Agent rotation replacing the stock
# UserAgentMiddleware, retries, and Crawlera as the outgoing proxy.
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    # Disabled: scrapy_fake_useragent supplies the User-Agent instead.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
    # 'exa.middlewares.SeleniumDownloadMiddleware': 543,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
    # 'scrapy_proxies.RandomProxy': 100,
    # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
    # NOTE(review): the diff left this key twice (old priority 610 and
    # new 750, with a missing comma between them). Keep a single entry
    # with the post-commit priority.
    'scrapy_crawlera.CrawleraMiddleware': 750,
}
# Enable or disable extensions
......
# -*- coding: utf-8 -*-
import dateparser
import scrapy
from scrapy.utils.project import get_project_settings
from scrapy_splash import SplashRequest
from w3lib.http import basic_auth_header
from pkgutil import get_data
from .base import BaseSpider
from ..items import ExaItem
api_key = get_project_settings().get('CRAWLERA_APIKEY')
class CbSpider(BaseSpider):
name = "cb"
allowed_domains = ["www.crunchbase.com"]
crawlera_enabled = True
crawlera_apikey = get_project_settings().get('CRAWLERA_APIKEY')
# crawlera_enabled = True
# crawlera_apikey = api_key
# start_urls = ['http://www.crunchbase.com/organization/sense-ly/press/']
co = 0
def __init__(self, *args, **kwargs):
    # Load the Crawlera session Lua script bundled inside the ``exa``
    # package; every SplashRequest passes it to Splash as
    # ``lua_source`` (and caches it via ``cache_args``).
    raw_script = get_data('exa', 'scripts/crawlera.lua')
    self.LUA_SOURCE = raw_script.decode('utf-8')
    super(CbSpider, self).__init__(*args, **kwargs)
def start_requests(self):
    """Schedule one Splash-rendered request per tracked company.

    Each request runs the bundled Crawlera Lua script inside Splash so
    the page is fetched through the proxy and JavaScript is rendered
    before ``parse`` receives the response.
    """
    for company in self.companies(self.name):
        try:
            # The old plain ``scrapy.Request`` yield (pre-Splash) was
            # left in by the diff alongside this one, causing every
            # URL to be requested twice; only the SplashRequest is
            # kept.
            yield SplashRequest(
                url=company.url,
                callback=self.parse,
                meta={'company': company, 'post_id': 0},
                args={
                    'wait': 5,
                    'lua_source': self.LUA_SOURCE,
                    'crawlera_user': self.settings['CRAWLERA_APIKEY'],
                },
                cache_args=['lua_source'],
            )
        except Exception:
            # One company with a bad/missing URL must not abort the
            # whole crawl, but the previous bare ``except: pass`` hid
            # every failure; log it and continue.
            self.logger.exception(
                'Failed to schedule request for company %r', company)
def parse(self, response):
rows = response.xpath("//table/tr")
if 'page=1' in response.url:
rows = rows[1:]
rows = response.xpath(".//div[@class='grid-body']/div")
company = response.meta['company']
is_duplicate = False
for i in rows:
item = ExaItem()
item['date'] = self.format_date(i.xpath("./td[contains(@class, 'date')]/text()").extract_first())
item['title'] = i.xpath("./td/a/text()").extract_first()
item['url'] = i.xpath("./td/a/@href").extract_first()
item['date'] = self.format_date(i.xpath("./div/div/field-formatter/span/@title").extract_first())
item['title'] = i.xpath("./div/div/press-reference/div/div/span/span[2]/a/text()").extract_first().strip()
item['url'] = i.xpath("./div/div/press-reference/div/div/span/span[2]/a/@href").extract_first()
item.update(self.get_common_items(company))
item['media_id'] = self._get_media(i)
item['description'] = None
......@@ -50,9 +63,9 @@ class CbSpider(BaseSpider):
if self.fresh:
break
yield item
next_url = self._next_url(response.url)
if len(rows) != 0 and self.can_follow(next_url, is_duplicate):
yield scrapy.Request(next_url, callback=self.parse, meta=response.meta)
# next_url = self._next_url(response.url)
# if len(rows) != 0 and self.can_follow(next_url, is_duplicate):
# yield scrapy.Request(next_url, callback=self.parse, meta=response.meta)
def _get_media(self, elem):
media_name = elem.xpath("./td[contains(@class, 'article')]/span/text()").extract_first()
......
......@@ -4,6 +4,7 @@ python-scrapyd-api==2.0.1
scrapyd-client==1.1.0
scrapy-proxies==0.3
scrapy-crawlera==1.2.4
scrapy-splash==0.7.2
newspaper3k==0.2.2
PyVirtualDisplay==0.2.1
selenium==3.4.1
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment