esi-data-scrapping / esi-table-data · Commits

Commit a1df6c29, authored May 11, 2017 by Vasyl Bodnaruk, committed by Andrii Marynets on May 11, 2017
Parent: 61045b1d

    Wrote spider

Showing 6 changed files with 64 additions and 55 deletions (+64 / -55):
    items.py          (exa/exa/items.py)            +3  -3
    middlewares.py    (exa/exa/middlewares.py)      +29 -38
    pipelines.py      (exa/exa/pipelines.py)        +5  -0
    settings.py       (exa/exa/settings.py)         +15 -12
    exa_news.py       (exa/exa/spiders/exa_news.py) +9  -2
    requirements.txt                                +3  -0
exa/exa/items.py
@@ -9,6 +9,6 @@ import scrapy
 class ExaItem(scrapy.Item):
-    # define the fields for your item here like:
-    # name = scrapy.Field()
-    pass
+    date = scrapy.Field()
+    title = scrapy.Field()
+    url = scrapy.Field()
exa/exa/middlewares.py
@@ -4,53 +4,44 @@
 #
 # See documentation in:
 # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
+import time
+
 from scrapy import signals
+from scrapy.http import HtmlResponse
+from selenium import webdriver
 
 
-class ExaSpiderMiddleware(object):
+class SeleniumDownloadMiddleware(object):
     # Not all methods need to be defined. If a method is not defined,
     # scrapy acts as if the spider middleware does not modify the
     # passed objects.
 
     @classmethod
     def from_crawler(cls, crawler):
         # This method is used by Scrapy to create your spiders.
-        s = cls()
-        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
-        return s
-
-    def process_spider_input(self, response, spider):
-        # Called for each response that goes through the spider
-        # middleware and into the spider.
-        # Should return None or raise an exception.
-        return None
-
-    def process_spider_output(self, response, result, spider):
-        # Called with the results returned from the Spider, after
-        # it has processed the response.
-        # Must return an iterable of Request, dict or Item objects.
-        for i in result:
-            yield i
-
-    def process_spider_exception(self, response, exception, spider):
-        # Called when a spider or process_spider_input() method
-        # (from other spider middleware) raises an exception.
-        # Should return either None or an iterable of Response, dict
-        # or Item objects.
-        pass
-
-    def process_start_requests(self, start_requests, spider):
-        # Called with the start requests of the spider, and works
-        # similarly to the process_spider_output() method, except
-        # that it doesn't have a response associated.
-        # Must return only requests (not items).
-        for r in start_requests:
-            yield r
+        middleware = cls()
+        crawler.signals.connect(middleware.spider_opened, signals.spider_opened)
+        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
+        return middleware
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+        self.driver = webdriver.Chrome()
+        self.driver.maximize_window()
+
+    def spider_closed(self, spider):
+        if self.driver:
+            self.driver.quit()
+        else:
+            print('Driver closed by exception or error')
+
+    def process_request(self, request, spider):
+        self.driver.set_page_load_timeout(60)
+        try:
+            self.driver.get(request.url)
+            time.sleep(4)
+        except BaseException as e:
+            print('Exception in process loading page')
+            return None
+        body = str.encode(self.driver.page_source)
+        return HtmlResponse(self.driver.current_url, body=body, encoding='utf-8', request=request)
\ No newline at end of file
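Because process_request() returns an HtmlResponse, Scrapy skips its own downloader for that request and the spider receives the page exactly as Chrome rendered it. One caveat: webdriver.Chrome() as written opens a visible browser window. A minimal sketch of a display-less variant (my assumption, not part of this commit; ChromeOptions is selenium's public API, and the --headless flag needs Chrome 59+):

    # Sketch only: the same driver setup, but with Chrome started headless.
    from selenium import webdriver

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--window-size=1920,1080')   # stand-in for maximize_window()
    driver = webdriver.Chrome(chrome_options=options)  # selenium 3.x keyword argument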
exa/exa/pipelines.py
@@ -4,8 +4,13 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import csv
 
 
 class ExaPipeline(object):
     def process_item(self, item, spider):
+        with open('out.csv', 'w', newline='') as csvfile:
+            writer = csv.writer(csvfile, delimiter=' ')
+            writer.writerow(str(item))
         return item
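Note that process_item() reopens out.csv in 'w' mode for every item, so each write truncates the file and only the last scraped item survives; writerow(str(item)) also splits the string into one column per character. A sketch of the usual fix (my correction, not what this commit contains), using Scrapy's open_spider/close_spider pipeline hooks to keep one file handle for the whole crawl:

    import csv

    class ExaPipeline(object):
        def open_spider(self, spider):
            # Open once per crawl instead of once per item.
            self.csvfile = open('out.csv', 'w', newline='')
            self.writer = csv.writer(self.csvfile)

        def process_item(self, item, spider):
            # Write the fields explicitly, one row per item.
            self.writer.writerow([item.get('date'), item.get('title'), item.get('url')])
            return item

        def close_spider(self, spider):
            self.csvfile.close()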
exa/exa/settings.py
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'exa.spiders'
 #USER_AGENT = 'exa (+http://www.yourdomain.com)'
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 3
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16
@@ -39,10 +39,13 @@ ROBOTSTXT_OBEY = True
 #TELNETCONSOLE_ENABLED = False
 
 # Override the default request headers:
-#DEFAULT_REQUEST_HEADERS = {
-#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-#   'Accept-Language': 'en',
-#}
+DEFAULT_REQUEST_HEADERS = {
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+    'Accept-Language': 'en',
+    'Accept-Encoding': 'gzip, deflate, sdch',
+    'Connection': 'keep-alive',
+    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
+}
 
 # Enable or disable spider middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
@@ -52,9 +55,9 @@ ROBOTSTXT_OBEY = True
 # Enable or disable downloader middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-#DOWNLOADER_MIDDLEWARES = {
-#    'exa.middlewares.MyCustomDownloaderMiddleware': 543,
-#}
+DOWNLOADER_MIDDLEWARES = {
+    'exa.middlewares.SeleniumDownloadMiddleware': 543,
+}
 
 # Enable or disable extensions
 # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
@@ -64,9 +67,9 @@ ROBOTSTXT_OBEY = True
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'exa.pipelines.ExaPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'exa.pipelines.ExaPipeline': 300,
+}
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
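With priority 543, SeleniumDownloadMiddleware simply takes the slot the commented-out template reserved for a custom downloader middleware. In Scrapy, lower DOWNLOADER_MIDDLEWARES values sit closer to the engine, and because this middleware's process_request() returns a response itself, middlewares with higher numbers never see the request at all.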
exa/exa/spiders/exa_news.py
 # -*- coding: utf-8 -*-
 import scrapy
 
+from ..items import ExaItem
 
 
 class ExaNewsSpider(scrapy.Spider):
     name = "exa_news"
     allowed_domains = ["https://www.crunchbase.com/organization/sense-ly/press/"]
-    start_urls = ['http://https://www.crunchbase.com/organization/sense-ly/press//']
+    start_urls = ['https://www.crunchbase.com/organization/sense-ly/press/']
 
     def parse(self, response):
-        pass
+        rows = response.xpath("..//table/tbody/tr")
+        for i in rows:
+            item = ExaItem()
+            item['date'] = i.xpath("//td[contains(@class, 'date')]/text()").extract_first()
+            item['title'] = i.xpath("//td/a/text()").extract_first()
+            item['url'] = i.xpath("//td/a/@href").extract_first()
+            yield item
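A detail that would bite here: inside the loop, the "//td…" expressions are absolute, so each i.xpath("//td…") searches the whole document rather than the current row, and extract_first() returns the first matching cell for every iteration; all yielded items come out identical. A sketch of the relative-path form (my correction, not what this commit does):

    def parse(self, response):
        # ".//" keeps each query scoped to the current <tr> row.
        for row in response.xpath("//table/tbody/tr"):
            item = ExaItem()
            item['date'] = row.xpath(".//td[contains(@class, 'date')]/text()").extract_first()
            item['title'] = row.xpath(".//td/a/text()").extract_first()
            item['url'] = row.xpath(".//td/a/@href").extract_first()
            yield item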
requirements.txt
+Scrapy==1.3.3
+selenium==3.4.1
+dateparser==0.6.0
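Two notes on the pins: selenium==3.4.1 alone is not enough to drive the browser, since webdriver.Chrome() also needs a chromedriver binary on PATH that matches the installed Chrome; and dateparser==0.6.0 is added here without being imported by any of the changed files, presumably in anticipation of parsing the scraped date strings.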