Commit 8744a168 authored by Vasyl Bodnaruk's avatar Vasyl Bodnaruk

make tag scraping a separate function for a better dev flow

parent 6ebd5b73
......@@ -57,7 +57,7 @@ DEFAULT_REQUEST_HEADERS = {
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
'exa.middlewares.SeleniumDownloadMiddleware': 543,
# 'exa.middlewares.SeleniumDownloadMiddleware': 543,
# 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
# 'scrapy_proxies.RandomProxy': 100,
# 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
......@@ -72,7 +72,7 @@ DOWNLOADER_MIDDLEWARES = {
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# Item pipelines applied to every scraped item, mapped to their priority.
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# NOTE(review): this text is a commit diff — both the active and the
# commented-out ExaPipeline entry appear; the commit disables the pipeline,
# so confirm which line should survive before running.
ITEM_PIPELINES = {
'exa.pipelines.ExaPipeline': 300,
# 'exa.pipelines.ExaPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
......
......@@ -28,9 +28,17 @@ class TechcrunchSpider(scrapy.Spider):
companies = CompanyMaker(self.comp)
companies.make_companies(self.name)
for i in companies.get_companies():
yield scrapy.Request(i.url, callback=self.parse, meta={'company': i, 'post_id': 0})
yield scrapy.Request(i.url + '/', callback=self.parse, meta={'company': i, 'post_id': 0})
def parse(self, response):
    """Route the downloaded page to the tag parser.

    Only tag-listing URLs are handled; any other URL falls through and
    yields nothing.
    """
    if 'tag' not in response.url:
        return None
    return self.parse_tag(response)
def get_common_items(self, company):
    """Build the dict of identifier fields shared by every scraped item."""
    shared = {field: getattr(company, field)
              for field in ('region_id', 'type_id', 'media_id')}
    shared['company_id'] = company.id
    return shared
def parse_tag(self, response):
try:
news_list = response.xpath("..//div[contains(@class, 'block block-thumb ')]")
company = response.meta['company']
......@@ -44,46 +52,15 @@ class TechcrunchSpider(scrapy.Spider):
item.update(self.get_common_items(company))
item['post_id'] = response.meta['post_id']
# print(item)
yield item
has_next = response.xpath(
"//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
next_url = 'https://techcrunch.com' + has_next
if has_next:
next_url = 'https://techcrunch.com' + has_next + '/'
yield scrapy.Request(next_url, callback=self.parse,
meta={'company': response.meta['company'], 'post_id': 0})
except BaseException as e:
print('We had error')
traceback.print_exc()
def get_common_items(self, company):
    """Return the shared id fields attached to every item."""
    return dict(
        region_id=company.region_id,
        type_id=company.type_id,
        media_id=company.media_id,
        company_id=company.id,
    )
def parse_tag(self, response):
    """Parse a TechCrunch tag-listing page.

    Yields one ``ExaItem`` per article block on the page, then follows the
    "next" pagination link when one exists.  Expects ``response.meta`` to
    carry ``company`` (the source company record) and ``post_id``.
    """
    # NOTE(review): the leading ".." steps to the parent of the selector
    # root before descending — kept as-is from the original; confirm the
    # intent against the live page markup.
    news_list = response.xpath("..//div[contains(@class, 'block block-thumb ')]")
    company = response.meta['company']
    for block in news_list:
        item = ExaItem()
        item['date'] = block.xpath("./div/div/time/@datetime").extract_first()
        item['title'] = block.xpath("./div/h2/a/text()").extract_first()
        item['description'] = block.xpath("./div/p//text()").extract_first()
        item['url'] = block.xpath("./div/h2/a/@href").extract_first()
        item.update(self.get_common_items(company))
        item['post_id'] = response.meta['post_id']
        # Fix: the original printed the item and left ``yield item``
        # commented out, so no items were ever emitted.
        yield item
    has_next = response.xpath(
        "//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
    if has_next:
        # Fix: concatenate only after the None-check — extract_first()
        # returns None on the last page, and str + None raises TypeError.
        next_url = 'https://techcrunch.com' + has_next
        yield scrapy.Request(next_url, callback=self.parse,
                             meta={'company': response.meta['company'], 'post_id': 0})
\ No newline at end of file
traceback.print_exc()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment