Move scraping by tag into a separate function for a better dev flow

8744a168 · Vasyl Bodnaruk · 6ebd5b73 · 8744a168 · 8744a168
Commit 8744a168 authored Jun 12, 2017 by Vasyl Bodnaruk
Show whitespace changes
Inline Side-by-side

Showing with 13 additions and 36 deletions

settings.py exa/exa/settings.py +2 -2

techcrunch.py exa/exa/spiders/techcrunch.py +11 -34

No files found.
--- a/exa/exa/settings.py
+++ b/exa/exa/settings.py
@@ -57,7 +57,7 @@ DEFAULT_REQUEST_HEADERS = {
 # Enable or disable downloader middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
 DOWNLOADER_MIDDLEWARES = {
-    'exa.middlewares.SeleniumDownloadMiddleware': 543,
+    # 'exa.middlewares.SeleniumDownloadMiddleware': 543,
    # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
    # 'scrapy_proxies.RandomProxy': 100,
    # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
@@ -72,7 +72,7 @@ DOWNLOADER_MIDDLEWARES = {
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    'exa.pipelines.ExaPipeline': 300,
+    # 'exa.pipelines.ExaPipeline': 300,
 }

 # Enable and configure the AutoThrottle extension (disabled by default)

--- a/exa/exa/spiders/techcrunch.py
+++ b/exa/exa/spiders/techcrunch.py
@@ -28,46 +28,21 @@ class TechcrunchSpider(scrapy.Spider):
        companies = CompanyMaker(self.comp)
        companies.make_companies(self.name)
        for i in companies.get_companies():
-            yield scrapy.Request(i.url, callback=self.parse, meta={'company': i, 'post_id': 0})
+            yield scrapy.Request(i.url + '/', callback=self.parse, meta={'company': i, 'post_id': 0})

    def parse(self, response):
-        try:
-            news_list = response.xpath("..//div[contains(@class, 'block block-thumb ')]")
-            company = response.meta['company']
-            for i in news_list:
-                item = ExaItem()
-                item['date'] = i.xpath("./div/div/time/@datetime").extract_first()
-                item['title'] = i.xpath("./div/h2/a/text()").extract_first()
-                item['description'] = i.xpath("./div/p//text()").extract_first()
-                item['url'] = i.xpath("./div/h2/a/@href").extract_first()
-
-                item.update(self.get_common_items(company))
-
-                item['post_id'] = response.meta['post_id']
-                # print(item)
-
-                yield item
-
-            has_next = response.xpath(
-                "//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
-            next_url = 'https://techcrunch.com' + has_next
-            if has_next:
-                yield scrapy.Request(next_url, callback=self.parse,
-                                     meta={'company': response.meta['company'], 'post_id': 0})
-        except BaseException as e:
-            print('We had error')
-            traceback.print_exc()
+            if 'tag' in response.url:
+                return self.parse_tag(response)

    def get_common_items(self, company):
        return {'region_id': company.region_id, 'type_id': company.type_id,
                'media_id': company.media_id, 'company_id': company.id}

    def parse_tag(self, response):
+        try:
            news_list = response.xpath("..//div[contains(@class, 'block block-thumb ')]")
            company = response.meta['company']
-        print('FOOOOOOOOOOOOOOOOOOOOOO')
            for i in news_list:
-            print('GGGGGGGGGGGGGGGGGGGGGG')
                item = ExaItem()
                item['date'] = i.xpath("./div/div/time/@datetime").extract_first()
                item['title'] = i.xpath("./div/h2/a/text()").extract_first()
@@ -77,13 +52,15 @@ class TechcrunchSpider(scrapy.Spider):
                item.update(self.get_common_items(company))

                item['post_id'] = response.meta['post_id']
-            print(item)

-            # yield item
+                yield item

            has_next = response.xpath(
                "//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
-        next_url = 'https://techcrunch.com' + has_next
            if has_next:
+                next_url = 'https://techcrunch.com' + has_next + '/'
                yield scrapy.Request(next_url, callback=self.parse,
                                     meta={'company': response.meta['company'], 'post_id': 0})
+        except BaseException as e:
+            print('We had error')
+            traceback.print_exc()
\ No newline at end of file