Commit 6ebd5b73 authored by Vasyl Bodnaruk

simple refactor

parent 9655cc2e
@@ -13,7 +13,7 @@ db = Database(**db_settings)
 class TechcrunchSpider(scrapy.Spider):
     name = "tc"
     allowed_domains = ["techcrunch.com"]
-    start_urls = ['https://techcrunch.com/tag/Ericsson/']
+    # start_urls = ['https://techcrunch.com/tag/Ericsson/']

     def __init__(self, *args, **kwargs):
         self.condition = kwargs.get('query')
@@ -41,19 +41,49 @@ class TechcrunchSpider(scrapy.Spider):
                 item['description'] = i.xpath("./div/p//text()").extract_first()
                 item['url'] = i.xpath("./div/h2/a/@href").extract_first()
-                item['region_id'] = company.region_id
-                item['type_id'] = company.type_id
-                item['media_id'] = company.media_id
-                item['company_id'] = company.id
+                item.update(self.get_common_items(company))
                 item['post_id'] = response.meta['post_id']
-                # print(item)
-                # yield item
+                yield item

-            has_next = response.xpath("//div[contains(@class, 'pagination-container')]//li[contains(@class, 'next')]/a/@href").extract_first()
+            has_next = response.xpath(
+                "//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
             next_url = 'https://techcrunch.com' + has_next
             if has_next:
-                yield scrapy.Request(next_url, callback=self.parse, meta={'company': response.meta['company'], 'post_id': 0})
+                yield scrapy.Request(next_url, callback=self.parse,
+                                     meta={'company': response.meta['company'], 'post_id': 0})

         except BaseException as e:
             print('We had error')
             traceback.print_exc()
+
+    def get_common_items(self, company):
+        return {'region_id': company.region_id, 'type_id': company.type_id,
+                'media_id': company.media_id, 'company_id': company.id}
+
+    def parse_tag(self, response):
+        news_list = response.xpath("..//div[contains(@class, 'block block-thumb ')]")
+        company = response.meta['company']
+        print('FOOOOOOOOOOOOOOOOOOOOOO')
+        for i in news_list:
+            print('GGGGGGGGGGGGGGGGGGGGGG')
+            item = ExaItem()
+            item['date'] = i.xpath("./div/div/time/@datetime").extract_first()
+            item['title'] = i.xpath("./div/h2/a/text()").extract_first()
+            item['description'] = i.xpath("./div/p//text()").extract_first()
+            item['url'] = i.xpath("./div/h2/a/@href").extract_first()
+            item.update(self.get_common_items(company))
+            item['post_id'] = response.meta['post_id']
+            print(item)
+            # yield item
+
+        has_next = response.xpath(
+            "//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
+        next_url = 'https://techcrunch.com' + has_next
+        if has_next:
+            yield scrapy.Request(next_url, callback=self.parse,
+                                 meta={'company': response.meta['company'], 'post_id': 0})
\ No newline at end of file
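
The core of the refactor is the new get_common_items helper, which replaces four repeated per-field assignments with a single item.update() call shared by the parse callbacks. Below is a minimal, self-contained sketch of that pattern; the Company namedtuple and the ExaItem definition here are hypothetical stand-ins for the project's real classes, and the XPath expressions are simplified from the diff above.

    import scrapy
    from collections import namedtuple

    # Hypothetical stand-in for the project's company record (assumption);
    # it only needs the four attributes read by get_common_items().
    Company = namedtuple('Company', ['id', 'region_id', 'type_id', 'media_id'])


    class ExaItem(scrapy.Item):
        # Hypothetical re-declaration of the item fields used in the diff.
        date = scrapy.Field()
        title = scrapy.Field()
        description = scrapy.Field()
        url = scrapy.Field()
        region_id = scrapy.Field()
        type_id = scrapy.Field()
        media_id = scrapy.Field()
        company_id = scrapy.Field()
        post_id = scrapy.Field()


    class TechcrunchSpider(scrapy.Spider):
        name = "tc"

        def get_common_items(self, company):
            # Company-level fields built once, instead of four separate
            # item[...] assignments in every parse callback.
            return {'region_id': company.region_id, 'type_id': company.type_id,
                    'media_id': company.media_id, 'company_id': company.id}

        def parse(self, response):
            company = response.meta['company']
            for block in response.xpath("//div[contains(@class, 'block block-thumb')]"):
                item = ExaItem()
                item['title'] = block.xpath("./div/h2/a/text()").extract_first()
                item['url'] = block.xpath("./div/h2/a/@href").extract_first()
                # One call merges all shared company fields into the item.
                item.update(self.get_common_items(company))
                item['post_id'] = response.meta.get('post_id', 0)
                yield item

Because both parse and the new parse_tag go through the same helper, any future company-level field only has to be added in one place.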