Add function to parse tags from news on TC

d5f5eaaa · Vasyl Bodnaruk · 5422394d · d5f5eaaa
Commit d5f5eaaa authored Jul 18, 2017 by Vasyl Bodnaruk
Show whitespace changes
Inline Side-by-side

Showing with 8 additions and 2 deletions

techcrunch.py exa/exa/spiders/techcrunch.py +8 -2

No files found.
--- a/exa/exa/spiders/techcrunch.py
+++ b/exa/exa/spiders/techcrunch.py
@@ -38,7 +38,7 @@ class TechcrunchSpider(BaseSpider):
                item['post_id'] = response.meta['post_id']

                if item['title']:
-                    yield item
+                    yield scrapy.Request(item['url'], callback=self.parse_tags, meta={'item': item})

            has_next = response.xpath("//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
            if has_next:
@@ -65,7 +65,7 @@ class TechcrunchSpider(BaseSpider):

                item['post_id'] = response.meta['post_id']
                if 'Crunch Report' not in item['title']:
-                    yield item
+                    yield scrapy.Request(item['url'], callback=self.parse_tags, meta={'item': item})

            has_next = response.xpath(
                "//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
@@ -75,3 +75,9 @@ class TechcrunchSpider(BaseSpider):
        except BaseException as e:
            print('We had error')
            traceback.print_exc()
+
+    def parse_tags(self, response):
+        item = response.meta['item']
+        item['tags'] = response.xpath(".//div[@class='accordion recirc-accordion']//ul//li[not(contains(@class, 'active'))]//a/text()").extract()
+
+        yield item