Commit d5f5eaaa authored by Vasyl Bodnaruk

Add function to parse tags from news on TC

parent 5422394d
......@@ -38,7 +38,7 @@ class TechcrunchSpider(BaseSpider):
item['post_id'] = response.meta['post_id']
if item['title']:
yield item
yield scrapy.Request(item['url'], callback=self.parse_tags, meta={'item': item})
has_next = response.xpath("//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
if has_next:
......@@ -65,7 +65,7 @@ class TechcrunchSpider(BaseSpider):
item['post_id'] = response.meta['post_id']
if 'Crunch Report' not in item['title']:
yield item
yield scrapy.Request(item['url'], callback=self.parse_tags, meta={'item': item})
has_next = response.xpath(
"//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
......@@ -75,3 +75,9 @@ class TechcrunchSpider(BaseSpider):
except BaseException as e:
print('We had error')
traceback.print_exc()
def parse_tags(self, response):
    """Fill in the ``tags`` field of the item carried by *response*.

    The item is read from ``response.meta['item']``. The text of every
    link found under a non-``active`` ``<li>`` inside the
    ``accordion recirc-accordion`` div is stored as a list in
    ``item['tags']``, and the item is then yielded.
    """
    tags_xpath = ".//div[@class='accordion recirc-accordion']//ul//li[not(contains(@class, 'active'))]//a/text()"
    # Look the item up first so that a missing 'item' key fails before
    # any selector work is done.
    article = response.meta['item']
    tag_nodes = response.xpath(tags_xpath)
    article['tags'] = tag_nodes.extract()
    yield article
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment