Commit 98c9b2f3 authored by Vasyl Bodnaruk

Add functionality for selecting fresh news

parent 4d5f7514
......@@ -21,6 +21,7 @@ class MobiHealthNewsSpider(BaseSpider):
try:
rows = response.xpath("..//div[contains(@class, 'group-left')]//div[contains(@class, 'views-row')]")
company = response.meta['company']
is_duplicate = False
for i in rows:
item = ExaItem()
item['date'] = dateparser.parse(i.xpath(".//span/span[contains(@class, 'day_list')]/text()").extract_first()).date()
......@@ -32,20 +33,30 @@ class MobiHealthNewsSpider(BaseSpider):
item['post_id'] = response.meta['post_id']
if self.pipeline.check_url(item['url']):
is_duplicate = True
if not is_duplicate:
yield scrapy.Request(item['url'], callback=self.parse_tags, meta={'item': item})
else:
# print(item)
pass
has_next = response.xpath(
"..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/text()").extract_first()
next_url = 'http://www.mobihealthnews.com' + response.xpath(
"..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/@href").extract_first()
if has_next:
print("COUNT", is_duplicate)
if self.can_follow(has_next, is_duplicate):
yield scrapy.Request(next_url, callback=self.parse, meta=response.meta)
else:
print("DUPLICATE NEWS")
except BaseException as e:
print('We had error')
print(response.url)
traceback.print_exc()
def parse_tags(self, response):
    """Fill in the tags of a pending item from its article page and emit it.

    The item is read from ``response.meta['item']``; its ``'tags'`` field is
    set to the list of link texts found inside the article's tag container,
    and the item is then yielded.
    """
    # The class attribute is matched as one exact string, so the full
    # class list of the tag container has to be spelled out here.
    tags_xpath = ".//div[@class='bottom-tags field field-name-field-tags field-type-taxonomy-term-reference field-label-inline clearfix']//a/text()"
    news_item = response.meta['item']
    tag_links = response.xpath(tags_xpath)
    news_item['tags'] = tag_links.extract()
    yield news_item
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment