Add functionality for selecting fresh news

98c9b2f3 · Vasyl Bodnaruk · 4d5f7514 · 98c9b2f3
Commit 98c9b2f3 authored Jul 21, 2017 by Vasyl Bodnaruk
Show whitespace changes
Inline Side-by-side

Showing with 14 additions and 3 deletions

mobihealthnews.py exa/exa/spiders/mobihealthnews.py +14 -3

No files found.
--- a/exa/exa/spiders/mobihealthnews.py
+++ b/exa/exa/spiders/mobihealthnews.py
@@ -21,6 +21,7 @@ class MobiHealthNewsSpider(BaseSpider):
        try:
            rows = response.xpath("..//div[contains(@class, 'group-left')]//div[contains(@class, 'views-row')]")
            company = response.meta['company']
+            is_duplicate = False
            for i in rows:
                item = ExaItem()
                item['date'] = dateparser.parse(i.xpath(".//span/span[contains(@class, 'day_list')]/text()").extract_first()).date()
@@ -32,20 +33,30 @@ class MobiHealthNewsSpider(BaseSpider):

                item['post_id'] = response.meta['post_id']

+                if self.pipeline.check_url(item['url']):
+                    is_duplicate = True
+                if not is_duplicate:
                    yield scrapy.Request(item['url'], callback=self.parse_tags, meta={'item': item})
+                else:
+                    # print(item)
+                    pass
            has_next = response.xpath(
                "..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/text()").extract_first()
            next_url = 'http://www.mobihealthnews.com' + response.xpath(
                "..//div[contains(@class, 'text-center')]/ul/li[contains(@class, 'next')]/a/@href").extract_first()
-
-            if has_next:
+            print("COUNT", is_duplicate)
+            if self.can_follow(has_next, is_duplicate):
                yield scrapy.Request(next_url, callback=self.parse, meta=response.meta)
+            else:
+                print("DUPLICATE NEWS")

        except BaseException as e:
            print('We had error')
+            print(response.url)
            traceback.print_exc()

    def parse_tags(self, response):
        """Fill in the ``tags`` field of the item carried in the request meta.

        The item is read from ``response.meta['item']``, its ``tags`` entry is
        set to the text of every link found inside the tags ``div`` of the
        page, and the completed item is then yielded.
        """
        # Exact-match on the full class attribute of the tags container.
        tags_xpath = ".//div[@class='bottom-tags field field-name-field-tags field-type-taxonomy-term-reference field-label-inline clearfix']//a/text()"

        article = response.meta['item']
        tag_links = response.xpath(tags_xpath)
        article['tags'] = tag_links.extract()
        yield article
+