Commit afea8191 authored by Vasyl Bodnaruk's avatar Vasyl Bodnaruk

make the spider handle pagination on company profile pages

parent 8744a168
@@ -19,7 +19,8 @@ class ExaPipeline(object):
     def process_item(self, item, spider):
         item['title'] = ''.join(item['title']).replace('\n', ' ')
-        item['description'] = ''.join(item['description']).replace('\n', ' ')
+        if item['description']:
+            item['description'] = ''.join(item['description']).replace('\n', ' ')
         data = (item['title'], item['description'], item['url'], item['media_id'], item['type_id'],
                 item['region_id'], item['post_id'], item['date'], datetime.now().date(), item['company_id'], 0)
         query = """INSERT INTO wp_esi_news_accept (title, description, URL, media_id, type_id, region_id, post_id,
@@ -30,8 +31,6 @@ class ExaPipeline(object):
             print("UNIQUE", item)
         self.db.insert(query, data)
         self.urls.add(item['url'])
-        # self._insert_news_entiry(news, item['company_id'])
-        # self.out.write(query)
         return item
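For context, the surviving lines show the pipeline's insert-once-per-URL flow: `self.urls` is an in-memory set of URLs already written during this run, presumably consulted by a membership check on lines the diff view hides. A standalone sketch of the idea, with the database write faked by a list:

```python
# Standalone sketch of the dedup-then-insert flow; `stored` stands in for
# the real self.db.insert(query, data) call.
seen, stored = set(), []

def store(item):
    if item['url'] not in seen:
        stored.append(item)      # the pipeline runs its INSERT here
        seen.add(item['url'])
    return item                  # items always flow on to later pipelines

store({'url': 'https://techcrunch.com/a/'})
store({'url': 'https://techcrunch.com/a/'})  # duplicate: no second write
assert len(stored) == 1
```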
@@ -72,7 +72,7 @@ DOWNLOADER_MIDDLEWARES = {
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    # 'exa.pipelines.ExaPipeline': 300,
+    'exa.pipelines.ExaPipeline': 300,
 }
 # Enable and configure the AutoThrottle extension (disabled by default)
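Re-enabling the pipeline is just un-commenting its entry. The integer is Scrapy's pipeline order (0-1000, lower values run first), so a pre-processing stage could be slotted in ahead of it; the commented entry below is hypothetical, for illustration only:

```python
ITEM_PIPELINES = {
    # Hypothetical earlier stage: lower order means it runs first.
    # 'exa.pipelines.ValidationPipeline': 200,
    'exa.pipelines.ExaPipeline': 300,
}
```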
@@ -17,7 +17,7 @@ class TechcrunchSpider(scrapy.Spider):
     def __init__(self, *args, **kwargs):
         self.condition = kwargs.get('query')
-        self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=26"
+        self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=3"
         if self.condition:
             print(self.condition)
             self.query += ' or {}'.format(self.condition)
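The `query` spider argument is spliced into the seed SQL verbatim, so it widens the company selection but must come from a trusted caller (there is no escaping). A sketch of how the final query is assembled, with an illustrative condition value:

```python
# Sketch of the seed-query assembly; the condition value is an example.
base = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=3"
condition = "id=7"  # passed as: scrapy crawl <spider> -a query="id=7"
query = base + (" or {}".format(condition) if condition else "")
print(query)
# SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=3 or id=7
```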
@@ -31,8 +31,10 @@ class TechcrunchSpider(scrapy.Spider):
             yield scrapy.Request(i.url + '/', callback=self.parse, meta={'company': i, 'post_id': 0})

     def parse(self, response):
         if 'tag' in response.url:
             return self.parse_tag(response)
+        if 'company' in response.url:
+            return self.parse_company(response)

     def get_common_items(self, company):
         return {'region_id': company.region_id, 'type_id': company.type_id,
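`parse` now acts as a router keyed on the URL: tag listings are delegated to `parse_tag`, company profiles to `parse_company`, and anything else falls through. A standalone sketch of that dispatch (URLs are illustrative):

```python
# Standalone sketch of the URL-based routing in parse(); substring checks
# mirror the spider, so order matters if a URL contains both tokens.
def route(url):
    if 'tag' in url:
        return 'parse_tag'
    if 'company' in url:
        return 'parse_company'
    return None  # unmatched responses are ignored

assert route('https://techcrunch.com/tag/startups/') == 'parse_tag'
assert route('https://techcrunch.com/company/acme/') == 'parse_company'
```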
@@ -55,11 +57,37 @@ class TechcrunchSpider(scrapy.Spider):
                     yield item

             has_next = response.xpath("//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
             if has_next:
                 next_url = 'https://techcrunch.com' + has_next + '/'
-                yield scrapy.Request(next_url, callback=self.parse,
+                yield scrapy.Request(next_url, callback=self.parse_tag,
                                      meta={'company': response.meta['company'], 'post_id': 0})
         except BaseException as e:
             print('We had error')
             traceback.print_exc()
+
+    def parse_company(self, response):
+        try:
+            company = response.meta['company']
+            news_list = response.xpath(".//div[contains(@class, 'block-content-topic')]")
+            for i in news_list:
+                item = ExaItem()
+                item['date'] = i.xpath("./div/time/@datetime").extract_first()
+                item['title'] = i.xpath("./h3/a/text()").extract_first()
+                # Company profile pages carry no description
+                item['description'] = None
+                item['url'] = i.xpath("./h3/a/@href").extract_first()
+                item.update(self.get_common_items(company))
+                item['post_id'] = response.meta['post_id']
+                if 'Crunch Report' not in item['title']:
+                    yield item
+
+            has_next = response.xpath(
+                "//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
+            if has_next:
+                yield scrapy.Request(has_next, callback=self.parse_company,
+                                     meta={'company': response.meta['company'], 'post_id': 0})
+        except BaseException as e:
+            print('We had error')
+            traceback.print_exc()
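One difference from `parse_tag`: the company callback requests `has_next` as-is instead of prefixing `'https://techcrunch.com'`, which suggests the profile pages emit absolute hrefs. A defensive variant of that pagination step (a sketch to be read in the spider's context, not the committed code) that handles either case with Scrapy's `response.urljoin()`:

```python
# Sketch: response.urljoin() resolves relative hrefs against the current
# page and leaves absolute ones untouched, covering both pagination styles.
has_next = response.xpath(
    "//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href"
).extract_first()
if has_next:
    yield scrapy.Request(response.urljoin(has_next), callback=self.parse_company,
                         meta={'company': response.meta['company'], 'post_id': 0})
```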