Commit afea8191 authored by Vasyl Bodnaruk

make the spider handle pagination on company profile pages

parent 8744a168
@@ -19,6 +19,7 @@ class ExaPipeline(object):
     def process_item(self, item, spider):
         item['title'] = ''.join(item['title']).replace('\n', ' ')
-        item['description'] = ''.join(item['description']).replace('\n', ' ')
+        if item['description']:
+            item['description'] = ''.join(item['description']).replace('\n', ' ')
         data = (item['title'], item['description'], item['url'], item['media_id'], item['type_id'],
                 item['region_id'], item['post_id'], item['date'], datetime.now().date(), item['company_id'], 0)
@@ -30,8 +31,6 @@ class ExaPipeline(object):
print("UNIQUE", item) print("UNIQUE", item)
self.db.insert(query, data) self.db.insert(query, data)
self.urls.add(item['url']) self.urls.add(item['url'])
# self._insert_news_entiry(news, item['company_id'])
# self.out.write(query)
return item return item
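A note on the guard added above: `parse_company` (further down in this commit) emits items with `description = None`, and `''.join(None)` raises `TypeError`. A minimal standalone sketch of the guarded behavior, shown as a plain function rather than the full pipeline class:

```python
# Sketch only: mirrors the normalization in ExaPipeline.process_item.
def normalize(item):
    item['title'] = ''.join(item['title']).replace('\n', ' ')
    # Company-profile items carry description=None; ''.join(None) would
    # raise TypeError, so the field is only joined when actually present.
    if item['description']:
        item['description'] = ''.join(item['description']).replace('\n', ' ')
    return item

print(normalize({'title': ['Fresh\nfunding'], 'description': None}))
# -> {'title': 'Fresh funding', 'description': None}
```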
@@ -72,7 +72,7 @@ DOWNLOADER_MIDDLEWARES = {
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    # 'exa.pipelines.ExaPipeline': 300,
+    'exa.pipelines.ExaPipeline': 300,
 }

 # Enable and configure the AutoThrottle extension (disabled by default)
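Uncommenting this entry is what actually activates the pipeline; the value `300` is its order, and Scrapy runs pipelines in ascending order within the 0-1000 range. A small sketch, with a second, purely hypothetical pipeline added only to illustrate the ordering:

```python
ITEM_PIPELINES = {
    'exa.pipelines.ExaPipeline': 300,      # runs first: dedupes by URL, inserts into the DB
    # 'exa.pipelines.LaterPipeline': 800,  # hypothetical: a higher number would run after
}
```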
@@ -17,7 +17,7 @@ class TechcrunchSpider(scrapy.Spider):
     def __init__(self, *args, **kwargs):
         self.condition = kwargs.get('query')
-        self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=26"
+        self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=3"
         if self.condition:
             print(self.condition)
             self.query += ' or {}'.format(self.condition)
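For reference, this is how the final SQL comes together: the hard-coded `WHERE 1 and id=3` filter is extended with an optional `or` clause taken verbatim from the spider's `query` argument. A standalone sketch (the `id=7` value is hypothetical, as if passed via `scrapy crawl techcrunch -a query="id=7"`):

```python
# Sketch of the query composition in __init__ (condition value assumed).
condition = "id=7"  # would arrive through kwargs.get('query')
query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=3"
if condition:
    query += ' or {}'.format(condition)
print(query)
# -> SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=3 or id=7
```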
@@ -33,6 +33,8 @@ class TechcrunchSpider(scrapy.Spider):
     def parse(self, response):
         if 'tag' in response.url:
             return self.parse_tag(response)
+        if 'company' in response.url:
+            return self.parse_company(response)

     def get_common_items(self, company):
         return {'region_id': company.region_id, 'type_id': company.type_id,
@@ -55,11 +57,37 @@ class TechcrunchSpider(scrapy.Spider):
                     yield item
+            has_next = response.xpath("//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
+            if has_next:
+                next_url = 'https://techcrunch.com' + has_next + '/'
+                yield scrapy.Request(next_url, callback=self.parse_tag,
+                                     meta={'company': response.meta['company'], 'post_id': 0})
+        except BaseException as e:
+            print('We had error')
+            traceback.print_exc()
+
+    def parse_company(self, response):
+        try:
+            company = response.meta['company']
+            news_list = response.xpath(".//div[contains(@class, 'block-content-topic')]")
+            for i in news_list:
+                item = ExaItem()
+                item['date'] = i.xpath("./div/time/@datetime").extract_first()
+                item['title'] = i.xpath("./h3/a/text()").extract_first()
+                # Because we don't have description here
+                item['description'] = None
+                item['url'] = i.xpath("./h3/a/@href").extract_first()
+                item.update(self.get_common_items(company))
+                item['post_id'] = response.meta['post_id']
+                if 'Crunch Report' not in item['title']:
+                    yield item
             has_next = response.xpath(
                 "//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
             if has_next:
-                next_url = 'https://techcrunch.com' + has_next + '/'
-                yield scrapy.Request(next_url, callback=self.parse,
+                yield scrapy.Request(has_next, callback=self.parse_company,
                                      meta={'company': response.meta['company'], 'post_id': 0})
         except BaseException as e:
             print('We had error')
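The pagination pattern this commit adds is the same in both callbacks: yield the current page's items, then re-request the "next" link with the same callback and carried-over `meta`. A trimmed, self-contained sketch (the spider name and plain item dict are assumptions; `response.follow` is a standard Scrapy helper that resolves relative hrefs, which would also cover the hand-built `next_url` case in `parse_tag`):

```python
import scrapy

class CompanyNewsSketchSpider(scrapy.Spider):
    # Hypothetical, trimmed-down spider showing only the pagination
    # pattern used by parse_company in this commit.
    name = 'company_news_sketch'

    def parse_company(self, response):
        for block in response.xpath(".//div[contains(@class, 'block-content-topic')]"):
            yield {
                'title': block.xpath("./h3/a/text()").extract_first(),
                'url': block.xpath("./h3/a/@href").extract_first(),
            }
        next_href = response.xpath(
            "//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
        if next_href:
            # response.follow resolves relative hrefs against the current page,
            # so it works whether the site returns absolute or relative links.
            yield response.follow(next_href, callback=self.parse_company,
                                  meta={'company': response.meta.get('company'), 'post_id': 0})
```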