Commit afea8191 authored by Vasyl Bodnaruk's avatar Vasyl Bodnaruk

make the spider handle pagination on company profile pages

parent 8744a168
@@ -19,7 +19,8 @@ class ExaPipeline(object):
     def process_item(self, item, spider):
         item['title'] = ''.join(item['title']).replace('\n', ' ')
-        item['description'] = ''.join(item['description']).replace('\n', ' ')
+        if item['description']:
+            item['description'] = ''.join(item['description']).replace('\n', ' ')
         data = (item['title'], item['description'], item['url'], item['media_id'], item['type_id'],
                 item['region_id'], item['post_id'], item['date'], datetime.now().date(), item['company_id'], 0)
         query = """INSERT INTO wp_esi_news_accept (title, description, URL, media_id, type_id, region_id, post_id,
@@ -30,8 +31,6 @@ class ExaPipeline(object):
             print("UNIQUE", item)
         self.db.insert(query, data)
         self.urls.add(item['url'])
-        # self._insert_news_entiry(news, item['company_id'])
-        # self.out.write(query)
         return item
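For context, the surviving lines show the pipeline's insert-once-per-URL flow: `self.urls` is an in-memory set of URLs already written during this run, presumably consulted by a membership check on lines the diff view hides. A standalone sketch of the idea, with the database write faked by a list:

```python
# Standalone sketch of the dedup-then-insert flow; `stored` stands in for
# the real self.db.insert(query, data) call.
seen, stored = set(), []

def store(item):
    if item['url'] not in seen:
        stored.append(item)      # the pipeline runs its INSERT here
        seen.add(item['url'])
    return item                  # items always flow on to later pipelines

store({'url': 'https://techcrunch.com/a/'})
store({'url': 'https://techcrunch.com/a/'})  # duplicate: no second write
assert len(stored) == 1
```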
@@ -72,7 +72,7 @@ DOWNLOADER_MIDDLEWARES = {
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    # 'exa.pipelines.ExaPipeline': 300,
+    'exa.pipelines.ExaPipeline': 300,
 }
 # Enable and configure the AutoThrottle extension (disabled by default)
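Re-enabling the pipeline is just un-commenting its entry. The integer is Scrapy's pipeline order (0-1000, lower values run first), so a pre-processing stage could be slotted in ahead of it; the commented entry below is hypothetical, for illustration only:

```python
ITEM_PIPELINES = {
    # Hypothetical earlier stage: lower order means it runs first.
    # 'exa.pipelines.ValidationPipeline': 200,
    'exa.pipelines.ExaPipeline': 300,
}
```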
@@ -17,7 +17,7 @@ class TechcrunchSpider(scrapy.Spider):
     def __init__(self, *args, **kwargs):
         self.condition = kwargs.get('query')
-        self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=26"
+        self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=3"
         if self.condition:
             print(self.condition)
             self.query += ' or {}'.format(self.condition)
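The `query` spider argument is spliced into the seed SQL verbatim, so it widens the company selection but must come from a trusted caller (there is no escaping). A sketch of how the final query is assembled, with an illustrative condition value:

```python
# Sketch of the seed-query assembly; the condition value is an example.
base = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=3"
condition = "id=7"  # passed as: scrapy crawl <spider> -a query="id=7"
query = base + (" or {}".format(condition) if condition else "")
print(query)
# SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=3 or id=7
```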
@@ -31,8 +31,10 @@ class TechcrunchSpider(scrapy.Spider):
             yield scrapy.Request(i.url + '/', callback=self.parse, meta={'company': i, 'post_id': 0})

     def parse(self, response):
         if 'tag' in response.url:
             return self.parse_tag(response)
+        if 'company' in response.url:
+            return self.parse_company(response)

     def get_common_items(self, company):
         return {'region_id': company.region_id, 'type_id': company.type_id,
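`parse` now acts as a router keyed on the URL: tag listings are delegated to `parse_tag`, company profiles to `parse_company`, and anything else falls through. A standalone sketch of that dispatch (URLs are illustrative):

```python
# Standalone sketch of the URL-based routing in parse(); substring checks
# mirror the spider, so order matters if a URL contains both tokens.
def route(url):
    if 'tag' in url:
        return 'parse_tag'
    if 'company' in url:
        return 'parse_company'
    return None  # unmatched responses are ignored

assert route('https://techcrunch.com/tag/startups/') == 'parse_tag'
assert route('https://techcrunch.com/company/acme/') == 'parse_company'
```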
@@ -55,11 +57,37 @@ class TechcrunchSpider(scrapy.Spider):
                     yield item

             has_next = response.xpath("//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
             if has_next:
                 next_url = 'https://techcrunch.com' + has_next + '/'
-                yield scrapy.Request(next_url, callback=self.parse,
+                yield scrapy.Request(next_url, callback=self.parse_tag,
                                      meta={'company': response.meta['company'], 'post_id': 0})
         except BaseException as e:
             print('We had error')
             traceback.print_exc()
+
+    def parse_company(self, response):
+        try:
+            company = response.meta['company']
+            news_list = response.xpath(".//div[contains(@class, 'block-content-topic')]")
+            for i in news_list:
+                item = ExaItem()
+                item['date'] = i.xpath("./div/time/@datetime").extract_first()
+                item['title'] = i.xpath("./h3/a/text()").extract_first()
+                # Company profile pages carry no description
+                item['description'] = None
+                item['url'] = i.xpath("./h3/a/@href").extract_first()
+                item.update(self.get_common_items(company))
+                item['post_id'] = response.meta['post_id']
+                if 'Crunch Report' not in item['title']:
+                    yield item
+
+            has_next = response.xpath(
+                "//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
+            if has_next:
+                yield scrapy.Request(has_next, callback=self.parse_company,
+                                     meta={'company': response.meta['company'], 'post_id': 0})
+        except BaseException as e:
+            print('We had error')
+            traceback.print_exc()
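One difference from `parse_tag`: the company callback requests `has_next` as-is instead of prefixing `'https://techcrunch.com'`, which suggests the profile pages emit absolute hrefs. A defensive variant of that pagination step (a sketch to be read in the spider's context, not the committed code) that handles either case with Scrapy's `response.urljoin()`:

```python
# Sketch: response.urljoin() resolves relative hrefs against the current
# page and leaves absolute ones untouched, covering both pagination styles.
has_next = response.xpath(
    "//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href"
).extract_first()
if has_next:
    yield scrapy.Request(response.urljoin(has_next), callback=self.parse_company,
                         meta={'company': response.meta['company'], 'post_id': 0})
```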