Commit 6ebd5b73 authored by Vasyl Bodnaruk

simple refactor

parent 9655cc2e
@@ -13,7 +13,7 @@ db = Database(**db_settings)
 class TechcrunchSpider(scrapy.Spider):
     name = "tc"
     allowed_domains = ["techcrunch.com"]
-    start_urls = ['https://techcrunch.com/tag/Ericsson/']
+    # start_urls = ['https://techcrunch.com/tag/Ericsson/']

     def __init__(self, *args, **kwargs):
         self.condition = kwargs.get('query')
@@ -41,19 +41,49 @@ class TechcrunchSpider(scrapy.Spider):
                 item['description'] = i.xpath("./div/p//text()").extract_first()
                 item['url'] = i.xpath("./div/h2/a/@href").extract_first()
-                item['region_id'] = company.region_id
-                item['type_id'] = company.type_id
-                item['media_id'] = company.media_id
-                item['company_id'] = company.id
+                item.update(self.get_common_items(company))
                 item['post_id'] = response.meta['post_id']
-                # print(item)
-                # yield item
+                yield item

-            has_next = response.xpath("//div[contains(@class, 'pagination-container')]//li[contains(@class, 'next')]/a/@href").extract_first()
+            has_next = response.xpath(
+                "//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
             next_url = 'https://techcrunch.com' + has_next
             if has_next:
-                yield scrapy.Request(next_url, callback=self.parse, meta={'company': response.meta['company'], 'post_id': 0})
+                yield scrapy.Request(next_url, callback=self.parse,
+                                     meta={'company': response.meta['company'], 'post_id': 0})

         except BaseException as e:
             print('We had error')
             traceback.print_exc()
+
+    def get_common_items(self, company):
+        return {'region_id': company.region_id, 'type_id': company.type_id,
+                'media_id': company.media_id, 'company_id': company.id}
+
+    def parse_tag(self, response):
+        news_list = response.xpath("..//div[contains(@class, 'block block-thumb ')]")
+        company = response.meta['company']
+        print('FOOOOOOOOOOOOOOOOOOOOOO')
+        for i in news_list:
+            print('GGGGGGGGGGGGGGGGGGGGGG')
+            item = ExaItem()
+            item['date'] = i.xpath("./div/div/time/@datetime").extract_first()
+            item['title'] = i.xpath("./div/h2/a/text()").extract_first()
+            item['description'] = i.xpath("./div/p//text()").extract_first()
+            item['url'] = i.xpath("./div/h2/a/@href").extract_first()
+            item.update(self.get_common_items(company))
+            item['post_id'] = response.meta['post_id']
+            print(item)
+            # yield item
+
+        has_next = response.xpath(
+            "//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
+        next_url = 'https://techcrunch.com' + has_next
+        if has_next:
+            yield scrapy.Request(next_url, callback=self.parse,
+                                 meta={'company': response.meta['company'], 'post_id': 0})
\ No newline at end of file
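
The core of the refactor is the new get_common_items helper, which replaces four repeated per-field assignments with a single item.update() call shared by the parse callbacks. Below is a minimal, self-contained sketch of that pattern; the Company namedtuple and the ExaItem definition here are hypothetical stand-ins for the project's real classes, and the XPath expressions are simplified from the diff above.

    import scrapy
    from collections import namedtuple

    # Hypothetical stand-in for the project's company record (assumption);
    # it only needs the four attributes read by get_common_items().
    Company = namedtuple('Company', ['id', 'region_id', 'type_id', 'media_id'])


    class ExaItem(scrapy.Item):
        # Hypothetical re-declaration of the item fields used in the diff.
        date = scrapy.Field()
        title = scrapy.Field()
        description = scrapy.Field()
        url = scrapy.Field()
        region_id = scrapy.Field()
        type_id = scrapy.Field()
        media_id = scrapy.Field()
        company_id = scrapy.Field()
        post_id = scrapy.Field()


    class TechcrunchSpider(scrapy.Spider):
        name = "tc"

        def get_common_items(self, company):
            # Company-level fields built once, instead of four separate
            # item[...] assignments in every parse callback.
            return {'region_id': company.region_id, 'type_id': company.type_id,
                    'media_id': company.media_id, 'company_id': company.id}

        def parse(self, response):
            company = response.meta['company']
            for block in response.xpath("//div[contains(@class, 'block block-thumb')]"):
                item = ExaItem()
                item['title'] = block.xpath("./div/h2/a/text()").extract_first()
                item['url'] = block.xpath("./div/h2/a/@href").extract_first()
                # One call merges all shared company fields into the item.
                item.update(self.get_common_items(company))
                item['post_id'] = response.meta.get('post_id', 0)
                yield item

Because both parse and the new parse_tag go through the same helper, any future company-level field only has to be added in one place.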