Commit 8744a168 authored by Vasyl Bodnaruk's avatar Vasyl Bodnaruk

make tag scraping a separate function for a better dev flow

parent 6ebd5b73
......@@ -57,7 +57,7 @@ DEFAULT_REQUEST_HEADERS = {
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
'exa.middlewares.SeleniumDownloadMiddleware': 543,
# 'exa.middlewares.SeleniumDownloadMiddleware': 543,
# 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
# 'scrapy_proxies.RandomProxy': 100,
# 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
......@@ -72,7 +72,7 @@ DOWNLOADER_MIDDLEWARES = {
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# Item pipelines applied to every scraped item, mapped to their priority.
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# NOTE(review): this text is a commit diff — both the active and the
# commented-out ExaPipeline entry appear; the commit disables the pipeline,
# so confirm which line should survive before running.
ITEM_PIPELINES = {
'exa.pipelines.ExaPipeline': 300,
# 'exa.pipelines.ExaPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
......
......@@ -28,9 +28,17 @@ class TechcrunchSpider(scrapy.Spider):
companies = CompanyMaker(self.comp)
companies.make_companies(self.name)
for i in companies.get_companies():
yield scrapy.Request(i.url, callback=self.parse, meta={'company': i, 'post_id': 0})
yield scrapy.Request(i.url + '/', callback=self.parse, meta={'company': i, 'post_id': 0})
def parse(self, response):
    """Route the downloaded page to the tag parser.

    Only tag-listing URLs are handled; any other URL falls through and
    yields nothing.
    """
    if 'tag' not in response.url:
        return None
    return self.parse_tag(response)
def get_common_items(self, company):
    """Build the dict of identifier fields shared by every scraped item."""
    shared = {field: getattr(company, field)
              for field in ('region_id', 'type_id', 'media_id')}
    shared['company_id'] = company.id
    return shared
def parse_tag(self, response):
try:
news_list = response.xpath("..//div[contains(@class, 'block block-thumb ')]")
company = response.meta['company']
......@@ -44,46 +52,15 @@ class TechcrunchSpider(scrapy.Spider):
item.update(self.get_common_items(company))
item['post_id'] = response.meta['post_id']
# print(item)
yield item
has_next = response.xpath(
"//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
next_url = 'https://techcrunch.com' + has_next
if has_next:
next_url = 'https://techcrunch.com' + has_next + '/'
yield scrapy.Request(next_url, callback=self.parse,
meta={'company': response.meta['company'], 'post_id': 0})
except BaseException as e:
print('We had error')
traceback.print_exc()
def get_common_items(self, company):
    """Return the shared id fields attached to every item."""
    return dict(
        region_id=company.region_id,
        type_id=company.type_id,
        media_id=company.media_id,
        company_id=company.id,
    )
def parse_tag(self, response):
    """Parse a TechCrunch tag-listing page.

    Yields one ``ExaItem`` per article block on the page, then follows the
    "next" pagination link when one exists.  Expects ``response.meta`` to
    carry ``company`` (the source company record) and ``post_id``.
    """
    # NOTE(review): the leading ".." steps to the parent of the selector
    # root before descending — kept as-is from the original; confirm the
    # intent against the live page markup.
    news_list = response.xpath("..//div[contains(@class, 'block block-thumb ')]")
    company = response.meta['company']
    for block in news_list:
        item = ExaItem()
        item['date'] = block.xpath("./div/div/time/@datetime").extract_first()
        item['title'] = block.xpath("./div/h2/a/text()").extract_first()
        item['description'] = block.xpath("./div/p//text()").extract_first()
        item['url'] = block.xpath("./div/h2/a/@href").extract_first()
        item.update(self.get_common_items(company))
        item['post_id'] = response.meta['post_id']
        # Fix: the original printed the item and left ``yield item``
        # commented out, so no items were ever emitted.
        yield item
    has_next = response.xpath(
        "//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
    if has_next:
        # Fix: concatenate only after the None-check — extract_first()
        # returns None on the last page, and str + None raises TypeError.
        next_url = 'https://techcrunch.com' + has_next
        yield scrapy.Request(next_url, callback=self.parse,
                             meta={'company': response.meta['company'], 'post_id': 0})
\ No newline at end of file
traceback.print_exc()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment