Commit 8744a168 authored by Vasyl Bodnaruk

make scraping by tag a separate function for better dev flow

parent 6ebd5b73
@@ -57,7 +57,7 @@ DEFAULT_REQUEST_HEADERS = {
 # Enable or disable downloader middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
 DOWNLOADER_MIDDLEWARES = {
-    'exa.middlewares.SeleniumDownloadMiddleware': 543,
+    # 'exa.middlewares.SeleniumDownloadMiddleware': 543,
     # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
     # 'scrapy_proxies.RandomProxy': 100,
     # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
@@ -72,7 +72,7 @@ DOWNLOADER_MIDDLEWARES = {
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    'exa.pipelines.ExaPipeline': 300,
+    # 'exa.pipelines.ExaPipeline': 300,
 }

 # Enable and configure the AutoThrottle extension (disabled by default)
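The two settings changes above disable the project's Selenium download middleware and ExaPipeline globally, in line with the commit's dev-flow intent. If either component is still needed for a single spider, Scrapy's per-spider custom_settings can re-enable it without editing settings.py; a minimal sketch reusing the dotted paths from this diff (the spider class and name are hypothetical):

import scrapy


class TechcrunchDevSpider(scrapy.Spider):
    name = 'techcrunch_dev'  # hypothetical name, for illustration only

    # Scrapy merges custom_settings over the project-wide settings for this
    # spider alone, so components commented out globally can be re-enabled
    # here without touching settings.py.
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {'exa.middlewares.SeleniumDownloadMiddleware': 543},
        'ITEM_PIPELINES': {'exa.pipelines.ExaPipeline': 300},
    }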
@@ -28,9 +28,17 @@ class TechcrunchSpider(scrapy.Spider):
         companies = CompanyMaker(self.comp)
         companies.make_companies(self.name)
         for i in companies.get_companies():
-            yield scrapy.Request(i.url, callback=self.parse, meta={'company': i, 'post_id': 0})
+            yield scrapy.Request(i.url + '/', callback=self.parse, meta={'company': i, 'post_id': 0})

     def parse(self, response):
+        if 'tag' in response.url:
+            return self.parse_tag(response)
+
+    def get_common_items(self, company):
+        return {'region_id': company.region_id, 'type_id': company.type_id,
+                'media_id': company.media_id, 'company_id': company.id}
+
+    def parse_tag(self, response):
         try:
             news_list = response.xpath("..//div[contains(@class, 'block block-thumb ')]")
             company = response.meta['company']
@@ -44,46 +52,15 @@ class TechcrunchSpider(scrapy.Spider):
                 item.update(self.get_common_items(company))
                 item['post_id'] = response.meta['post_id']
-                # print(item)
                 yield item

             has_next = response.xpath(
                 "//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
-            next_url = 'https://techcrunch.com' + has_next
             if has_next:
+                next_url = 'https://techcrunch.com' + has_next + '/'
                 yield scrapy.Request(next_url, callback=self.parse,
                                      meta={'company': response.meta['company'], 'post_id': 0})
         except BaseException as e:
             print('We had error')
-            traceback.print_exc()
-
-    def get_common_items(self, company):
-        return {'region_id': company.region_id, 'type_id': company.type_id,
-                'media_id': company.media_id, 'company_id': company.id}
-
-    def parse_tag(self, response):
-        news_list = response.xpath("..//div[contains(@class, 'block block-thumb ')]")
-        company = response.meta['company']
-        print('FOOOOOOOOOOOOOOOOOOOOOO')
-        for i in news_list:
-            print('GGGGGGGGGGGGGGGGGGGGGG')
-            item = ExaItem()
-            item['date'] = i.xpath("./div/div/time/@datetime").extract_first()
-            item['title'] = i.xpath("./div/h2/a/text()").extract_first()
-            item['description'] = i.xpath("./div/p//text()").extract_first()
-            item['url'] = i.xpath("./div/h2/a/@href").extract_first()
-            item.update(self.get_common_items(company))
-            item['post_id'] = response.meta['post_id']
-            print(item)
-            # yield item
-
-        has_next = response.xpath(
-            "//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href").extract_first()
-        next_url = 'https://techcrunch.com' + has_next
-        if has_next:
-            yield scrapy.Request(next_url, callback=self.parse,
-                                 meta={'company': response.meta['company'], 'post_id': 0})
\ No newline at end of file
+            traceback.print_exc()
\ No newline at end of file
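For orientation, a minimal sketch of the spider's control flow after this commit: parse() is now only a dispatcher, and the tag-page scraping lives in parse_tag(), as the commit message describes. Names and XPaths are taken from the diff; a plain dict stands in for ExaItem, and the CompanyMaker start-request wiring is omitted:

import traceback

import scrapy


class TechcrunchSpider(scrapy.Spider):
    name = 'techcrunch'

    def parse(self, response):
        # Tag listing pages are handed off to the dedicated callback; the
        # returned generator is iterated by Scrapy's engine as usual.
        if 'tag' in response.url:
            return self.parse_tag(response)

    def get_common_items(self, company):
        # Fields shared by every item scraped for a given company.
        return {'region_id': company.region_id, 'type_id': company.type_id,
                'media_id': company.media_id, 'company_id': company.id}

    def parse_tag(self, response):
        try:
            company = response.meta['company']
            for post in response.xpath("//div[contains(@class, 'block block-thumb ')]"):
                item = {
                    'date': post.xpath("./div/div/time/@datetime").extract_first(),
                    'title': post.xpath("./div/h2/a/text()").extract_first(),
                    'description': post.xpath("./div/p//text()").extract_first(),
                    'url': post.xpath("./div/h2/a/@href").extract_first(),
                    'post_id': response.meta['post_id'],
                }
                item.update(self.get_common_items(company))
                yield item

            # Follow the river navigation's "next" link, if any.
            has_next = response.xpath(
                "//div[contains(@class, 'river-nav')]//li[contains(@class, 'next')]/a/@href"
            ).extract_first()
            if has_next:
                yield scrapy.Request('https://techcrunch.com' + has_next + '/',
                                     callback=self.parse,
                                     meta={'company': company, 'post_id': 0})
        except BaseException:
            traceback.print_exc()

Since parse() contains no yield of its own, returning the parse_tag() generator keeps Scrapy's callback contract: the engine iterates whatever the callback returns, so items and follow-up requests surface exactly as before the split.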