Commit de32a15f authored by Andrii Marynets

Check item on duplicate

parent da12a613
......@@ -79,7 +79,8 @@ class CbSpider(BaseSpider):
headers={'x-requested-with': 'XMLHttpRequest', 'content-type': 'application/json'},
callback=self.parse_news,
meta={'cookiejar': response.meta['cookiejar'],
'company': response.meta['company']})
'company': response.meta['company'],
'post_id': response.meta['post_id']})
rows = response.xpath(".//div[@class='grid-body']/div")
company = response.meta['company']
......@@ -107,7 +108,6 @@ class CbSpider(BaseSpider):
# yield scrapy.Request(next_url, callback=self.parse, meta=response.meta)
def parse_news(self, response):
body = json.loads(response.body.decode('utf8'))
print(body)
for i in body['entities']:
prop = i['properties']
if prop['entity_def_id'] == 'press_reference':
......@@ -118,7 +118,12 @@ class CbSpider(BaseSpider):
publisher = prop['activity_properties']['publisher']
item.update(self.get_common_items(response.meta['company']))
item['media_id'] = self._get_media((publisher, item['url']))
print(item)
item['description'] = None
item['post_id'] = response.meta['post_id']
item['tags'] = None
if self.pipeline.check_url(item['url']) and self.fresh:
break
yield item
def _get_media(self, site):
media_name, media_url = site
......@@ -132,8 +137,3 @@ class CbSpider(BaseSpider):
else:
media = media[0][0]
return media
def _next_url(self, url):
pos = url.rfind('=') + 1
next_page = int(url[pos:]) + 1
return url[:pos] + str(next_page)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment