Commit da12a613 authored by Andrii Marynets

Get media and common info

parent e8b3ad9c
# -*- coding: utf-8 -*-
import json
from urllib.request import urlparse
import scrapy
from scrapy.utils.project import get_project_settings
from scrapy_splash import SplashRequest
......@@ -113,13 +114,16 @@ class CbSpider(BaseSpider):
item = ExaItem()
item['date'] = self.format_date(prop['activity_date'])
item['title'] = prop['activity_properties']['title']
item['url'] = prop['activity_properties']['url']
item['url'] = prop['activity_properties']['url']['value']
publisher = prop['activity_properties']['publisher']
item.update(self.get_common_items(response.meta['company']))
item['media_id'] = self._get_media((publisher, item['url']))
print(item)
def _get_media(self, elem):
media_name = elem.xpath("./td[contains(@class, 'article')]/span/text()").extract_first()
media_url = elem.xpath("./td/a/@data_publisher").extract_first()
def _get_media(self, site):
media_name, media_url = site
clean = lambda x: x[4:] if x.startswith('www.') else x
media_url = clean(urlparse(media_url).netloc)
query = "select * from wp_esi_media where name like '%{}%' or url like '%{}%'".format(media_name, media_url)
media = self.pipeline.db.select(query)
if len(media) == 0:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment