Commit 8bdee0a5 authored by Vasyl Bodnaruk

Add _get_media function for selecting media

If the media doesn't exist in the DB, create it and return its id
parent 0019f323
......@@ -14,11 +14,22 @@ class CbSpider(BaseSpider):
def parse(self, response):
    """Build and print one ExaItem per table row of the response.

    The first ``//table/tr`` row is skipped (presumably a header row --
    confirm against the scraped page). Items are only printed here; nothing
    is yielded or returned.

    Args:
        response: Response object exposing ``xpath``.
    """
    table_rows = response.xpath("//table/tr")[1:]
    print(table_rows)

    # Item field -> XPath (relative to the row) that supplies its value.
    field_paths = (
        ('date', "./td[contains(@class, 'date')]/text()"),
        ('title', "./td/a/text()"),
        ('url', "./td/a/@href"),
    )

    for row in table_rows:
        entry = ExaItem()
        for field, path in field_paths:
            entry[field] = row.xpath(path).extract_first()
        # The media lookup goes through the database, so it is done last,
        # exactly once per row.
        entry['media_id'] = self._get_media(row)
        print(entry)
def _get_media(self, elem):
    """Return the media (publisher) id for a table row, creating it if needed.

    The publisher name and URL are read from the row and looked up in
    ``wp_esi_media`` with a substring match on either column. When nothing
    matches, a new row is inserted.

    Args:
        elem: Row selector supporting ``xpath(...).extract_first()``.

    Returns:
        The first column of the first matching row (presumably the id --
        confirm against the table schema), or whatever
        ``self.pipeline.db.insert`` returns for the new row, or ``None``
        when the row carries neither a name nor a URL.
    """
    def quote(value):
        # The values come from scraped HTML, so they must not be able to
        # terminate the SQL string literal. Doubling quotes is standard SQL;
        # doubling backslashes assumes MySQL's default escape handling.
        # NOTE(review): if db.select() accepts bound parameters like
        # db.insert() does, prefer passing the values that way instead.
        return value.replace("\\", "\\\\").replace("'", "''")

    media_name = elem.xpath("./td[contains(@class, 'article')]/span/text()").extract_first()
    media_url = elem.xpath("./td/a/@data_publisher").extract_first()

    # Search only on values that are actually present: a missing value used
    # to be formatted as the text 'None', and an empty one produced
    # "like '%%'", which matches every row.
    conditions = [
        "{} like '%{}%'".format(column, quote(value))
        for column, value in (("name", media_name), ("url", media_url))
        if value
    ]
    if not conditions:
        # Nothing identifies the publisher; do not create an empty record.
        return None

    query = "select * from wp_esi_media where " + " or ".join(conditions)
    matches = self.pipeline.db.select(query)
    if matches:
        return matches[0][0]

    return self.pipeline.db.insert(
        "INSERT INTO wp_esi_media (name, url) VALUES(%s, %s)",
        (media_name, media_url),
    )
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment