Commit 6ddd556a authored by Vasyl Bodnaruk

Fix searching by tag

parent 87b3f20f
@@ -48,40 +48,38 @@ class AitopSpider(BaseSpider):
             else:
                 return False
 
-        for i in self.build_items(response):
+        items, is_duplicate = self.build_items(response)
+        for i in items:
             if is_company_in_item(i):
                 yield i
         next_url = self.next_url(response)
-        if next_url:
+        if self.can_follow(next_url, is_duplicate):
             yield scrapy.Request(next_url, callback=self.parse_by_title_description, meta=response.meta)
+        else:
+            print("DUPLICATE NEWS")
 
     def parse_by_tag(self, response):
         try:
             for i in self.build_items(response):
                 yield i
             next_url = self.next_url(response)
             if next_url:
                 yield scrapy.Request(next_url, callback=self.parse_by_tag, meta=response.meta)
         except:
             pass
 
     def build_items(self, response):
         try:
             items = list()
             rows = response.xpath(".//div[contains(@class, 'summaries')]")
-            for r in rows:
-                i = r.xpath("//div[@class='row']")
+            is_duplicate = False
+            for i in rows:
+                # i = r.xpath("//div[@class='row']")
                 item = ExaItem()
-                item['date'] = dateparser.parse(i.xpath(".//time/@datetime").extract_first()).replace(tzinfo=None)
-                item['title'] = ''.join(i.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a//text()").extract())
-                item['description'] = ''.join(i.xpath(".//div[@class='summary-content']/p/text()").extract())
+                item['date'] = dateparser.parse(i.xpath("./div[@class='row']//time/@datetime").extract_first()).replace(tzinfo=None)
+                item['title'] = i.xpath("./div[@class='row']//div[contains(@class, 'col-xs-12')]//h3/a//text()").extract()
+                item['description'] = i.xpath("./div[@class='row']//div[@class='summary-content']/p//text()").extract()
                 item['url'] = i.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a/@href").extract_first()
                 item.update(self.get_common_items(response.meta['company']))
                 item['post_id'] = response.meta['post_id']
-                item['tags'] = r.xpath(".//div[@class='row hidden-xs']//div[@title='Concept Tags']//a[@class='filter btn btn-link']/text()").extract()
+                item['tags'] = i.xpath(
+                    ".//div[@class='row hidden-xs']//div[@title='Concept Tags']//a[@class='filter btn btn-link']/text()").extract()
-                yield item
-            next_url = self.next_url(response)
-            if next_url:
-                print('FOLLOW')
-                # yield scrapy.Request(next_url, callback=self.parse_by_tag, meta=response.meta)
+                items.append(item)
+            return items
         except:
            pass
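The likely source of the broken tag search is the XPath change: in a Scrapy selector, a path starting with "//" is evaluated against the whole document, while "./" stays relative to the current node, so the old r.xpath("//div[@class='row']") matched every row on the page instead of the row inside r. A standalone illustration of that pitfall (the sample HTML is made up, not the real page):

```python
# Standalone illustration of the absolute-vs-relative XPath pitfall.
# The HTML below is a made-up stand-in for the real page structure.
from scrapy.selector import Selector

html = """
<div class="summaries"><div class="row"><time datetime="2017-06-01"></time></div></div>
<div class="summaries"><div class="row"><time datetime="2017-06-02"></time></div></div>
"""

sel = Selector(text=html)
for summary in sel.xpath(".//div[contains(@class, 'summaries')]"):
    # "//" ignores the current node and searches the entire document: 2 matches
    print(len(summary.xpath("//div[@class='row']")))
    # "./" is evaluated relative to this 'summaries' block: 1 match
    print(len(summary.xpath("./div[@class='row']")))
```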
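Not visible in this hunk: the can_follow helper that now gates pagination, or how build_items reports the duplicate flag that gets unpacked at the call site. A minimal sketch of what that contract could look like, assuming can_follow simply combines the next-URL check with the duplicate flag (the name comes from the diff; the body is a guess, and in the spider it would be a method taking self):

```python
def can_follow(next_url, is_duplicate):
    """Hypothetical helper (not part of this commit): follow pagination only
    when a next page exists and the current page was not all duplicates."""
    return bool(next_url) and not is_duplicate
```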