Commit 52f4db11 authored by Vasyl Bodnaruk

Fix searching by description and tag

parent 6ddd556a
...@@ -73,7 +73,7 @@ DOWNLOADER_MIDDLEWARES = { ...@@ -73,7 +73,7 @@ DOWNLOADER_MIDDLEWARES = {
# Configure item pipelines # Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = { ITEM_PIPELINES = {
'exa.pipelines.ExaPipeline': 300, # 'exa.pipelines.ExaPipeline': 300,
} }
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
......
...@@ -48,8 +48,22 @@ class AitopSpider(BaseSpider): ...@@ -48,8 +48,22 @@ class AitopSpider(BaseSpider):
else: else:
return False return False
items, is_duplicate = self.build_items(response) rows = response.xpath(".//div[contains(@class, 'summaries')]")
for i in items: is_duplicate = False
for i in rows:
item = ExaItem()
item['date'] = dateparser.parse(i.xpath("./div[@class='row']//time/@datetime").extract_first()).replace(
tzinfo=None)
item['title'] = ''.join(
i.xpath("./div[@class='row']//div[contains(@class, 'col-xs-12')]//h3/a//text()").extract())
item['description'] = ''.join(
i.xpath("./div[@class='row']//div[@class='summary-content']/p//text()").extract())
item['url'] = i.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a/@href").extract_first()
item.update(self.get_common_items(response.meta['company']))
item['post_id'] = response.meta['post_id']
item['tags'] = i.xpath(
".//div[@class='row hidden-xs']//div[@title='Concept Tags']//a[@class='filter btn btn-link']/text()").extract()
yield item
if is_company_in_item(i): if is_company_in_item(i):
yield i yield i
next_url = self.next_url(response) next_url = self.next_url(response)
...@@ -63,11 +77,13 @@ class AitopSpider(BaseSpider): ...@@ -63,11 +77,13 @@ class AitopSpider(BaseSpider):
rows = response.xpath(".//div[contains(@class, 'summaries')]") rows = response.xpath(".//div[contains(@class, 'summaries')]")
is_duplicate = False is_duplicate = False
for i in rows: for i in rows:
# i = r.xpath("//div[@class='row']")
item = ExaItem() item = ExaItem()
item['date'] = dateparser.parse(i.xpath("./div[@class='row']//time/@datetime").extract_first()).replace(tzinfo=None) item['date'] = dateparser.parse(i.xpath("./div[@class='row']//time/@datetime").extract_first()).replace(
item['title'] = i.xpath("./div[@class='row']//div[contains(@class, 'col-xs-12')]//h3/a//text()").extract() tzinfo=None)
item['description'] = i.xpath("./div[@class='row']//div[@class='summary-content']/p//text()").extract() item['title'] = ''.join(
i.xpath("./div[@class='row']//div[contains(@class, 'col-xs-12')]//h3/a//text()").extract())
item['description'] = ''.join(
i.xpath("./div[@class='row']//div[@class='summary-content']/p//text()").extract())
item['url'] = i.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a/@href").extract_first() item['url'] = i.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a/@href").extract_first()
item.update(self.get_common_items(response.meta['company'])) item.update(self.get_common_items(response.meta['company']))
item['post_id'] = response.meta['post_id'] item['post_id'] = response.meta['post_id']
...@@ -89,5 +105,3 @@ class AitopSpider(BaseSpider): ...@@ -89,5 +105,3 @@ class AitopSpider(BaseSpider):
return 'https://aitopics.org/search' + has_next return 'https://aitopics.org/search' + has_next
else: else:
return None return None
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment