Commit b83e565f authored by Vasyl Bodnaruk

Refactor parse by tag

parent c5d3c31d
......@@ -77,26 +77,9 @@ class AitopSpider(BaseSpider):
def parse_by_tag(self, response):
try:
rows = response.xpath(".//div[contains(@class, 'summaries')]")
is_duplicate = False
for i in rows:
item = ExaItem()
item['date'] = dateparser.parse(i.xpath("./div[@class='row']//time/@datetime").extract_first()).replace(
tzinfo=None)
item['title'] = ''.join(
i.xpath("./div[@class='row']//div[contains(@class, 'col-xs-12')]//h3/a//text()").extract())
item['description'] = ''.join(
i.xpath("./div[@class='row']//div[@class='summary-content']/p//text()").extract())
item['url'] = i.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a/@href").extract_first()
item.update(self.get_common_items(response.meta['company']))
item['post_id'] = response.meta['post_id']
item['tags'] = i.xpath(
".//div[@class='row hidden-xs']//div[@title='Concept Tags']//a[@class='filter btn btn-link']/text()").extract()
if self.pipeline.check_url(item['url']):
is_duplicate = True
break
yield item
items, is_duplicate = self.build_items(response)
for i in items:
yield i
next_url = self.next_url(response)
if self.can_follow(next_url, is_duplicate):
......@@ -113,3 +96,27 @@ class AitopSpider(BaseSpider):
return 'https://aitopics.org/search' + has_next
else:
return None
def build_items(self, response):
    """Build an ExaItem for every summary row found in ``response``.

    Rows are the ``div`` elements whose class contains ``summaries``. They
    are processed in document order, and processing stops at the first row
    whose URL ``self.pipeline.check_url`` reports as truthy; that row and
    everything after it are left out of the result.

    Returns:
        A ``(items, is_duplicate)`` tuple: the list of items collected
        before stopping, and ``True`` if the loop stopped early because
        ``check_url`` returned a truthy value, ``False`` otherwise.
    """
    collected = []
    for row in response.xpath(".//div[contains(@class, 'summaries')]"):
        entry = ExaItem()

        raw_date = row.xpath("./div[@class='row']//time/@datetime").extract_first()
        # Drop the timezone so the stored datetime is naive.
        entry['date'] = dateparser.parse(raw_date).replace(tzinfo=None)

        title_parts = row.xpath(
            "./div[@class='row']//div[contains(@class, 'col-xs-12')]//h3/a//text()").extract()
        entry['title'] = ''.join(title_parts)

        description_parts = row.xpath(
            "./div[@class='row']//div[@class='summary-content']/p//text()").extract()
        entry['description'] = ''.join(description_parts)

        entry['url'] = row.xpath(
            ".//div[contains(@class, 'col-xs-12')]/h3/a/@href").extract_first()

        # Common fields are merged after the scraped ones and before
        # post_id/tags, so this ordering decides which value wins on overlap.
        entry.update(self.get_common_items(response.meta['company']))
        entry['post_id'] = response.meta['post_id']
        entry['tags'] = row.xpath(
            ".//div[@class='row hidden-xs']//div[@title='Concept Tags']//a[@class='filter btn btn-link']/text()").extract()

        if self.pipeline.check_url(entry['url']):
            # Stop here; the flagged row itself is not collected.
            return collected, True
        collected.append(entry)

    return collected, False
......@@ -10,7 +10,7 @@ class BaseSpider(scrapy.Spider):
def __init__(self, query=None, *args, **kwargs):
super(BaseSpider, self).__init__(*args, **kwargs)
self.condition = query
self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=17"
self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=3"
if self.condition:
print(self.condition)
self.query += ' or {}'.format(self.condition)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment