Commit b83e565f authored by Vasyl Bodnaruk

Refactor parse by tag

parent c5d3c31d
......@@ -77,8 +77,30 @@ class AitopSpider(BaseSpider):
def parse_by_tag(self, response):
    """Yield the items of one listing page, then optionally the next page request.

    The items and the duplicate flag come from ``self.build_items(response)``.
    After the items are yielded, ``self.next_url(response)`` is consulted and,
    if ``self.can_follow(next_url, is_duplicate)`` is truthy, a follow-up
    ``scrapy.Request`` is yielded with this same method as callback and the
    current ``response.meta`` carried over. Otherwise "DUPLICATE NEWS" is
    printed and the generator ends.

    Parsing stays best-effort: an ordinary exception stops the page but is
    never propagated to the caller. It is logged with its traceback rather
    than silently discarded.
    """
    # Function-scope import keeps this block self-contained.
    import logging

    try:
        items, is_duplicate = self.build_items(response)
        for item in items:
            yield item
        next_url = self.next_url(response)
        if self.can_follow(next_url, is_duplicate):
            yield scrapy.Request(next_url, callback=self.parse_by_tag, meta=response.meta)
        else:
            print("DUPLICATE NEWS")
    except Exception:
        # `except Exception` (not a bare `except`) lets KeyboardInterrupt,
        # SystemExit and GeneratorExit pass through untouched.
        logging.getLogger(__name__).exception(
            "parse_by_tag failed for %s", getattr(response, 'url', None)
        )
def next_url(self, response):
    """Return the absolute URL of the next results page, or ``None``.

    The candidate href is the second-to-last ``<a href>`` found inside the
    ``ul.pagination`` list (presumably the "next" control; confirm against
    the site's markup). ``None`` is returned when the list has fewer than
    two links, or when the href is empty or the placeholder ``'#'``.
    """
    hrefs = response.xpath(".//ul[@class='pagination']//li/a/@href").extract()
    # Without this guard, a page with no (or a single-link) pagination list
    # raises IndexError on the [-2] lookup instead of reporting "no next page".
    if len(hrefs) < 2:
        return None
    has_next = hrefs[-2]
    if has_next and has_next != '#':
        return 'https://aitopics.org/search' + has_next
    return None
def build_items(self, response):
rows = response.xpath(".//div[contains(@class, 'summaries')]")
is_duplicate = False
items = list()
for i in rows:
item = ExaItem()
item['date'] = dateparser.parse(i.xpath("./div[@class='row']//time/@datetime").extract_first()).replace(
......@@ -96,20 +118,5 @@ class AitopSpider(BaseSpider):
if self.pipeline.check_url(item['url']):
is_duplicate = True
break
yield item
next_url = self.next_url(response)
if self.can_follow(next_url, is_duplicate):
yield scrapy.Request(next_url, callback=self.parse_by_tag, meta=response.meta)
else:
print("DUPLICATE NEWS")
except:
pass
def next_url(self, response):
    """Return the absolute URL of the next results page, or ``None``.

    The candidate href is the second-to-last ``<a href>`` found inside the
    ``ul.pagination`` list (presumably the "next" control; confirm against
    the site's markup). ``None`` is returned when the list has fewer than
    two links, or when the href is empty or the placeholder ``'#'``.
    """
    hrefs = response.xpath(".//ul[@class='pagination']//li/a/@href").extract()
    # Without this guard, a page with no (or a single-link) pagination list
    # raises IndexError on the [-2] lookup instead of reporting "no next page".
    if len(hrefs) < 2:
        return None
    has_next = hrefs[-2]
    if has_next and has_next != '#':
        return 'https://aitopics.org/search' + has_next
    return None
items.append(item)
return items, is_duplicate
......@@ -10,7 +10,7 @@ class BaseSpider(scrapy.Spider):
def __init__(self, query=None, *args, **kwargs):
super(BaseSpider, self).__init__(*args, **kwargs)
self.condition = query
self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=17"
self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=3"
if self.condition:
print(self.condition)
self.query += ' or {}'.format(self.condition)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment