Commit 93293705 authored by Vasyl Bodnaruk's avatar Vasyl Bodnaruk

Add functions for parsing by tags and by search.

parent 61f1ee4a
......@@ -7,9 +7,37 @@ from ..items import ExaItem
class AitopSpider(scrapy.Spider):
    """Scrape search results from aitopics.org.

    Entry point is the free-text search URL below; ``parse`` then decides
    whether to follow a concept-tag listing or parse the search results
    directly (see the callback methods).
    """
    name = "aitop"
    allowed_domains = ["aitopics.org"]
    # BUG FIX: a stale start_urls assignment (the Uber concept-tag filter URL)
    # preceded this one and was immediately overwritten — dead code removed.
    start_urls = ['https://aitopics.org/search?view=&filters=&sort=score+desc&q=x.ai']
def parse(self, response):
name = 'Uber'
url = 'https://aitopics.org'
def tags(res):
concept_tags = response.xpath(".//div[contains(@class, 'facet_concept-tagsraw')]//div//div//ul//li")
for i in concept_tags:
if i.xpath(".//@title").extract_first().lower() == name.lower():
href = i.xpath(".//a/@href").extract_first()
return url + href
return None
url_for_parse_tag = tags(response)
if url_for_parse_tag:
return scrapy.Request(url_for_parse_tag, callback=self.parse_by_tag)
else:
return scrapy.Request(response.url, callback=self.parse_by_title_description)
def parse_by_title_description(self, response):
rows = response.xpath(".//div[contains(@class, 'summaries')]//div[@class='row']")
for i in rows:
item = dict()
item['date'] = dateparser.parse(i.xpath(".//time/@datetime").extract_first()).replace(tzinfo=None)
item['title'] = i.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a/text()").extract_first()
item['description'] = i.xpath(".//div[@class='summary-content']/p/text()").extract_first()
item['url'] = i.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a/@href").extract_first()
print(item)
def parse_by_tag(self, response):
try:
rows = response.xpath(".//div[contains(@class, 'summaries')]//div[@class='row']")
for i in rows:
......@@ -22,6 +50,6 @@ class AitopSpider(scrapy.Spider):
has_next = response.xpath(".//ul[@class='pagination']//li/a/@href").extract()[-2]
if has_next and has_next != '#':
next_url = 'https://aitopics.org/search' + has_next
return scrapy.Request(next_url, callback=self.parse)
return scrapy.Request(next_url, callback=self.parse_by_tag)
except:
pass
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.