Commit 6112944d authored by Vasyl Bodnaruk's avatar Vasyl Bodnaruk

Refactor: add helper functions for code reuse. Fix a bug that occurred when only the search is used for scraping.

parent c3d696f4
......@@ -37,32 +37,42 @@ class AitopSpider(scrapy.Spider):
return True
else:
return False
rows = response.xpath(".//div[contains(@class, 'summaries')]//div[@class='row']")
for i in rows:
item = dict()
item['date'] = dateparser.parse(i.xpath(".//time/@datetime").extract_first()).replace(tzinfo=None)
item['title'] = i.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a/text()").extract_first()
item['description'] = i.xpath(".//div[@class='summary-content']/p/text()").extract_first()
item['url'] = i.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a/@href").extract_first()
# print(item)
has_next = response.xpath(".//ul[@class='pagination']//li/a/@href").extract()[-2]
if has_next and has_next != '#':
next_url = 'https://aitopics.org/search' + has_next
return scrapy.Request(next_url, callback=self.parse_by_tag)
for i in self.build_items(response):
# if is_company_in_item(i):
print(i)
next_url = self.next_url(response)
# if next_url:
# return scrapy.Request(next_url, callback=self.parse_by_title_description)
def parse_by_tag(self, response):
    """Print every item found on a results page and follow pagination.

    Args:
        response: The page response handed to this callback.

    Returns:
        A ``scrapy.Request`` for the next results page (with this method as
        its callback) when ``next_url`` reports one, otherwise ``None``.
    """
    try:
        # build_items may hand back None when extraction fails; treat that
        # as "no items" instead of failing on iteration.
        for item in self.build_items(response) or []:
            print(item)
        try:
            next_url = self.next_url(response)
        except IndexError:
            # Too few pagination links on the page: nothing to follow.
            next_url = None
        if next_url:
            return scrapy.Request(next_url, callback=self.parse_by_tag)
    except Exception:
        # Stay best-effort, but record the failure instead of hiding it.
        # SystemExit / KeyboardInterrupt are deliberately not caught.
        self.logger.exception("parse_by_tag failed for %r", response)
    return None
def build_items(self, response):
    """Extract one dict per result row from a results page.

    Each dict has the keys ``date``, ``title``, ``description`` and ``url``.
    ``title`` and ``description`` are the concatenation of all matching text
    nodes; ``url`` is the first matching ``href`` (or ``None``).

    Args:
        response: Object exposing ``xpath``; rows are selected from it.

    Returns:
        A list of item dicts, empty when the page has no result rows.
    """
    items = []
    rows = response.xpath(".//div[contains(@class, 'summaries')]//div[@class='row']")
    for row in rows:
        # A row without a usable <time datetime="..."> keeps date=None
        # rather than discarding every item on the page.
        date_text = row.xpath(".//time/@datetime").extract_first()
        parsed = dateparser.parse(date_text) if date_text else None
        items.append({
            # Drop tzinfo so the stored datetime is naive.
            'date': parsed.replace(tzinfo=None) if parsed is not None else None,
            'title': ''.join(row.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a//text()").extract()),
            'description': ''.join(row.xpath(".//div[@class='summary-content']/p/text()").extract()),
            'url': row.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a/@href").extract_first(),
        })
    return items
def next_url(self, response):
    """Return the absolute URL of the next results page, or ``None``.

    The second-to-last ``href`` inside ``ul.pagination`` is used
    (presumably the "next" link -- confirm against the page markup).

    Args:
        response: Object exposing ``xpath``; pagination links are read from it.

    Returns:
        ``'https://aitopics.org/search'`` followed by that href, or ``None``
        when the page has fewer than two pagination links or the href is
        empty or ``'#'``.
    """
    links = response.xpath(".//ul[@class='pagination']//li/a/@href").extract()
    # Pages with fewer than two pagination links have no [-2] entry.
    if len(links) < 2:
        return None
    candidate = links[-2]
    if candidate and candidate != '#':
        return 'https://aitopics.org/search' + candidate
    return None
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment