Add functions for parsing by tags and by search

93293705 · Vasyl Bodnaruk · 61f1ee4a · 93293705
Commit 93293705 authored Jul 06, 2017 by Vasyl Bodnaruk
Hide whitespace changes
Inline Side-by-side

Showing with 30 additions and 2 deletions

aitop.py exa/exa/spiders/aitop.py +30 -2

No files found.
--- a/exa/exa/spiders/aitop.py
+++ b/exa/exa/spiders/aitop.py
@@ -7,9 +7,37 @@ from ..items import ExaItem
 class AitopSpider(scrapy.Spider):
    name = "aitop"
    allowed_domains = ["aitopics.org"]
-    start_urls = ['https://aitopics.org/search?filters=concept-tagsRaw%3AUber']
+    start_urls = ['https://aitopics.org/search?view=&filters=&sort=score+desc&q=x.ai']

    def parse(self, response):
+        name = 'Uber'
+        url = 'https://aitopics.org'
+
+        def tags(res):
+            concept_tags = response.xpath(".//div[contains(@class, 'facet_concept-tagsraw')]//div//div//ul//li")
+            for i in concept_tags:
+                if i.xpath(".//@title").extract_first().lower() == name.lower():
+                    href = i.xpath(".//a/@href").extract_first()
+                    return url + href
+            return None
+
+        url_for_parse_tag = tags(response)
+        if url_for_parse_tag:
+            return scrapy.Request(url_for_parse_tag, callback=self.parse_by_tag)
+        else:
+            return scrapy.Request(response.url, callback=self.parse_by_title_description)
+
+    def parse_by_title_description(self, response):
+        rows = response.xpath(".//div[contains(@class, 'summaries')]//div[@class='row']")
+        for i in rows:
+            item = dict()
+            item['date'] = dateparser.parse(i.xpath(".//time/@datetime").extract_first()).replace(tzinfo=None)
+            item['title'] = i.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a/text()").extract_first()
+            item['description'] = i.xpath(".//div[@class='summary-content']/p/text()").extract_first()
+            item['url'] = i.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a/@href").extract_first()
+            print(item)
+
+    def parse_by_tag(self, response):
        try:
            rows = response.xpath(".//div[contains(@class, 'summaries')]//div[@class='row']")
            for i in rows:
@@ -22,6 +50,6 @@ class AitopSpider(scrapy.Spider):
            has_next = response.xpath(".//ul[@class='pagination']//li/a/@href").extract()[-2]
            if has_next and has_next != '#':
                next_url = 'https://aitopics.org/search' + has_next
-                return scrapy.Request(next_url, callback=self.parse)
+                return scrapy.Request(next_url, callback=self.parse_by_tag)
        except:
            pass