Add a function to check whether a company is mentioned in a news item

c3d696f4 · Vasyl Bodnaruk · 93293705 · c3d696f4
Commit c3d696f4 authored 7 years ago by Vasyl Bodnaruk
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 14 additions and 1 deletion

aitop.py exa/exa/spiders/aitop.py +14 -1

No files found.
--- a/exa/exa/spiders/aitop.py
+++ b/exa/exa/spiders/aitop.py
@@ -28,6 +28,15 @@ class AitopSpider(scrapy.Spider):
            return scrapy.Request(response.url, callback=self.parse_by_title_description)

    def parse_by_title_description(self, response):
+        name = 'x.ai'
+
+        def is_company_in_item(item):
+            if name.lower() in item['title'].lower():
+                return True
+            elif name.lower() in item['description'].lower():
+                return True
+            else:
+                return False
        rows = response.xpath(".//div[contains(@class, 'summaries')]//div[@class='row']")
        for i in rows:
            item = dict()
@@ -35,7 +44,11 @@ class AitopSpider(scrapy.Spider):
            item['title'] = i.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a/text()").extract_first()
            item['description'] = i.xpath(".//div[@class='summary-content']/p/text()").extract_first()
            item['url'] = i.xpath(".//div[contains(@class, 'col-xs-12')]/h3/a/@href").extract_first()
-            print(item)
+            # print(item)
+        has_next = response.xpath(".//ul[@class='pagination']//li/a/@href").extract()[-2]
+        if has_next and has_next != '#':
+            next_url = 'https://aitopics.org/search' + has_next
+            return scrapy.Request(next_url, callback=self.parse_by_tag)

    def parse_by_tag(self, response):
        try: