Add QueryMixin to spiders

50b27c03 · Vasyl Bodnaruk · 5a897b66 · 50b27c03 · 50b27c03 · 50b27c03
Commit 50b27c03 authored Jul 10, 2017 by Vasyl Bodnaruk
Hide whitespace changes
Inline Side-by-side

Showing 3 changed files with 8 additions and 40 deletions

aitop.py exa/exa/spiders/aitop.py +4 -14

mobihealthnews.py exa/exa/spiders/mobihealthnews.py +2 -11

techcrunch.py exa/exa/spiders/techcrunch.py +2 -15

No files found.
--- a/exa/exa/spiders/aitop.py
+++ b/exa/exa/spiders/aitop.py
@@ -2,31 +2,23 @@
 import scrapy
 import dateparser
 from scrapy.utils.project import get_project_settings
-from ..helpers import CompanyMaker, Database
+from ..helpers import CompanyMaker, Database, QueryMixin
 from ..items import ExaItem

 db_settings = get_project_settings().get('DB')
 db = Database(**db_settings)


-class AitopSpider(scrapy.Spider):
+class AitopSpider(QueryMixin, scrapy.Spider):
    name = "aitop"
    allowed_domains = ["aitopics.org"]

-    def __init__(self, *args, **kwargs):
-        self.condition = kwargs.get('query')
-        self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=13"
-        if self.condition:
-            print(self.condition)
-            self.query += ' or {}'.format(self.condition)
-        super(AitopSpider, self).__init__()
-
    def start_requests(self):
        companies = CompanyMaker(db.select(self.query))
        companies.make_companies(self.name)
        for i in companies.get_companies():
            try:
-                yield scrapy.Request(i.url + '/', callback=self.parse, meta={'company': i, 'post_id': 0})
+                yield scrapy.Request(i.url, callback=self.parse, meta={'company': i, 'post_id': 0})
            except:
                pass

@@ -102,6 +94,4 @@ class AitopSpider(scrapy.Spider):
        else:
            return None

-    def get_common_items(self, company):
-        return {'region_id': company.region_id, 'type_id': company.type_id,
-                'media_id': company.media_id, 'company_id': company.id}
\ No newline at end of file
+
--- a/exa/exa/spiders/mobihealthnews.py
+++ b/exa/exa/spiders/mobihealthnews.py
@@ -3,7 +3,7 @@ import dateparser
 import scrapy
 import traceback
 from scrapy.utils.project import get_project_settings
-from ..helpers import CompanyMaker, Database
+from ..helpers import CompanyMaker, Database, QueryMixin
 from ..items import ExaItem


@@ -11,19 +11,10 @@ db_settings = get_project_settings().get('DB')
 db = Database(**db_settings)


-class MobiHealthNewsSpider(scrapy.Spider):
+class MobiHealthNewsSpider(QueryMixin, scrapy.Spider):
    name = "mhn"
    allowed_domains = ["www.mobihealthnews.com"]

-    def __init__(self, *args, **kwargs):
-        self.condition = kwargs.get('query')
-        self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=3"
-        if self.condition:
-            print(self.condition)
-            self.query += ' or {}'.format(self.condition)
-        print(self.query)
-        super(MobiHealthNewsSpider, self).__init__()
-
    def start_requests(self):
        companies = CompanyMaker(db.select(self.query))
        companies.make_companies(self.name)

--- a/exa/exa/spiders/techcrunch.py
+++ b/exa/exa/spiders/techcrunch.py
@@ -2,7 +2,7 @@
 import scrapy
 import traceback
 from scrapy.utils.project import get_project_settings
-from ..helpers import CompanyMaker, Database
+from ..helpers import CompanyMaker, Database, QueryMixin
 from ..items import ExaItem


@@ -10,18 +10,10 @@ db_settings = get_project_settings().get('DB')
 db = Database(**db_settings)


-class TechcrunchSpider(scrapy.Spider):
+class TechcrunchSpider(QueryMixin, scrapy.Spider):
    name = "tc"
    allowed_domains = ["techcrunch.com"]

-    def __init__(self, *args, **kwargs):
-        self.condition = kwargs.get('query')
-        self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=13"
-        if self.condition:
-            print(self.condition)
-            self.query += ' or {}'.format(self.condition)
-        super(TechcrunchSpider, self).__init__()
-
    def start_requests(self):
        companies = CompanyMaker(db.select(self.query))
        companies.make_companies(self.name)
@@ -32,16 +24,11 @@ class TechcrunchSpider(scrapy.Spider):
                pass

    def parse(self, response):
-        print(response.request.headers)
        if 'tag' in response.url:
            return self.parse_tag(response)
        if 'company' in response.url:
            return self.parse_company(response)

-    def get_common_items(self, company):
-        return {'region_id': company.region_id, 'type_id': company.type_id,
-                'media_id': company.media_id, 'company_id': company.id}
-
    def parse_tag(self, response):
        try:
            news_list = response.xpath("..//div[contains(@class, 'block block-thumb ')]")