Commit 50b27c03 authored by Vasyl Bodnaruk

Add mixin to spider

parent 5a897b66
...@@ -2,31 +2,23 @@ ...@@ -2,31 +2,23 @@
import scrapy import scrapy
import dateparser import dateparser
from scrapy.utils.project import get_project_settings from scrapy.utils.project import get_project_settings
from ..helpers import CompanyMaker, Database from ..helpers import CompanyMaker, Database, QueryMixin
from ..items import ExaItem from ..items import ExaItem
db_settings = get_project_settings().get('DB') db_settings = get_project_settings().get('DB')
db = Database(**db_settings) db = Database(**db_settings)
class AitopSpider(scrapy.Spider): class AitopSpider(QueryMixin, scrapy.Spider):
name = "aitop" name = "aitop"
allowed_domains = ["aitopics.org"] allowed_domains = ["aitopics.org"]
def __init__(self, *args, **kwargs):
self.condition = kwargs.get('query')
self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=13"
if self.condition:
print(self.condition)
self.query += ' or {}'.format(self.condition)
super(AitopSpider, self).__init__()
def start_requests(self): def start_requests(self):
companies = CompanyMaker(db.select(self.query)) companies = CompanyMaker(db.select(self.query))
companies.make_companies(self.name) companies.make_companies(self.name)
for i in companies.get_companies(): for i in companies.get_companies():
try: try:
yield scrapy.Request(i.url + '/', callback=self.parse, meta={'company': i, 'post_id': 0}) yield scrapy.Request(i.url, callback=self.parse, meta={'company': i, 'post_id': 0})
except: except:
pass pass
...@@ -102,6 +94,4 @@ class AitopSpider(scrapy.Spider): ...@@ -102,6 +94,4 @@ class AitopSpider(scrapy.Spider):
else: else:
return None return None
def get_common_items(self, company):
return {'region_id': company.region_id, 'type_id': company.type_id,
'media_id': company.media_id, 'company_id': company.id}
\ No newline at end of file
...@@ -3,7 +3,7 @@ import dateparser ...@@ -3,7 +3,7 @@ import dateparser
import scrapy import scrapy
import traceback import traceback
from scrapy.utils.project import get_project_settings from scrapy.utils.project import get_project_settings
from ..helpers import CompanyMaker, Database from ..helpers import CompanyMaker, Database, QueryMixin
from ..items import ExaItem from ..items import ExaItem
...@@ -11,19 +11,10 @@ db_settings = get_project_settings().get('DB') ...@@ -11,19 +11,10 @@ db_settings = get_project_settings().get('DB')
db = Database(**db_settings) db = Database(**db_settings)
class MobiHealthNewsSpider(scrapy.Spider): class MobiHealthNewsSpider(QueryMixin, scrapy.Spider):
name = "mhn" name = "mhn"
allowed_domains = ["www.mobihealthnews.com"] allowed_domains = ["www.mobihealthnews.com"]
def __init__(self, *args, **kwargs):
self.condition = kwargs.get('query')
self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=3"
if self.condition:
print(self.condition)
self.query += ' or {}'.format(self.condition)
print(self.query)
super(MobiHealthNewsSpider, self).__init__()
def start_requests(self): def start_requests(self):
companies = CompanyMaker(db.select(self.query)) companies = CompanyMaker(db.select(self.query))
companies.make_companies(self.name) companies.make_companies(self.name)
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
import scrapy import scrapy
import traceback import traceback
from scrapy.utils.project import get_project_settings from scrapy.utils.project import get_project_settings
from ..helpers import CompanyMaker, Database from ..helpers import CompanyMaker, Database, QueryMixin
from ..items import ExaItem from ..items import ExaItem
...@@ -10,18 +10,10 @@ db_settings = get_project_settings().get('DB') ...@@ -10,18 +10,10 @@ db_settings = get_project_settings().get('DB')
db = Database(**db_settings) db = Database(**db_settings)
class TechcrunchSpider(scrapy.Spider): class TechcrunchSpider(QueryMixin, scrapy.Spider):
name = "tc" name = "tc"
allowed_domains = ["techcrunch.com"] allowed_domains = ["techcrunch.com"]
def __init__(self, *args, **kwargs):
self.condition = kwargs.get('query')
self.query = "SELECT id, name, country FROM wp_esi_entity WHERE 1 and id=13"
if self.condition:
print(self.condition)
self.query += ' or {}'.format(self.condition)
super(TechcrunchSpider, self).__init__()
def start_requests(self): def start_requests(self):
companies = CompanyMaker(db.select(self.query)) companies = CompanyMaker(db.select(self.query))
companies.make_companies(self.name) companies.make_companies(self.name)
...@@ -32,16 +24,11 @@ class TechcrunchSpider(scrapy.Spider): ...@@ -32,16 +24,11 @@ class TechcrunchSpider(scrapy.Spider):
pass pass
def parse(self, response): def parse(self, response):
print(response.request.headers)
if 'tag' in response.url: if 'tag' in response.url:
return self.parse_tag(response) return self.parse_tag(response)
if 'company' in response.url: if 'company' in response.url:
return self.parse_company(response) return self.parse_company(response)
def get_common_items(self, company):
return {'region_id': company.region_id, 'type_id': company.type_id,
'media_id': company.media_id, 'company_id': company.id}
def parse_tag(self, response): def parse_tag(self, response):
try: try:
news_list = response.xpath("..//div[contains(@class, 'block block-thumb ')]") news_list = response.xpath("..//div[contains(@class, 'block block-thumb ')]")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment