Commit 7dad524c authored by Vasyl Bodnaruk

Add custom settings to CB spider

parent dd3fcca4
# -*- coding: utf-8 -*-
import random
import scrapy
from .base import BaseSpider

@@ -11,11 +12,19 @@ class CbSpider(BaseSpider):
    name = "cb"
    allowed_domains = ["www.crunchbase.com"]
    # start_urls = ['http://www.crunchbase.com/organization/sense-ly/press/']
    custom_settings = {
        'DOWNLOAD_DELAY': 15,
        'CONCURRENT_REQUESTS': 2,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 2,
        # middleware entries only take effect when nested under DOWNLOADER_MIDDLEWARES
        'DOWNLOADER_MIDDLEWARES': {
            'exa.middlewares.SeleniumDownloadMiddleware': 543,
        },
    }
    co = 0  # running count of scraped items, printed for debugging
    def start_requests(self):
        for i in self.companies(self.name):
            print(i)
            try:
                # custom_settings is read once at crawler startup, so mutating it here has no
                # effect; set the spider's download_delay attribute instead to vary the delay
                self.download_delay = random.random() * random.randint(1, 15)
                yield scrapy.Request(i.url, callback=self.parse, meta={'company': i, 'post_id': 0})
            except Exception:
                pass
@@ -31,6 +40,8 @@ class CbSpider(BaseSpider):
            item.update(self.get_common_items(company))
            item['media_id'] = self._get_media(i)
            print(item)
            self.co += 1
            print(self.co)
        if len(rows) != 0:
            yield scrapy.Request(self._next_url(response.url), callback=self.parse, meta=response.meta)
......
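The settings above enable exa.middlewares.SeleniumDownloadMiddleware, which is not part of this commit. As a rough sketch only (the actual class in exa.middlewares may differ; headless Chrome is an assumption), a Scrapy downloader middleware that renders pages with Selenium typically looks like this:

    # Hypothetical sketch; the real exa.middlewares.SeleniumDownloadMiddleware may differ.
    from scrapy.http import HtmlResponse
    from selenium import webdriver

    class SeleniumDownloadMiddleware:
        def __init__(self):
            options = webdriver.ChromeOptions()
            options.add_argument('--headless')
            self.driver = webdriver.Chrome(options=options)

        def process_request(self, request, spider):
            # render the page in a real browser, then hand Scrapy the resulting HTML
            self.driver.get(request.url)
            return HtmlResponse(self.driver.current_url, body=self.driver.page_source,
                                encoding='utf-8', request=request)

Returning a Response from process_request makes Scrapy skip the regular downloader for that request, so the spider's parse callback receives the browser-rendered page.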
import json
import traceback

from scrapy.utils.project import get_project_settings
from newspaper import Article

from exa.helpers import Database
from exa.esi_news_classification.news_classify_tag import Classifier

db_cred = get_project_settings().get('DB')
class NewsUpdater:
    def __init__(self):
        self.classifier = Classifier()
        self.classifier.teach_model()
        self.db = Database(**db_cred)

    def select_news(self, query):
        return self.db.select(query)

    def load_text(self, url):
        article = Article(url)
        article.download()
        article.parse()
        return article.text

    def get_tags(self, text):
        # classify() populates classifier.teg_accordance; collect the tag identifiers from it
        self.classifier.classify(text)
        tags = [i[0] for i in self.classifier.teg_accordance]
        return json.dumps(tags)

    def update_news(self, query):
        self.db.update(query)
    # quick-and-dirty batch update over a hard-coded id range
    def update_all(self):
        for i in self.select_news('select id, url from wp_esi_news_accept where id> 80 and id<100'):
            try:
                text = self.load_text(i[1])
                tags = self.get_tags(text)
                self.update_news('update wp_esi_news_accept set tags_id="{}" where id={}'.format(tags, i[0]))
                print('News id={} was updated'.format(i[0]))
            except BaseException:
                # e.with_traceback() requires a traceback argument; print the full traceback instead
                traceback.print_exc()


if __name__ == '__main__':
    ml = NewsUpdater()
    ml.update_all()