Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
E
esi-table-data
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
esi-data-scrapping
esi-table-data
Commits
7dad524c
Commit
7dad524c
authored
Aug 01, 2017
by
Vasyl Bodnaruk
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add custom settings to CB spider
parent
dd3fcca4
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
60 additions
and
0 deletions
+60
-0
cb.py
exa/exa/spiders/cb.py
+11
-0
update.py
exa/update.py
+49
-0
No files found.
exa/exa/spiders/cb.py
View file @
7dad524c
# -*- coding: utf-8 -*-
import
random
import
scrapy
from
.base
import
BaseSpider
...
...
@@ -11,11 +12,19 @@ class CbSpider(BaseSpider):
name = "cb"
allowed_domains = ["www.crunchbase.com"]
# start_urls = ['http://www.crunchbase.com/organization/sense-ly/press/']

# Per-spider Scrapy settings: heavy throttling, presumably because
# crunchbase.com rate-limits aggressively.
custom_settings = {
    'DOWNLOAD_DELAY': 15,
    'CONCURRENT_REQUESTS': 2,
    'CONCURRENT_REQUESTS_PER_DOMAIN': 2,
    # BUG FIX: the original placed the middleware path directly into
    # custom_settings ('exa.middlewares.SeleniumDownloadMiddleware': 543),
    # where Scrapy silently ignores it — a middleware must be registered
    # under the DOWNLOADER_MIDDLEWARES setting to be enabled.
    'DOWNLOADER_MIDDLEWARES': {
        'exa.middlewares.SeleniumDownloadMiddleware': 543,
    },
}

# Running counter of parsed items (incremented and printed in parse()).
co = 0
def start_requests(self):
    """Yield one crawl request per company registered for this spider.

    Iterates ``self.companies(self.name)``; each company's URL is requested
    with ``parse`` as the callback, and the company object plus a post_id
    counter travel in the request meta.
    """
    for company in self.companies(self.name):
        print(company)  # progress trace
        try:
            # NOTE(review): Scrapy snapshots settings when the crawler is
            # initialised, so mutating custom_settings at runtime most
            # likely does NOT change the live download delay — confirm;
            # RANDOMIZE_DOWNLOAD_DELAY is the supported way to jitter it.
            self.custom_settings['DOWNLOAD_DELAY'] = (
                random.random() * random.randint(1, 15)
            )
            yield scrapy.Request(
                company.url,
                callback=self.parse,
                meta={'company': company, 'post_id': 0},
            )
        except Exception as exc:
            # BUG FIX: was a bare `except: pass`, which also swallowed
            # KeyboardInterrupt/SystemExit and hid every failure. Keep the
            # best-effort skip behaviour but surface the error.
            print('start_requests: skipped {!r}: {!r}'.format(company, exc))
...
...
@@ -31,6 +40,8 @@ class CbSpider(BaseSpider):
item
.
update
(
self
.
get_common_items
(
company
))
item
[
'media_id'
]
=
self
.
_get_media
(
i
)
print
(
item
)
self
.
co
+=
1
print
(
self
.
co
)
if
len
(
rows
)
!=
0
:
yield
scrapy
.
Request
(
self
.
_next_url
(
response
.
url
),
callback
=
self
.
parse
,
meta
=
response
.
meta
)
...
...
exa/update.py
0 → 100644
View file @
7dad524c
import json
import traceback

from newspaper import Article
from scrapy.utils.project import get_project_settings

from exa.esi_news_classification.news_classify_tag import Classifier
from exa.helpers import Database
# Database credentials read from the Scrapy project settings ('DB' key);
# unpacked into Database(**db_cred) by NewsUpdater below.
db_cred = get_project_settings().get('DB')
class NewsUpdater:
    """Re-classifies stored news articles and writes the tags back to the DB.

    Pipeline per article: select (id, url) rows, download and parse the
    article text, classify it, then UPDATE the row's ``tags_id`` column
    with a JSON-encoded list of tag ids.
    """

    def __init__(self):
        # Train the classifier once up front; it is reused for every article.
        self.classifier = Classifier()
        self.classifier.teach_model()
        self.db = Database(**db_cred)

    def select_news(self, query):
        """Run *query* (a SELECT) and return the resulting rows."""
        return self.db.select(query)

    def load_text(self, url):
        """Download the article at *url* and return its extracted plain text."""
        article = Article(url)
        article.download()
        article.parse()
        return article.text

    def get_tags(self, text):
        """Classify *text* and return matched tag ids as a JSON array string.

        After ``classify()`` runs, the classifier exposes its matches on
        ``teg_accordance`` (sic); each entry's first element is the tag id.
        """
        self.classifier.classify(text)
        tags = [match[0] for match in self.classifier.teg_accordance]
        return json.dumps(tags)

    def update_news(self, query):
        """Execute *query* (an UPDATE) against the database."""
        self.db.update(query)

    def update_all(self,
                   query='select id, url from wp_esi_news_accept where id> 80 and id<100'):
        """Tag every article returned by *query* (rows shaped (id, url, ...)).

        The default query preserves the original hard-coded sample range
        (ids 81..99); pass a different SELECT to process other rows.
        Failures on individual articles are logged and skipped.
        """
        for row in self.select_news(query):
            news_id, url = row[0], row[1]
            try:
                text = self.load_text(url)
                tags = self.get_tags(text)
                # WARNING: string-built SQL — switch to parameterized
                # queries if Database supports them. `tags` comes from our
                # own classifier, so the risk is low but non-zero.
                self.update_news(
                    'update wp_esi_news_accept set tags_id="{}" where id={}'
                    .format(tags, news_id))
                print('News id={} was updated'.format(news_id))
            except Exception:
                # BUG FIX: the original called e.with_traceback() with no
                # arguments, which itself raises TypeError and hid the real
                # error. Also narrowed BaseException -> Exception so Ctrl-C
                # still interrupts the run.
                traceback.print_exc()
# Script entry point: build the updater (trains the classifier, connects to
# the DB) and re-tag the default sample range of news rows.
if __name__ == '__main__':
    NewsUpdater().update_all()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment