Commit b6524c3b authored by Vasyl Bodnaruk

Add unique news crawl

parent 3aae620c
@@ -39,12 +39,12 @@ DOWNLOAD_DELAY = 3
 # Override the default request headers:
 DEFAULT_REQUEST_HEADERS = {
-    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-    'Accept-Language': 'uk,ru;q=0.8,en-US;q=0.6,en;q=0.4,de;q=0.2',
-    'Accept-Encoding': 'gzip, deflate, sdch, br',
-    'Connection': 'keep-alive',
-    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
-    'Upgrade-Insecure-Requests': 1,
+    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+    # 'Accept-Language': 'uk,ru;q=0.8,en-US;q=0.6,en;q=0.4,de;q=0.2',
+    # 'Accept-Encoding': 'gzip, deflate, sdch, br',
+    # 'Connection': 'keep-alive',
+    # 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
+    # 'Upgrade-Insecure-Requests': 1,
 }
 # Enable or disable spider middlewares
@@ -56,12 +56,13 @@ DEFAULT_REQUEST_HEADERS = {
 # Enable or disable downloader middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
 DOWNLOADER_MIDDLEWARES = {
-    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
-    'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
+    # 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
+    # 'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
     # 'exa.middlewares.SeleniumDownloadMiddleware': 543,
     # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
     # 'scrapy_proxies.RandomProxy': 100,
     # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
+    'scrapy_crawlera.CrawleraMiddleware': 610
 }
 # Enable or disable extensions
@@ -112,7 +113,6 @@ MIDDLEWARE = {
 # PROXY
 import os
-import sys
 PROXY_LIST_DIR = os.path.dirname(os.path.abspath(__file__))
 RETRY_TIMES = 10
 RETRY_HTTP_CODES = [416]
...
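Note: registering scrapy_crawlera.CrawleraMiddleware by itself does not route any traffic through Crawlera; the middleware stays inactive until it is enabled and given credentials, which this commit does not show. A minimal sketch of the companion settings (the API key is a placeholder, not part of the project):

    # Sketch of the settings scrapy-crawlera expects alongside the
    # middleware registration above; the key value is a placeholder.
    CRAWLERA_ENABLED = True
    CRAWLERA_APIKEY = '<crawlera-api-key>'

Routing through Crawlera also explains the headers commented out above: Crawlera manages user agents and request headers itself, so the hard-coded browser headers and the fake-useragent middleware become redundant. Both settings can equally be supplied per run via Scrapy's -s command-line overrides, which keeps credentials out of the repository.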
@@ -12,19 +12,11 @@ class CbSpider(BaseSpider):
     name = "cb"
     allowed_domains = ["www.crunchbase.com"]
     # start_urls = ['http://www.crunchbase.com/organization/sense-ly/press/']
-    custom_settings = {
-        'DOWNLOAD_DELAY': 15,
-        'CONCURRENT_REQUESTS': 2,
-        'CONCURRENT_REQUESTS_PER_DOMAIN': 2,
-        'exa.middlewares.SeleniumDownloadMiddleware': 543
-    }
     co = 0

     def start_requests(self):
         for i in self.companies(self.name):
-            print(i)
             try:
-                self.custom_settings['DOWNLOAD_DELAY'] = random.random() * random.randint(1, 15)
                 yield scrapy.Request(i.url, callback=self.parse, meta={'company': i, 'post_id': 0})
             except:
                 pass
@@ -32,6 +24,7 @@ class CbSpider(BaseSpider):
     def parse(self, response):
         rows = response.xpath("//table/tr")[1:]
         company = response.meta['company']
+        is_duplicate = False
         for i in rows:
             item = ExaItem()
             item['date'] = i.xpath("./td[contains(@class, 'date')]/text()").extract_first()
@@ -39,12 +32,20 @@ class CbSpider(BaseSpider):
             item['url'] = i.xpath("./td/a/@href").extract_first()
             item.update(self.get_common_items(company))
             item['media_id'] = self._get_media(i)
+            item['description'] = None
+            item['post_id'] = response.meta['post_id']
+            item['tags'] = None
             print(item)
             self.co += 1
             print(self.co)
-            if len(rows) != 0:
-                yield scrapy.Request(self._next_url(response.url), callback=self.parse, meta=response.meta)
+            if self.pipeline.check_url(item['url']):
+                is_duplicate = True
+                break
+            yield item
+
+        next_url = self._next_url(response.url)
+        if len(rows) != 0 and self.can_follow(next_url, is_duplicate):
+            yield scrapy.Request(next_url, callback=self.parse, meta=response.meta)

     def _get_media(self, elem):
         media_name = elem.xpath("./td[contains(@class, 'article')]/span/text()").extract_first()
...
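The reworked parse() leans on two helpers that are not part of this diff: self.pipeline.check_url() and self.can_follow(), presumably defined on BaseSpider and its pipeline. A hypothetical sketch of the contract they would need to satisfy for the unique-crawl behaviour (the names, the set-backed store, and the class layout are assumptions, not the project's actual code):

    import scrapy

    # Hypothetical stand-ins for the helpers parse() calls; the real
    # implementations live outside this diff.
    class UrlDedupPipeline:
        def __init__(self):
            # A plain set stands in for the project's real URL store.
            self.seen = set()

        def check_url(self, url):
            # True means this URL was stored by an earlier crawl, so the
            # spider has reached old news for this company.
            return url in self.seen

    class BaseSpider(scrapy.Spider):
        pipeline = UrlDedupPipeline()

        def can_follow(self, next_url, is_duplicate):
            # Paginate only while a next page exists and no already-seen
            # item was hit on the current page.
            return next_url is not None and not is_duplicate

Under that contract, breaking out of the row loop on the first known URL and then declining to follow the next page is what makes the crawl "unique": each run walks a company's press pages only until it reaches items saved by a previous run.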
@@ -3,6 +3,7 @@ scrapy-fake-useragent==1.1.0
 python-scrapyd-api==2.0.1
 scrapyd-client==1.1.0
 scrapy-proxies==0.3
+scrapy-crawlera==1.2.4
 newspaper3k==0.2.2
 PyVirtualDisplay==0.2.1
 selenium==3.4.1
...