Commit b6524c3b authored by Vasyl Bodnaruk's avatar Vasyl Bodnaruk

Add unique news crawl

parent 3aae620c
......@@ -39,12 +39,12 @@ DOWNLOAD_DELAY = 3
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
# Browser-like headers sent with every request (helps avoid trivial bot blocks).
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'uk,ru;q=0.8,en-US;q=0.6,en;q=0.4,de;q=0.2',
'Accept-Encoding': 'gzip, deflate, sdch, br',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
'Upgrade-Insecure-Requests': 1,
# NOTE(review): the commented block below duplicates the active entries above —
# this looks like residue of toggling the headers on/off; confirm which set is
# intended and delete the dead copy.
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
# 'Accept-Language': 'uk,ru;q=0.8,en-US;q=0.6,en;q=0.4,de;q=0.2',
# 'Accept-Encoding': 'gzip, deflate, sdch, br',
# 'Connection': 'keep-alive',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
# 'Upgrade-Insecure-Requests': 1,
}
# Enable or disable spider middlewares
......@@ -56,12 +56,13 @@ DEFAULT_REQUEST_HEADERS = {
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
# None disables Scrapy's stock user-agent middleware so the random-UA
# middleware (priority 400) can set the header instead.
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
# NOTE(review): the two commented lines below duplicate the active entries
# above — diff/toggle residue; confirm whether UA randomization should stay on
# now that Crawlera is enabled.
# 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
# 'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
# 'exa.middlewares.SeleniumDownloadMiddleware': 543,
# 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
# 'scrapy_proxies.RandomProxy': 100,
# 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
# Route requests through the Crawlera proxy service (late in the chain).
'scrapy_crawlera.CrawleraMiddleware': 610
}
# Enable or disable extensions
......@@ -112,7 +113,6 @@ MIDDLEWARE = {
# PROXY
import os
import sys
# Directory containing this settings module; used to locate the proxy list file.
PROXY_LIST_DIR = os.path.dirname(os.path.abspath(__file__))
# Retry each failed request up to 10 times, but only for HTTP 416
# (Range Not Satisfiable) — other error codes are not retried.
RETRY_TIMES = 10
RETRY_HTTP_CODES = [416]
......
......@@ -12,19 +12,11 @@ class CbSpider(BaseSpider):
name = "cb"
allowed_domains = ["www.crunchbase.com"]
# start_urls = ['http://www.crunchbase.com/organization/sense-ly/press/']
# Per-spider setting overrides: Crunchbase rate-limits aggressively, so crawl
# slowly and with minimal concurrency.
custom_settings = {
    'DOWNLOAD_DELAY': 15,
    'CONCURRENT_REQUESTS': 2,
    'CONCURRENT_REQUESTS_PER_DOMAIN': 2,
    # BUG FIX: a middleware path is not itself a Scrapy setting name, so
    # placing it at the top level of custom_settings was silently ignored.
    # It must be nested under the DOWNLOADER_MIDDLEWARES setting to load.
    'DOWNLOADER_MIDDLEWARES': {
        'exa.middlewares.SeleniumDownloadMiddleware': 543,
    },
}
# Debug counter of items processed so far (printed from parse()).
co = 0
def start_requests(self):
    """Yield the initial press-page request for every company tracked under this spider's name.

    One request per company returned by ``self.companies(self.name)``; each request
    carries the company object and a starting ``post_id`` of 0 in its meta.
    """
    for company in self.companies(self.name):
        print(company)
        try:
            # NOTE(review): mutating custom_settings at runtime has no effect on
            # the live crawl — Scrapy reads custom_settings once, before the
            # spider starts. Kept for compatibility; consider removing.
            self.custom_settings['DOWNLOAD_DELAY'] = random.random() * random.randint(1, 15)
            yield scrapy.Request(company.url, callback=self.parse,
                                 meta={'company': company, 'post_id': 0})
        except Exception:
            # BUG FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit. A single bad company record should
            # be skipped without aborting the whole crawl.
            pass
......@@ -32,6 +24,7 @@ class CbSpider(BaseSpider):
def parse(self, response):
# Parse one press-coverage listing page: build an ExaItem per table row
# (skipping the header row), stop at the first already-seen URL, and follow
# pagination while new items keep appearing.
rows = response.xpath("//table/tr")[1:]
company = response.meta['company']
is_duplicate = False
for i in rows:
item = ExaItem()
# Raw date text from the row's date cell (format not normalized here).
item['date'] = i.xpath("./td[contains(@class, 'date')]/text()").extract_first()
......@@ -39,12 +32,20 @@ class CbSpider(BaseSpider):
item['url'] = i.xpath("./td/a/@href").extract_first()
item.update(self.get_common_items(company))
item['media_id'] = self._get_media(i)
item['description'] = None
item['post_id'] = response.meta['post_id']
item['tags'] = None
# NOTE(review): the debug prints and the first pagination yield below appear
# to be old diff lines merged with the new duplicate-aware logic that follows
# — confirm which branch the committed file actually contains.
print(item)
self.co += 1
print(self.co)
if len(rows) != 0:
yield scrapy.Request(self._next_url(response.url), callback=self.parse, meta=response.meta)
# Stop once a known URL is seen: only newly published items are crawled.
if self.pipeline.check_url(item['url']):
is_duplicate = True
break
yield item
next_url = self._next_url(response.url)
# Follow the next page only when this page had rows and no duplicate was hit.
if len(rows) != 0 and self.can_follow(next_url, is_duplicate):
yield scrapy.Request(next_url, callback=self.parse, meta=response.meta)
def _get_media(self, elem):
media_name = elem.xpath("./td[contains(@class, 'article')]/span/text()").extract_first()
......
......@@ -3,6 +3,7 @@ scrapy-fake-useragent==1.1.0
python-scrapyd-api==2.0.1
scrapyd-client==1.1.0
scrapy-proxies==0.3
scrapy-crawlera==1.2.4
newspaper3k==0.2.2
PyVirtualDisplay==0.2.1
selenium==3.4.1
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment