Commit 07793329 authored by Andrii Marynets

Add .lua script for using Crawlera

parent 39c0c968
-- myscript.lua
function main(splash)
  -- begin example from crawlera docs
  local host = "proxy.crawlera.com"
  local port = 8010
  local user = splash.args.apikey
  local password = ""
  local session_header = "X-Crawlera-Session"
  local session_id = "create"

  -- route every outgoing request through Crawlera with the current session id
  splash:on_request(function (request)
    request:set_header(session_header, session_id)
    request:set_proxy{host, port, username=user, password=password}
  end)

  -- Crawlera returns the assigned session id in its response headers;
  -- keep it so subsequent requests reuse the same session
  splash:on_response_headers(function (response)
    if response.headers[session_header] ~= nil then
      session_id = response.headers[session_header]
    end
  end)
  -- end example from crawlera docs

  -- customized render script inspired by scrapy-splash examples
  splash:init_cookies(splash.args.cookies)
  assert(splash:go{
    splash.args.url,
    headers=splash.args.headers,
    http_method=splash.args.http_method,
    body=splash.args.body,
  })
  assert(splash:wait(0.5))

  local entries = splash:history()
  local last_response = entries[#entries].response
  return {
    url = splash:url(),
    headers = last_response.headers,
    http_status = last_response.status,
    cookies = splash:get_cookies(),
    html = splash:html(),
  }
end
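This render script is meant to be driven by scrapy-splash, which supplies splash.args.url, cookies, headers, http_method and body from the originating Scrapy request; only the apikey argument has to be provided explicitly. A minimal call-site sketch, assuming scrapy-splash is installed and configured per its README (the spider name, target URL and the way the key is supplied are illustrative, not part of this commit):

import scrapy
from scrapy_splash import SplashRequest

with open('myscript.lua') as f:
    LUA_SOURCE = f.read()

class MySpider(scrapy.Spider):
    name = 'myspider'  # illustrative name

    def start_requests(self):
        yield SplashRequest(
            'http://example.com',       # illustrative URL
            self.parse,
            endpoint='execute',         # run the Lua script via Splash's /execute endpoint
            cache_args=['lua_source'],  # let Splash cache the script instead of resending it
            args={
                'lua_source': LUA_SOURCE,
                'apikey': 'YOUR_CRAWLERA_APIKEY',  # becomes splash.args.apikey
            },
        )

    def parse(self, response):
        # the table returned by main() comes back as JSON; scrapy-splash maps
        # its html/url/headers/cookies/http_status keys onto this response
        yield {'url': response.url, 'html_length': len(response.text)}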
function use_crawlera(splash)
  -- Make sure you pass your Crawlera API key in the 'crawlera_user' arg.
  -- Have a look at the file spiders/quotes-js.py to see how to do it.
  -- Find your Crawlera credentials at https://app.scrapinghub.com/
  local user = splash.args.crawlera_user
  local host = 'proxy.crawlera.com'
  local port = 8010
  local session_header = 'X-Crawlera-Session'
  local session_id = 'create'

  splash:on_request(function (request)
    -- The commented-out rules below can speed up crawling: they filter
    -- requests to undesired domains and useless resources. Uncomment the
    -- ones that make sense for your use case and add your own rules.

    -- Discard requests to advertising and tracking domains.
    -- if string.find(request.url, 'doubleclick%.net') or
    --    string.find(request.url, 'analytics%.google%.com') then
    --   request.abort()
    --   return
    -- end

    -- Avoid using Crawlera for fetching subresources, to increase crawling
    -- speed. The example below skips Crawlera for URLs starting with
    -- 'static.' and for ones ending with '.png'.
    -- if string.find(request.url, '://static%.') ~= nil or
    --    string.find(request.url, '%.png$') ~= nil then
    --   return
    -- end

    request:set_header('X-Crawlera-Cookies', 'disable')
    request:set_header(session_header, session_id)
    request:set_proxy{host, port, username=user, password=''}
  end)

  -- store the session id Crawlera assigns so later requests reuse it
  splash:on_response_headers(function (response)
    if response.headers[session_header] ~= nil then
      session_id = response.headers[session_header]
    end
  end)
end

function main(splash)
  use_crawlera(splash)
  splash:go(splash.args.url)
  return splash:html()
end
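The use_crawlera helper above expects the API key as splash.args.crawlera_user; its comments point to spiders/quotes-js.py for how to pass it. A sketch in that spirit, assuming the script is saved next to the spider and the key lives in a CRAWLERA_APIKEY setting (file path, spider name and settings key are illustrative):

import scrapy
from scrapy_splash import SplashRequest

with open('crawlera.lua') as f:  # illustrative path to the script above
    CRAWLERA_SCRIPT = f.read()

class QuotesJsSpider(scrapy.Spider):
    name = 'quotes-js'

    def start_requests(self):
        yield SplashRequest(
            'http://quotes.toscrape.com/js/',
            self.parse,
            endpoint='execute',
            args={
                'lua_source': CRAWLERA_SCRIPT,
                # read from settings; becomes splash.args.crawlera_user
                'crawlera_user': self.settings.get('CRAWLERA_APIKEY'),
            },
        )

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {'text': quote.css('span.text::text').get()}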