diff --git a/brozzler/browser.py b/brozzler/browser.py index 8692e1d..0cda56e 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -425,7 +425,7 @@ class Browser: screenshot_full_page=False, skip_extract_outlinks=False, skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False, page_timeout=300, behavior_timeout=900, - extract_outlinks_timeout=60, download_throughput=-1): + extract_outlinks_timeout=60, download_throughput=-1, stealth=False): ''' Browses page in browser. @@ -491,7 +491,8 @@ class Browser: self.configure_browser( extra_headers=extra_headers, user_agent=user_agent, - download_throughput=download_throughput) + download_throughput=download_throughput, + stealth=stealth) self.navigate_to_page(page_url, timeout=page_timeout) if password: self.try_login(username, password, timeout=page_timeout) @@ -577,7 +578,8 @@ class Browser: # run behavior again with short timeout? # retrieve outlinks again and append to list? - def configure_browser(self, extra_headers=None, user_agent=None, download_throughput=-1): + def configure_browser(self, extra_headers=None, user_agent=None, + download_throughput=-1, stealth=False): headers = extra_headers or {} headers['Accept-Encoding'] = 'gzip' # avoid encodings br, sdch self.websock_thread.expect_result(self._command_id.peek()) @@ -596,6 +598,16 @@ class Browser: # parameter value as bytes/second, or -1 to disable (default) msg_id = self.send_to_chrome(method='Network.emulateNetworkConditions', params={'downloadThroughput': download_throughput}) + if stealth: + self.websock_thread.expect_result(self._command_id.peek()) + js = brozzler.jinja2_environment().get_template('stealth.js').render() + msg_id = self.send_to_chrome( + method='Page.addScriptToEvaluateOnNewDocument', + params={'source': js}) + self._wait_for( + lambda: self.websock_thread.received_result(msg_id), + timeout=10) + def navigate_to_page(self, page_url, timeout=300): self.logger.info('navigating to page %s', page_url) diff --git 
a/brozzler/cli.py b/brozzler/cli.py index fb973e3..e3bc93c 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -156,6 +156,9 @@ def brozzle_page(argv=None): arg_parser.add_argument( '--browser_throughput', type=int, dest='download_throughput', default=-1, help='Chrome DevTools downloadThroughput for Network.emulateNetworkConditions') + arg_parser.add_argument( + '--stealth', dest='stealth', action='store_true', + help='Try to avoid web bot detection') arg_parser.add_argument( '--screenshot-full-page', dest='screenshot_full_page', action='store_true') @@ -189,7 +192,8 @@ def brozzle_page(argv=None): skip_youtube_dl=args.skip_youtube_dl, simpler404=args.simpler404, screenshot_full_page=args.screenshot_full_page, - download_throughput=args.download_throughput) + download_throughput=args.download_throughput, + stealth=args.stealth) def on_screenshot(screenshot_jpeg): OK_CHARS = string.ascii_letters + string.digits diff --git a/brozzler/js-templates/stealth.js b/brozzler/js-templates/stealth.js new file mode 100644 index 0000000..dee3882 --- /dev/null +++ b/brozzler/js-templates/stealth.js @@ -0,0 +1,19 @@ +/** + * Mock GPU information with real values. Check using: https://bot.sannysoft.com/ + */ +WebGLRenderingContext.prototype.getParameter = function(origFn) { + const paramMap = {}; + paramMap[0x9245] = "Google Inc. (NVIDIA Corporation)"; // UNMASKED_VENDOR_WEBGL + paramMap[0x9246] = "ANGLE (NVIDIA Corporation, Quadro P400/PCIe/SSE2, OpenGL 4.5.0)"; // UNMASKED_RENDERER_WEBGL + paramMap[0x1F00] = "WebKit"; // VENDOR + paramMap[0x1F01] = "WebKit WebGL"; // RENDERER + paramMap[0x1F02] = "WebGL 1.0 (OpenGL ES 2.0 Chromium)"; // VERSION + + return function(parameter) { + return paramMap[parameter] || origFn.call(this, parameter); + }; +}(WebGLRenderingContext.prototype.getParameter); + +// TODO Add many more feature detection evasions here. For example: +// Mock navigator.permissions.query. 
In headful on secure origins the +// permission should be "default", not "denied". diff --git a/brozzler/worker.py b/brozzler/worker.py index f631ced..8aa6083 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -53,7 +53,7 @@ class BrozzlerWorker: skip_extract_outlinks=False, skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False, screenshot_full_page=False, page_timeout=300, behavior_timeout=900, extract_outlinks_timeout=60, - download_throughput=-1): + download_throughput=-1, stealth=False): self._frontier = frontier self._service_registry = service_registry self._max_browsers = max_browsers @@ -71,6 +71,7 @@ class BrozzlerWorker: self._behavior_timeout = behavior_timeout self._extract_outlinks_timeout = extract_outlinks_timeout self._download_throughput = download_throughput + self._stealth = stealth self._browser_pool = brozzler.browser.BrowserPool( max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True) @@ -311,7 +312,8 @@ class BrozzlerWorker: page_timeout=self._page_timeout, behavior_timeout=self._behavior_timeout, extract_outlinks_timeout=self._extract_outlinks_timeout, - download_throughput=self._download_throughput) + download_throughput=self._download_throughput, + stealth=self._stealth) if final_page_url != page.url: page.note_redirect(final_page_url) return outlinks