From 7a12925004a5b39dcedc03cdfb1f575f924336ce Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Fri, 17 Jun 2022 10:53:12 +0000 Subject: [PATCH] Add stealth parameter to avoid antibot systems The aim is to prevent Brozzler detection and blocking by antibot systems. To do that, we need to run some JS before any other code runs on page load and mock specific browser attributes which indicate that Brozzler is a bot. We add the option `stealth` in `Browser`, `brozzler.cli` and `BrozzlerWorker`. It is disabled by default. If enabled, we run `stealth.js` which is executed before anything else on the page via `Page.addScriptToEvaluateOnNewDocument`. For now, we mock only the graphics driver attributes. If this is OK, we can add more antibot evasions in the same script. There are many antibot tests; we are using this one: https://bot.sannysoft.com/ Inspired mainly by: https://www.npmjs.com/package/puppeteer-extra-plugin-stealth --- brozzler/browser.py | 18 +++++++++++++++--- brozzler/cli.py | 6 +++++- brozzler/js-templates/stealth.js | 19 +++++++++++++++++++ brozzler/worker.py | 6 ++++-- 4 files changed, 43 insertions(+), 6 deletions(-) create mode 100644 brozzler/js-templates/stealth.js diff --git a/brozzler/browser.py b/brozzler/browser.py index 8692e1d..0cda56e 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -425,7 +425,7 @@ class Browser: screenshot_full_page=False, skip_extract_outlinks=False, skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False, page_timeout=300, behavior_timeout=900, - extract_outlinks_timeout=60, download_throughput=-1): + extract_outlinks_timeout=60, download_throughput=-1, stealth=False): ''' Browses page in browser. 
@@ -491,7 +491,8 @@ class Browser: self.configure_browser( extra_headers=extra_headers, user_agent=user_agent, - download_throughput=download_throughput) + download_throughput=download_throughput, + stealth=stealth) self.navigate_to_page(page_url, timeout=page_timeout) if password: self.try_login(username, password, timeout=page_timeout) @@ -577,7 +578,8 @@ class Browser: # run behavior again with short timeout? # retrieve outlinks again and append to list? - def configure_browser(self, extra_headers=None, user_agent=None, download_throughput=-1): + def configure_browser(self, extra_headers=None, user_agent=None, + download_throughput=-1, stealth=False): headers = extra_headers or {} headers['Accept-Encoding'] = 'gzip' # avoid encodings br, sdch self.websock_thread.expect_result(self._command_id.peek()) @@ -596,6 +598,16 @@ class Browser: # parameter value as bytes/second, or -1 to disable (default) msg_id = self.send_to_chrome(method='Network.emulateNetworkConditions', params={'downloadThroughput': download_throughput}) + if stealth: + self.websock_thread.expect_result(self._command_id.peek()) + js = brozzler.jinja2_environment().get_template('stealth.js').render() + msg_id = self.send_to_chrome( + method='Page.addScriptToEvaluateOnNewDocument', + params={'source': js}) + self._wait_for( + lambda: self.websock_thread.received_result(msg_id), + timeout=10) + def navigate_to_page(self, page_url, timeout=300): self.logger.info('navigating to page %s', page_url) diff --git a/brozzler/cli.py b/brozzler/cli.py index fb973e3..e3bc93c 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -156,6 +156,9 @@ def brozzle_page(argv=None): arg_parser.add_argument( '--browser_throughput', type=int, dest='download_throughput', default=-1, help='Chrome DevTools downloadThroughput for Network.emulateNetworkConditions') + arg_parser.add_argument( + '--stealth', dest='stealth', action='store_true', + help='Try to avoid web bot detection') arg_parser.add_argument( 
'--screenshot-full-page', dest='screenshot_full_page', action='store_true') @@ -189,7 +192,8 @@ def brozzle_page(argv=None): skip_youtube_dl=args.skip_youtube_dl, simpler404=args.simpler404, screenshot_full_page=args.screenshot_full_page, - download_throughput=args.download_throughput) + download_throughput=args.download_throughput, + stealth=args.stealth) def on_screenshot(screenshot_jpeg): OK_CHARS = string.ascii_letters + string.digits diff --git a/brozzler/js-templates/stealth.js b/brozzler/js-templates/stealth.js new file mode 100644 index 0000000..dee3882 --- /dev/null +++ b/brozzler/js-templates/stealth.js @@ -0,0 +1,19 @@ +/** + * Mock GPU information with real values. Check using: https://bot.sannysoft.com/ + */ +WebGLRenderingContext.prototype.getParameter = function(origFn) { + const paramMap = {}; + paramMap[0x9245] = "Google Inc. (NVIDIA Corporation)"; // UNMASKED_VENDOR_WEBGL + paramMap[0x9246] = "ANGLE (NVIDIA Corporation, Quadro P400/PCIe/SSE2, OpenGL 4.5.0)"; // UNMASKED_RENDERER_WEBGL + paramMap[0x1F00] = "WebKit"; // VENDOR + paramMap[0x1F01] = "WebKit WebGL"; // RENDERER + paramMap[0x1F02] = "WebGL 1.0 (OpenGL ES 2.0 Chromium)"; // VERSION + + return function(parameter) { + return paramMap[parameter] || origFn.call(this, parameter); + }; +}(WebGLRenderingContext.prototype.getParameter); + +// TODO Add many more feature detection evasions here. For example: +// Mock navigator.permissions.query. In headful on secure origins the +// permission should be "default", not "denied". 
diff --git a/brozzler/worker.py b/brozzler/worker.py index f631ced..8aa6083 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -53,7 +53,7 @@ class BrozzlerWorker: skip_extract_outlinks=False, skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False, screenshot_full_page=False, page_timeout=300, behavior_timeout=900, extract_outlinks_timeout=60, - download_throughput=-1): + download_throughput=-1, stealth=False): self._frontier = frontier self._service_registry = service_registry self._max_browsers = max_browsers @@ -71,6 +71,7 @@ class BrozzlerWorker: self._behavior_timeout = behavior_timeout self._extract_outlinks_timeout = extract_outlinks_timeout self._download_throughput = download_throughput + self._stealth = stealth self._browser_pool = brozzler.browser.BrowserPool( max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True) @@ -311,7 +312,8 @@ class BrozzlerWorker: page_timeout=self._page_timeout, behavior_timeout=self._behavior_timeout, extract_outlinks_timeout=self._extract_outlinks_timeout, - download_throughput=self._download_throughput) + download_throughput=self._download_throughput, + stealth=self._stealth) if final_page_url != page.url: page.note_redirect(final_page_url) return outlinks