Add stealth parameter to avoid antibot systems

The aim is to prevent Brozzler detection and blocking by antibot
systems. To do that, we need to run some JS before any other code runs
on page load and mock specific browser attributes which indicate that
Brozzler is a bot.

We add the option `stealth` in `Browser`, `brozzler.cli` and
`BrozzlerWorker`. It is disabled by default.

If enabled, we run `stealth.js` which is executed before anything else
on the page via `Page.addScriptToEvaluateOnNewDocument`.

For now, we mock only the graphics driver attributes.
If this is OK, we can add more antibot evasions in the same script.

There are many antibot tests; we are using this one: https://bot.sannysoft.com/

Inspired mainly by:
https://www.npmjs.com/package/puppeteer-extra-plugin-stealth
This commit is contained in:
Vangelis Banos 2022-06-17 10:53:12 +00:00
parent 1de63f0aea
commit 7a12925004
4 changed files with 43 additions and 6 deletions

View File

@ -425,7 +425,7 @@ class Browser:
screenshot_full_page=False, skip_extract_outlinks=False,
skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False,
page_timeout=300, behavior_timeout=900,
extract_outlinks_timeout=60, download_throughput=-1):
extract_outlinks_timeout=60, download_throughput=-1, stealth=False):
'''
Browses page in browser.
@ -491,7 +491,8 @@ class Browser:
self.configure_browser(
extra_headers=extra_headers,
user_agent=user_agent,
download_throughput=download_throughput)
download_throughput=download_throughput,
stealth=stealth)
self.navigate_to_page(page_url, timeout=page_timeout)
if password:
self.try_login(username, password, timeout=page_timeout)
@ -577,7 +578,8 @@ class Browser:
# run behavior again with short timeout?
# retrieve outlinks again and append to list?
def configure_browser(self, extra_headers=None, user_agent=None, download_throughput=-1):
def configure_browser(self, extra_headers=None, user_agent=None,
download_throughput=-1, stealth=False):
headers = extra_headers or {}
headers['Accept-Encoding'] = 'gzip' # avoid encodings br, sdch
self.websock_thread.expect_result(self._command_id.peek())
@ -596,6 +598,16 @@ class Browser:
# parameter value as bytes/second, or -1 to disable (default)
msg_id = self.send_to_chrome(method='Network.emulateNetworkConditions',
params={'downloadThroughput': download_throughput})
if stealth:
self.websock_thread.expect_result(self._command_id.peek())
js = brozzler.jinja2_environment().get_template('stealth.js').render()
msg_id = self.send_to_chrome(
method='Page.addScriptToEvaluateOnNewDocument',
params={'source': js})
self._wait_for(
lambda: self.websock_thread.received_result(msg_id),
timeout=10)
def navigate_to_page(self, page_url, timeout=300):
self.logger.info('navigating to page %s', page_url)

View File

@ -156,6 +156,9 @@ def brozzle_page(argv=None):
arg_parser.add_argument(
'--browser_throughput', type=int, dest='download_throughput', default=-1,
help='Chrome DevTools downloadThroughput for Network.emulateNetworkConditions')
arg_parser.add_argument(
'--stealth', dest='stealth', action='store_true',
help='Try to avoid web bot detection')
arg_parser.add_argument(
'--screenshot-full-page', dest='screenshot_full_page',
action='store_true')
@ -189,7 +192,8 @@ def brozzle_page(argv=None):
skip_youtube_dl=args.skip_youtube_dl,
simpler404=args.simpler404,
screenshot_full_page=args.screenshot_full_page,
download_throughput=args.download_throughput)
download_throughput=args.download_throughput,
stealth=args.stealth)
def on_screenshot(screenshot_jpeg):
OK_CHARS = string.ascii_letters + string.digits

View File

@ -0,0 +1,19 @@
/**
 * Mock GPU information with real values. Check using: https://bot.sannysoft.com/
 *
 * Replaces getParameter() on the WebGL rendering context prototypes so that
 * the vendor / renderer / version queries listed below return fixed strings.
 * Every other parameter is forwarded to the browser's original implementation.
 *
 * Both WebGLRenderingContext and WebGL2RenderingContext are patched;
 * otherwise a page could read the real GPU through a "webgl2" context.
 */
(function() {
    /**
     * Wrap `getParameter` of one context interface.
     *
     * @param {Function|undefined} contextClass rendering context interface,
     *     or undefined when the browser does not expose it
     * @param {string} version value to report for the VERSION parameter
     */
    function patchGetParameter(contextClass, version) {
        if (typeof contextClass !== "function") {
            return; // interface not available here, nothing to mock
        }
        // Null-prototype object: inherited keys such as "constructor" must
        // not be mistaken for overrides.
        const paramMap = Object.create(null);
        paramMap[0x9245] = "Google Inc. (NVIDIA Corporation)"; // UNMASKED_VENDOR_WEBGL
        paramMap[0x9246] = "ANGLE (NVIDIA Corporation, Quadro P400/PCIe/SSE2, OpenGL 4.5.0)"; // UNMASKED_RENDERER_WEBGL
        paramMap[0x1F00] = "WebKit"; // VENDOR
        paramMap[0x1F01] = "WebKit WebGL"; // RENDERER
        paramMap[0x1F02] = version; // VERSION

        const origFn = contextClass.prototype.getParameter;
        contextClass.prototype.getParameter = function(parameter) {
            // Forward with the caller's `this` so the native method still
            // operates on the real context object.
            return paramMap[parameter] || origFn.call(this, parameter);
        };
    }

    patchGetParameter(
        typeof WebGLRenderingContext !== "undefined" ? WebGLRenderingContext : undefined,
        "WebGL 1.0 (OpenGL ES 2.0 Chromium)");
    patchGetParameter(
        typeof WebGL2RenderingContext !== "undefined" ? WebGL2RenderingContext : undefined,
        "WebGL 2.0 (OpenGL ES 3.0 Chromium)");
})();
// TODO Add many more feature detection evasions here. For example:
// Mock navigator.permissions.query. In headful on secure origins the
// permission should be "default", not "denied".

View File

@ -53,7 +53,7 @@ class BrozzlerWorker:
skip_extract_outlinks=False, skip_visit_hashtags=False,
skip_youtube_dl=False, simpler404=False, screenshot_full_page=False,
page_timeout=300, behavior_timeout=900, extract_outlinks_timeout=60,
download_throughput=-1):
download_throughput=-1, stealth=False):
self._frontier = frontier
self._service_registry = service_registry
self._max_browsers = max_browsers
@ -71,6 +71,7 @@ class BrozzlerWorker:
self._behavior_timeout = behavior_timeout
self._extract_outlinks_timeout = extract_outlinks_timeout
self._download_throughput = download_throughput
self._stealth = stealth
self._browser_pool = brozzler.browser.BrowserPool(
max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True)
@ -311,7 +312,8 @@ class BrozzlerWorker:
page_timeout=self._page_timeout,
behavior_timeout=self._behavior_timeout,
extract_outlinks_timeout=self._extract_outlinks_timeout,
download_throughput=self._download_throughput)
download_throughput=self._download_throughput,
stealth=self._stealth)
if final_page_url != page.url:
page.note_redirect(final_page_url)
return outlinks