mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
Add stealth parameter to avoid antibot systems
The aim is to prevent Brozzler from being detected and blocked by antibot systems. To do that, we need to run some JS before any other code runs on page load and mock specific browser attributes which indicate that Brozzler is a bot. We add the option `stealth` to `Browser`, `brozzler.cli` and `BrozzlerWorker`. It is disabled by default. If enabled, we run `stealth.js`, which is executed before anything else on the page via `Page.addScriptToEvaluateOnNewDocument`. For now, we mock only the graphics driver attributes. If this is OK, we can add more antibot evasions in the same script. There are many antibot tests; we are using this one: https://bot.sannysoft.com/ Inspired mainly by: https://www.npmjs.com/package/puppeteer-extra-plugin-stealth
This commit is contained in:
parent
1de63f0aea
commit
7a12925004
@ -425,7 +425,7 @@ class Browser:
|
||||
screenshot_full_page=False, skip_extract_outlinks=False,
|
||||
skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False,
|
||||
page_timeout=300, behavior_timeout=900,
|
||||
extract_outlinks_timeout=60, download_throughput=-1):
|
||||
extract_outlinks_timeout=60, download_throughput=-1, stealth=False):
|
||||
'''
|
||||
Browses page in browser.
|
||||
|
||||
@ -491,7 +491,8 @@ class Browser:
|
||||
self.configure_browser(
|
||||
extra_headers=extra_headers,
|
||||
user_agent=user_agent,
|
||||
download_throughput=download_throughput)
|
||||
download_throughput=download_throughput,
|
||||
stealth=stealth)
|
||||
self.navigate_to_page(page_url, timeout=page_timeout)
|
||||
if password:
|
||||
self.try_login(username, password, timeout=page_timeout)
|
||||
@ -577,7 +578,8 @@ class Browser:
|
||||
# run behavior again with short timeout?
|
||||
# retrieve outlinks again and append to list?
|
||||
|
||||
def configure_browser(self, extra_headers=None, user_agent=None, download_throughput=-1):
|
||||
def configure_browser(self, extra_headers=None, user_agent=None,
|
||||
download_throughput=-1, stealth=False):
|
||||
headers = extra_headers or {}
|
||||
headers['Accept-Encoding'] = 'gzip' # avoid encodings br, sdch
|
||||
self.websock_thread.expect_result(self._command_id.peek())
|
||||
@ -596,6 +598,16 @@ class Browser:
|
||||
# parameter value as bytes/second, or -1 to disable (default)
|
||||
msg_id = self.send_to_chrome(method='Network.emulateNetworkConditions',
|
||||
params={'downloadThroughput': download_throughput})
|
||||
if stealth:
|
||||
self.websock_thread.expect_result(self._command_id.peek())
|
||||
js = brozzler.jinja2_environment().get_template('stealth.js').render()
|
||||
msg_id = self.send_to_chrome(
|
||||
method='Page.addScriptToEvaluateOnNewDocument',
|
||||
params={'source': js})
|
||||
self._wait_for(
|
||||
lambda: self.websock_thread.received_result(msg_id),
|
||||
timeout=10)
|
||||
|
||||
|
||||
def navigate_to_page(self, page_url, timeout=300):
|
||||
self.logger.info('navigating to page %s', page_url)
|
||||
|
@ -156,6 +156,9 @@ def brozzle_page(argv=None):
|
||||
arg_parser.add_argument(
|
||||
'--browser_throughput', type=int, dest='download_throughput', default=-1,
|
||||
help='Chrome DevTools downloadThroughput for Network.emulateNetworkConditions')
|
||||
arg_parser.add_argument(
|
||||
'--stealth', dest='stealth', action='store_true',
|
||||
help='Try to avoid web bot detection')
|
||||
arg_parser.add_argument(
|
||||
'--screenshot-full-page', dest='screenshot_full_page',
|
||||
action='store_true')
|
||||
@ -189,7 +192,8 @@ def brozzle_page(argv=None):
|
||||
skip_youtube_dl=args.skip_youtube_dl,
|
||||
simpler404=args.simpler404,
|
||||
screenshot_full_page=args.screenshot_full_page,
|
||||
download_throughput=args.download_throughput)
|
||||
download_throughput=args.download_throughput,
|
||||
stealth=args.stealth)
|
||||
|
||||
def on_screenshot(screenshot_jpeg):
|
||||
OK_CHARS = string.ascii_letters + string.digits
|
||||
|
19
brozzler/js-templates/stealth.js
Normal file
19
brozzler/js-templates/stealth.js
Normal file
@ -0,0 +1,19 @@
|
||||
/**
|
||||
* Mock GPU information with real values. Check using: https://bot.sannysoft.com/
|
||||
*/
|
||||
WebGLRenderingContext.prototype.getParameter = function(origFn) {
|
||||
const paramMap = {};
|
||||
paramMap[0x9245] = "Google Inc. (NVIDIA Corporation)"; // UNMASKED_VENDOR_WEBGL
|
||||
paramMap[0x9246] = "ANGLE (NVIDIA Corporation, Quadro P400/PCIe/SSE2, OpenGL 4.5.0)"; // UNMASKED_RENDERER_WEBGL
|
||||
paramMap[0x1F00] = "WebKit"; // VENDOR
|
||||
paramMap[0x1F01] = "WebKit WebGL"; // RENDERER
|
||||
paramMap[0x1F02] = "WebGL 1.0 (OpenGL ES 2.0 Chromium)"; // VERSION
|
||||
|
||||
return function(parameter) {
|
||||
return paramMap[parameter] || origFn.call(this, parameter);
|
||||
};
|
||||
}(WebGLRenderingContext.prototype.getParameter);
|
||||
|
||||
// TODO Add many more feature detection evasions here. For example:
|
||||
// Mock navigator.permissions.query. In headful on secure origins the
|
||||
// permission should be "default", not "denied".
|
@ -53,7 +53,7 @@ class BrozzlerWorker:
|
||||
skip_extract_outlinks=False, skip_visit_hashtags=False,
|
||||
skip_youtube_dl=False, simpler404=False, screenshot_full_page=False,
|
||||
page_timeout=300, behavior_timeout=900, extract_outlinks_timeout=60,
|
||||
download_throughput=-1):
|
||||
download_throughput=-1, stealth=False):
|
||||
self._frontier = frontier
|
||||
self._service_registry = service_registry
|
||||
self._max_browsers = max_browsers
|
||||
@ -71,6 +71,7 @@ class BrozzlerWorker:
|
||||
self._behavior_timeout = behavior_timeout
|
||||
self._extract_outlinks_timeout = extract_outlinks_timeout
|
||||
self._download_throughput = download_throughput
|
||||
self._stealth = stealth
|
||||
|
||||
self._browser_pool = brozzler.browser.BrowserPool(
|
||||
max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True)
|
||||
@ -311,7 +312,8 @@ class BrozzlerWorker:
|
||||
page_timeout=self._page_timeout,
|
||||
behavior_timeout=self._behavior_timeout,
|
||||
extract_outlinks_timeout=self._extract_outlinks_timeout,
|
||||
download_throughput=self._download_throughput)
|
||||
download_throughput=self._download_throughput,
|
||||
stealth=self._stealth)
|
||||
if final_page_url != page.url:
|
||||
page.note_redirect(final_page_url)
|
||||
return outlinks
|
||||
|
Loading…
x
Reference in New Issue
Block a user