Merge pull request #246 from vbanos/stealth

Looks good, thank you, @vbanos!
This commit is contained in:
Barbara Miller 2022-06-20 13:43:25 -07:00 committed by GitHub
commit fe0aaa1ff6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 43 additions and 6 deletions

View File

@ -425,7 +425,7 @@ class Browser:
screenshot_full_page=False, skip_extract_outlinks=False,
skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False,
page_timeout=300, behavior_timeout=900,
extract_outlinks_timeout=60, download_throughput=-1):
extract_outlinks_timeout=60, download_throughput=-1, stealth=False):
'''
Browses page in browser.
@ -491,7 +491,8 @@ class Browser:
self.configure_browser(
extra_headers=extra_headers,
user_agent=user_agent,
download_throughput=download_throughput)
download_throughput=download_throughput,
stealth=stealth)
self.navigate_to_page(page_url, timeout=page_timeout)
if password:
self.try_login(username, password, timeout=page_timeout)
@ -577,7 +578,8 @@ class Browser:
# run behavior again with short timeout?
# retrieve outlinks again and append to list?
def configure_browser(self, extra_headers=None, user_agent=None, download_throughput=-1):
def configure_browser(self, extra_headers=None, user_agent=None,
download_throughput=-1, stealth=False):
headers = extra_headers or {}
headers['Accept-Encoding'] = 'gzip' # avoid encodings br, sdch
self.websock_thread.expect_result(self._command_id.peek())
@ -596,6 +598,16 @@ class Browser:
# parameter value as bytes/second, or -1 to disable (default)
msg_id = self.send_to_chrome(method='Network.emulateNetworkConditions',
params={'downloadThroughput': download_throughput})
if stealth:
self.websock_thread.expect_result(self._command_id.peek())
js = brozzler.jinja2_environment().get_template('stealth.js').render()
msg_id = self.send_to_chrome(
method='Page.addScriptToEvaluateOnNewDocument',
params={'source': js})
self._wait_for(
lambda: self.websock_thread.received_result(msg_id),
timeout=10)
def navigate_to_page(self, page_url, timeout=300):
self.logger.info('navigating to page %s', page_url)

View File

@ -156,6 +156,9 @@ def brozzle_page(argv=None):
arg_parser.add_argument(
'--browser_throughput', type=int, dest='download_throughput', default=-1,
help='Chrome DevTools downloadThroughput for Network.emulateNetworkConditions')
arg_parser.add_argument(
'--stealth', dest='stealth', action='store_true',
help='Try to avoid web bot detection')
arg_parser.add_argument(
'--screenshot-full-page', dest='screenshot_full_page',
action='store_true')
@ -189,7 +192,8 @@ def brozzle_page(argv=None):
skip_youtube_dl=args.skip_youtube_dl,
simpler404=args.simpler404,
screenshot_full_page=args.screenshot_full_page,
download_throughput=args.download_throughput)
download_throughput=args.download_throughput,
stealth=args.stealth)
def on_screenshot(screenshot_jpeg):
OK_CHARS = string.ascii_letters + string.digits

View File

@ -0,0 +1,19 @@
/**
* Mock GPU information with real values. Check using: https://bot.sannysoft.com/
*/
WebGLRenderingContext.prototype.getParameter = function(origFn) {
  // Spoofed answers, keyed by the GLenum value passed to getParameter().
  // The map is created without a prototype so that inherited
  // Object.prototype members ("toString", "constructor", "__proto__", ...)
  // can never be mistaken for a spoofed entry; with a plain object literal,
  // getParameter("toString") would return a function instead of deferring
  // to the browser, which is both wrong and detectable.
  const paramMap = Object.create(null);
  paramMap[0x9245] = "Google Inc. (NVIDIA Corporation)"; // UNMASKED_VENDOR_WEBGL
  paramMap[0x9246] = "ANGLE (NVIDIA Corporation, Quadro P400/PCIe/SSE2, OpenGL 4.5.0)"; // UNMASKED_RENDERER_WEBGL
  paramMap[0x1F00] = "WebKit"; // VENDOR
  paramMap[0x1F01] = "WebKit WebGL"; // RENDERER
  paramMap[0x1F02] = "WebGL 1.0 (OpenGL ES 2.0 Chromium)"; // VERSION
  return function(parameter) {
    // Every parameter not listed above is answered by the original
    // implementation, invoked with the caller's `this` (the real context).
    return paramMap[parameter] || origFn.call(this, parameter);
  };
}(WebGLRenderingContext.prototype.getParameter);
// TODO: Add many more feature-detection evasions here. For example:
// Mock navigator.permissions.query. In headful mode on secure origins the
// permission should be "default", not "denied".

View File

@ -53,7 +53,7 @@ class BrozzlerWorker:
skip_extract_outlinks=False, skip_visit_hashtags=False,
skip_youtube_dl=False, simpler404=False, screenshot_full_page=False,
page_timeout=300, behavior_timeout=900, extract_outlinks_timeout=60,
download_throughput=-1):
download_throughput=-1, stealth=False):
self._frontier = frontier
self._service_registry = service_registry
self._max_browsers = max_browsers
@ -71,6 +71,7 @@ class BrozzlerWorker:
self._behavior_timeout = behavior_timeout
self._extract_outlinks_timeout = extract_outlinks_timeout
self._download_throughput = download_throughput
self._stealth = stealth
self._browser_pool = brozzler.browser.BrowserPool(
max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True)
@ -311,7 +312,8 @@ class BrozzlerWorker:
page_timeout=self._page_timeout,
behavior_timeout=self._behavior_timeout,
extract_outlinks_timeout=self._extract_outlinks_timeout,
download_throughput=self._download_throughput)
download_throughput=self._download_throughput,
stealth=self._stealth)
if final_page_url != page.url:
page.note_redirect(final_page_url)
return outlinks