Add a --stealth option to brozzler-worker and extend the stealth script.

Summary of the changes carried by this patch:

  * brozzler/cli.py: add a `--stealth` store_true argument ("Try to avoid web
    bot detection") and pass `stealth=args.stealth` to the worker.
  * brozzler/js-templates/stealth.js: override navigator.platform with 'Win32',
    return random integers in [4, 32) for navigator.deviceMemory and
    navigator.hardwareConcurrency, and define a stub window.Notification
    ({permission: 'denied'}) when it is missing.
  * brozzler/model.py: Site.extra_headers() now builds the Warcprox-Meta header
    from a deep copy of self.warcprox_meta instead of mutating it in place; a
    'blocks' entry is replaced by 'compressed_blocks' (JSON -> zlib ->
    base64). Adds the base64, copy and zlib imports and bumps the copyright
    year.
  * setup.py: bump version 1.5.29 -> 1.5.32.

NOTE(review): the deviceMemory/hardwareConcurrency getters call getRandomInt
on every access, so two reads of the same property can return different
values.
NOTE(review): extra_headers() deep-copies the whole warcprox_meta, including
'blocks', before deleting 'blocks' from the copy.

diff --git a/brozzler/cli.py b/brozzler/cli.py
index e3bc93c..c146a48 100755
--- a/brozzler/cli.py
+++ b/brozzler/cli.py
@@ -340,6 +340,9 @@ def brozzler_worker(argv=None):
     arg_parser.add_argument(
             '--skip-youtube-dl', dest='skip_youtube_dl',
             action='store_true', help=argparse.SUPPRESS)
+    arg_parser.add_argument(
+            '--stealth', dest='stealth', action='store_true',
+            help='Try to avoid web bot detection')
     add_common_options(arg_parser, argv)
 
     args = arg_parser.parse_args(args=argv[1:])
@@ -376,7 +379,8 @@ def brozzler_worker(argv=None):
             warcprox_auto=args.warcprox_auto,
             skip_extract_outlinks=args.skip_extract_outlinks,
             skip_visit_hashtags=args.skip_visit_hashtags,
-            skip_youtube_dl=args.skip_youtube_dl)
+            skip_youtube_dl=args.skip_youtube_dl,
+            stealth=args.stealth)
 
     signal.signal(signal.SIGQUIT, dump_state)
     signal.signal(signal.SIGTERM, lambda s,f: worker.stop())
diff --git a/brozzler/js-templates/stealth.js b/brozzler/js-templates/stealth.js
index dee3882..4835d2d 100644
--- a/brozzler/js-templates/stealth.js
+++ b/brozzler/js-templates/stealth.js
@@ -14,6 +14,34 @@ WebGLRenderingContext.prototype.getParameter = function(origFn) {
   };
 }(WebGLRenderingContext.prototype.getParameter);
 
+// This is `Linux x86_64` on Linux.
+Object.defineProperty(navigator, 'platform', {
+  get: () => 'Win32'
+});
+
+// Randomize navigator.deviceMemory and navigator.hardwareConcurrency to evade
+// browser fingerprinting.
+function getRandomInt(min, max) {
+  min = Math.ceil(min);
+  max = Math.floor(max);
+  return Math.floor(Math.random() * (max - min) + min); //The maximum is exclusive and the minimum is inclusive
+}
+
+Object.defineProperty(navigator, 'deviceMemory', {
+  get: () => getRandomInt(4, 32)
+});
+Object.defineProperty(navigator, 'hardwareConcurrency', {
+  get: () => getRandomInt(4, 32)
+});
+
+// Brozzler runs chrome with --disable-notifications which disables `window.Notification`.
+// This object is used for web bot detection and should be there.
+if (!window.Notification) {
+  window.Notification = {
+    permission: 'denied'
+  }
+}
+
 // TODO Add many more feature detection evations here. For example:
 // Mock navigator.permissions.query. In headful on secure origins the
 // permission should be "default", not "denied".
diff --git a/brozzler/model.py b/brozzler/model.py
index 689b268..8e35d0a 100644
--- a/brozzler/model.py
+++ b/brozzler/model.py
@@ -2,7 +2,7 @@
 brozzler/models.py - model classes representing jobs, sites, and pages, with
 related logic
 
-Copyright (C) 2014-2019 Internet Archive
+Copyright (C) 2014-2022 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -18,7 +18,9 @@ limitations under the License.
 '''
 
 import brozzler
+import base64
 import cerberus
+import copy
 import datetime
 import doublethink
 import hashlib
@@ -31,6 +33,7 @@ import urlcanon
 import urllib
 import uuid
 import yaml
+import zlib
 from typing import Optional
 
 def load_schema():
@@ -275,17 +278,19 @@ class Site(doublethink.Document, ElapsedMixIn):
     def extra_headers(self, page: Optional["Page"] = None):
         hdrs = {}
         if self.warcprox_meta:
+            temp_warcprox_meta = copy.deepcopy(self.warcprox_meta)
+            if "blocks" in self.warcprox_meta:
+                # delete temp_warcprox_meta's 'blocks' (they may be big!)
+                del temp_warcprox_meta['blocks']
+                # str-ify blocks
+                blocks_str = json.dumps(self.warcprox_meta['blocks'], separators=(',', ':'))
+                # encode(), compress, b64encode, decode()
+                temp_warcprox_meta['compressed_blocks'] = base64.b64encode(zlib.compress(blocks_str.encode())).decode()
             if page is not None:
-                self.warcprox_meta["metadata"]["hop_path"] = page.hop_path
-                self.warcprox_meta["metadata"]["brozzled_url"] = page.url
-                self.warcprox_meta["metadata"]["hop_via_url"] = page.via_page_url
-                warcprox_meta_json = json.dumps(self.warcprox_meta, separators=(',', ':'))
-                del self.warcprox_meta["metadata"]["hop_path"]
-                del self.warcprox_meta["metadata"]["brozzled_url"]
-                del self.warcprox_meta["metadata"]["hop_via_url"]
-            else:
-                warcprox_meta_json= json.dumps(self.warcprox_meta, separators=(',', ':'))
-            hdrs["Warcprox-Meta"] = warcprox_meta_json
+                temp_warcprox_meta["metadata"]["hop_path"] = page.hop_path
+                temp_warcprox_meta["metadata"]["brozzled_url"] = page.url
+                temp_warcprox_meta["metadata"]["hop_via_url"] = page.via_page_url
+            hdrs["Warcprox-Meta"] = json.dumps(temp_warcprox_meta, separators=(',', ':'))
         return hdrs
 
     def accept_reject_or_neither(self, url, parent_page=None):
diff --git a/setup.py b/setup.py
index 0619b97..5119ff1 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.5.29',
+        version='1.5.32',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',