mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
Merge branch 'master' of github.com:internetarchive/brozzler
This commit is contained in:
commit
ddc808710b
@ -340,6 +340,9 @@ def brozzler_worker(argv=None):
|
||||
arg_parser.add_argument(
|
||||
'--skip-youtube-dl', dest='skip_youtube_dl',
|
||||
action='store_true', help=argparse.SUPPRESS)
|
||||
arg_parser.add_argument(
|
||||
'--stealth', dest='stealth', action='store_true',
|
||||
help='Try to avoid web bot detection')
|
||||
add_common_options(arg_parser, argv)
|
||||
|
||||
args = arg_parser.parse_args(args=argv[1:])
|
||||
@ -376,7 +379,8 @@ def brozzler_worker(argv=None):
|
||||
warcprox_auto=args.warcprox_auto,
|
||||
skip_extract_outlinks=args.skip_extract_outlinks,
|
||||
skip_visit_hashtags=args.skip_visit_hashtags,
|
||||
skip_youtube_dl=args.skip_youtube_dl)
|
||||
skip_youtube_dl=args.skip_youtube_dl,
|
||||
stealth=args.stealth)
|
||||
|
||||
signal.signal(signal.SIGQUIT, dump_state)
|
||||
signal.signal(signal.SIGTERM, lambda s,f: worker.stop())
|
||||
|
@ -14,6 +14,34 @@ WebGLRenderingContext.prototype.getParameter = function(origFn) {
|
||||
};
|
||||
}(WebGLRenderingContext.prototype.getParameter);
|
||||
|
||||
// This is `Linux x86_64` on Linux.
|
||||
Object.defineProperty(navigator, 'platform', {
|
||||
get: () => 'Win32'
|
||||
});
|
||||
|
||||
// Randomize navigator.deviceMemory and navigator.hardwareConcurrency to evade
|
||||
// browser fingerprinting.
|
||||
function getRandomInt(min, max) {
|
||||
min = Math.ceil(min);
|
||||
max = Math.floor(max);
|
||||
return Math.floor(Math.random() * (max - min) + min); //The maximum is exclusive and the minimum is inclusive
|
||||
}
|
||||
|
||||
Object.defineProperty(navigator, 'deviceMemory', {
|
||||
get: () => getRandomInt(4, 32)
|
||||
});
|
||||
Object.defineProperty(navigator, 'hardwareConcurrency', {
|
||||
get: () => getRandomInt(4, 32)
|
||||
});
|
||||
|
||||
// Brozzler runs chrome with --disable-notifications which disables `window.Notification`.
|
||||
// This object is used for web bot detection and should be there.
|
||||
if (!window.Notification) {
|
||||
window.Notification = {
|
||||
permission: 'denied'
|
||||
}
|
||||
}
|
||||
|
||||
// TODO Add many more feature detection evasions here. For example:
|
||||
// Mock navigator.permissions.query. In headful on secure origins the
|
||||
// permission should be "default", not "denied".
|
||||
|
@ -2,7 +2,7 @@
|
||||
brozzler/models.py - model classes representing jobs, sites, and pages, with
|
||||
related logic
|
||||
|
||||
Copyright (C) 2014-2019 Internet Archive
|
||||
Copyright (C) 2014-2022 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@ -18,7 +18,9 @@ limitations under the License.
|
||||
'''
|
||||
|
||||
import brozzler
|
||||
import base64
|
||||
import cerberus
|
||||
import copy
|
||||
import datetime
|
||||
import doublethink
|
||||
import hashlib
|
||||
@ -31,6 +33,7 @@ import urlcanon
|
||||
import urllib
|
||||
import uuid
|
||||
import yaml
|
||||
import zlib
|
||||
from typing import Optional
|
||||
|
||||
def load_schema():
|
||||
@ -275,17 +278,19 @@ class Site(doublethink.Document, ElapsedMixIn):
|
||||
def extra_headers(self, page: Optional["Page"] = None):
|
||||
hdrs = {}
|
||||
if self.warcprox_meta:
|
||||
temp_warcprox_meta = copy.deepcopy(self.warcprox_meta)
|
||||
if "blocks" in self.warcprox_meta:
|
||||
# delete temp_warcprox_meta's 'blocks' (they may be big!)
|
||||
del temp_warcprox_meta['blocks']
|
||||
# str-ify blocks
|
||||
blocks_str = json.dumps(self.warcprox_meta['blocks'], separators=(',', ':'))
|
||||
# encode(), compress, b64encode, decode()
|
||||
temp_warcprox_meta['compressed_blocks'] = base64.b64encode(zlib.compress(blocks_str.encode())).decode()
|
||||
if page is not None:
|
||||
self.warcprox_meta["metadata"]["hop_path"] = page.hop_path
|
||||
self.warcprox_meta["metadata"]["brozzled_url"] = page.url
|
||||
self.warcprox_meta["metadata"]["hop_via_url"] = page.via_page_url
|
||||
warcprox_meta_json = json.dumps(self.warcprox_meta, separators=(',', ':'))
|
||||
del self.warcprox_meta["metadata"]["hop_path"]
|
||||
del self.warcprox_meta["metadata"]["brozzled_url"]
|
||||
del self.warcprox_meta["metadata"]["hop_via_url"]
|
||||
else:
|
||||
warcprox_meta_json= json.dumps(self.warcprox_meta, separators=(',', ':'))
|
||||
hdrs["Warcprox-Meta"] = warcprox_meta_json
|
||||
temp_warcprox_meta["metadata"]["hop_path"] = page.hop_path
|
||||
temp_warcprox_meta["metadata"]["brozzled_url"] = page.url
|
||||
temp_warcprox_meta["metadata"]["hop_via_url"] = page.via_page_url
|
||||
hdrs["Warcprox-Meta"] = json.dumps(temp_warcprox_meta, separators=(',', ':'))
|
||||
return hdrs
|
||||
|
||||
def accept_reject_or_neither(self, url, parent_page=None):
|
||||
|
Loading…
x
Reference in New Issue
Block a user