mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-08-06 13:34:31 -04:00
Merge branch 'master' of github.com:internetarchive/brozzler
This commit is contained in:
commit
ddc808710b
4 changed files with 50 additions and 13 deletions
|
@ -340,6 +340,9 @@ def brozzler_worker(argv=None):
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'--skip-youtube-dl', dest='skip_youtube_dl',
|
'--skip-youtube-dl', dest='skip_youtube_dl',
|
||||||
action='store_true', help=argparse.SUPPRESS)
|
action='store_true', help=argparse.SUPPRESS)
|
||||||
|
arg_parser.add_argument(
|
||||||
|
'--stealth', dest='stealth', action='store_true',
|
||||||
|
help='Try to avoid web bot detection')
|
||||||
add_common_options(arg_parser, argv)
|
add_common_options(arg_parser, argv)
|
||||||
|
|
||||||
args = arg_parser.parse_args(args=argv[1:])
|
args = arg_parser.parse_args(args=argv[1:])
|
||||||
|
@ -376,7 +379,8 @@ def brozzler_worker(argv=None):
|
||||||
warcprox_auto=args.warcprox_auto,
|
warcprox_auto=args.warcprox_auto,
|
||||||
skip_extract_outlinks=args.skip_extract_outlinks,
|
skip_extract_outlinks=args.skip_extract_outlinks,
|
||||||
skip_visit_hashtags=args.skip_visit_hashtags,
|
skip_visit_hashtags=args.skip_visit_hashtags,
|
||||||
skip_youtube_dl=args.skip_youtube_dl)
|
skip_youtube_dl=args.skip_youtube_dl,
|
||||||
|
stealth=args.stealth)
|
||||||
|
|
||||||
signal.signal(signal.SIGQUIT, dump_state)
|
signal.signal(signal.SIGQUIT, dump_state)
|
||||||
signal.signal(signal.SIGTERM, lambda s,f: worker.stop())
|
signal.signal(signal.SIGTERM, lambda s,f: worker.stop())
|
||||||
|
|
|
@ -14,6 +14,34 @@ WebGLRenderingContext.prototype.getParameter = function(origFn) {
|
||||||
};
|
};
|
||||||
}(WebGLRenderingContext.prototype.getParameter);
|
}(WebGLRenderingContext.prototype.getParameter);
|
||||||
|
|
||||||
|
// navigator.platform is `Linux x86_64` on Linux; report `Win32` instead.
|
||||||
|
Object.defineProperty(navigator, 'platform', {
|
||||||
|
get: () => 'Win32'
|
||||||
|
});
|
||||||
|
|
||||||
|
// Randomize navigator.deviceMemory and navigator.hardwareConcurrency to evade
|
||||||
|
// browser fingerprinting.
|
||||||
|
function getRandomInt(min, max) {
|
||||||
|
min = Math.ceil(min);
|
||||||
|
max = Math.floor(max);
|
||||||
|
return Math.floor(Math.random() * (max - min) + min); //The maximum is exclusive and the minimum is inclusive
|
||||||
|
}
|
||||||
|
|
||||||
|
Object.defineProperty(navigator, 'deviceMemory', {
|
||||||
|
get: () => getRandomInt(4, 32)
|
||||||
|
});
|
||||||
|
Object.defineProperty(navigator, 'hardwareConcurrency', {
|
||||||
|
get: () => getRandomInt(4, 32)
|
||||||
|
});
|
||||||
|
|
||||||
|
// Brozzler runs chrome with --disable-notifications which disables `window.Notification`.
|
||||||
|
// This object is used for web bot detection and should be there.
|
||||||
|
if (!window.Notification) {
|
||||||
|
window.Notification = {
|
||||||
|
permission: 'denied'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// TODO Add many more feature detection evasions here. For example:
|
// TODO Add many more feature detection evasions here. For example:
|
||||||
// Mock navigator.permissions.query. In headful on secure origins the
|
// Mock navigator.permissions.query. In headful on secure origins the
|
||||||
// permission should be "default", not "denied".
|
// permission should be "default", not "denied".
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
brozzler/models.py - model classes representing jobs, sites, and pages, with
|
brozzler/models.py - model classes representing jobs, sites, and pages, with
|
||||||
related logic
|
related logic
|
||||||
|
|
||||||
Copyright (C) 2014-2019 Internet Archive
|
Copyright (C) 2014-2022 Internet Archive
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
|
@ -18,7 +18,9 @@ limitations under the License.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
import brozzler
|
import brozzler
|
||||||
|
import base64
|
||||||
import cerberus
|
import cerberus
|
||||||
|
import copy
|
||||||
import datetime
|
import datetime
|
||||||
import doublethink
|
import doublethink
|
||||||
import hashlib
|
import hashlib
|
||||||
|
@ -31,6 +33,7 @@ import urlcanon
|
||||||
import urllib
|
import urllib
|
||||||
import uuid
|
import uuid
|
||||||
import yaml
|
import yaml
|
||||||
|
import zlib
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
def load_schema():
|
def load_schema():
|
||||||
|
@ -275,17 +278,19 @@ class Site(doublethink.Document, ElapsedMixIn):
|
||||||
def extra_headers(self, page: Optional["Page"] = None):
|
def extra_headers(self, page: Optional["Page"] = None):
|
||||||
hdrs = {}
|
hdrs = {}
|
||||||
if self.warcprox_meta:
|
if self.warcprox_meta:
|
||||||
|
temp_warcprox_meta = copy.deepcopy(self.warcprox_meta)
|
||||||
|
if "blocks" in self.warcprox_meta:
|
||||||
|
# delete temp_warcprox_meta's 'blocks' (they may be big!)
|
||||||
|
del temp_warcprox_meta['blocks']
|
||||||
|
# str-ify blocks
|
||||||
|
blocks_str = json.dumps(self.warcprox_meta['blocks'], separators=(',', ':'))
|
||||||
|
# encode(), compress, b64encode, decode()
|
||||||
|
temp_warcprox_meta['compressed_blocks'] = base64.b64encode(zlib.compress(blocks_str.encode())).decode()
|
||||||
if page is not None:
|
if page is not None:
|
||||||
self.warcprox_meta["metadata"]["hop_path"] = page.hop_path
|
temp_warcprox_meta["metadata"]["hop_path"] = page.hop_path
|
||||||
self.warcprox_meta["metadata"]["brozzled_url"] = page.url
|
temp_warcprox_meta["metadata"]["brozzled_url"] = page.url
|
||||||
self.warcprox_meta["metadata"]["hop_via_url"] = page.via_page_url
|
temp_warcprox_meta["metadata"]["hop_via_url"] = page.via_page_url
|
||||||
warcprox_meta_json = json.dumps(self.warcprox_meta, separators=(',', ':'))
|
hdrs["Warcprox-Meta"] = json.dumps(temp_warcprox_meta, separators=(',', ':'))
|
||||||
del self.warcprox_meta["metadata"]["hop_path"]
|
|
||||||
del self.warcprox_meta["metadata"]["brozzled_url"]
|
|
||||||
del self.warcprox_meta["metadata"]["hop_via_url"]
|
|
||||||
else:
|
|
||||||
warcprox_meta_json= json.dumps(self.warcprox_meta, separators=(',', ':'))
|
|
||||||
hdrs["Warcprox-Meta"] = warcprox_meta_json
|
|
||||||
return hdrs
|
return hdrs
|
||||||
|
|
||||||
def accept_reject_or_neither(self, url, parent_page=None):
|
def accept_reject_or_neither(self, url, parent_page=None):
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.5.29',
|
version='1.5.32',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue