mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
brozzle-worker options --proxy-server=host:port and --ignore-certificate-errors (for use with warcprox)
This commit is contained in:
parent
b0f3b8a5e3
commit
ddd764cac5
@ -22,6 +22,10 @@ arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromi
|
|||||||
help='executable to use to invoke chrome')
|
help='executable to use to invoke chrome')
|
||||||
arg_parser.add_argument('-n', '--max-browsers', dest='max_browsers', default='1',
|
arg_parser.add_argument('-n', '--max-browsers', dest='max_browsers', default='1',
|
||||||
help='max number of chrome instances simultaneously browsing pages')
|
help='max number of chrome instances simultaneously browsing pages')
|
||||||
|
arg_parser.add_argument('--proxy-server', dest='proxy_server', default=None,
|
||||||
|
help='configure browser to use specified proxy server')
|
||||||
|
arg_parser.add_argument('--ignore-certificate-errors', dest='ignore_cert_errors',
|
||||||
|
action='store_true', help='configure browser to ignore certificate errors')
|
||||||
arg_parser.add_argument('-v', '--verbose', dest='log_level',
|
arg_parser.add_argument('-v', '--verbose', dest='log_level',
|
||||||
action="store_const", default=logging.INFO, const=logging.DEBUG)
|
action="store_const", default=logging.INFO, const=logging.DEBUG)
|
||||||
arg_parser.add_argument('--version', action='version',
|
arg_parser.add_argument('--version', action='version',
|
||||||
@ -54,7 +58,8 @@ def completed(site, crawl_url):
|
|||||||
q.put(crawl_url.to_dict())
|
q.put(crawl_url.to_dict())
|
||||||
|
|
||||||
def brozzle_site(site, chrome_port):
|
def brozzle_site(site, chrome_port):
|
||||||
with umbra.Browser(chrome_port=chrome_port, chrome_exe=args.chrome_exe) as browser:
|
with umbra.Browser(chrome_port=chrome_port, chrome_exe=args.chrome_exe,
|
||||||
|
proxy_server=args.proxy_server, ignore_cert_errors=args.ignore_cert_errors) as browser:
|
||||||
with browsers_lock:
|
with browsers_lock:
|
||||||
browsers.add(browser)
|
browsers.add(browser)
|
||||||
try:
|
try:
|
||||||
|
@ -66,10 +66,12 @@ class Browser:
|
|||||||
|
|
||||||
HARD_TIMEOUT_SECONDS = 20 * 60
|
HARD_TIMEOUT_SECONDS = 20 * 60
|
||||||
|
|
||||||
def __init__(self, chrome_port=9222, chrome_exe='chromium-browser'):
|
def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', proxy_server=None, ignore_cert_errors=False):
|
||||||
self.command_id = itertools.count(1)
|
self.command_id = itertools.count(1)
|
||||||
self.chrome_port = chrome_port
|
self.chrome_port = chrome_port
|
||||||
self.chrome_exe = chrome_exe
|
self.chrome_exe = chrome_exe
|
||||||
|
self.proxy_server = proxy_server
|
||||||
|
self.ignore_cert_errors = ignore_cert_errors
|
||||||
self._behavior = None
|
self._behavior = None
|
||||||
self._websock = None
|
self._websock = None
|
||||||
self._abort_browse_page = False
|
self._abort_browse_page = False
|
||||||
@ -88,8 +90,12 @@ class Browser:
|
|||||||
def start(self):
|
def start(self):
|
||||||
# these can raise exceptions
|
# these can raise exceptions
|
||||||
self._work_dir = tempfile.TemporaryDirectory()
|
self._work_dir = tempfile.TemporaryDirectory()
|
||||||
self._chrome_instance = Chrome(self.chrome_port, self.chrome_exe,
|
self._chrome_instance = Chrome(port=self.chrome_port,
|
||||||
self._work_dir.name, os.sep.join([self._work_dir.name, "chrome-user-data"]))
|
executable=self.chrome_exe,
|
||||||
|
user_home_dir=self._work_dir.name,
|
||||||
|
user_data_dir=os.sep.join([self._work_dir.name, "chrome-user-data"]),
|
||||||
|
proxy_server=self.proxy_server,
|
||||||
|
ignore_cert_errors=self.ignore_cert_errors)
|
||||||
self._websocket_url = self._chrome_instance.start()
|
self._websocket_url = self._chrome_instance.start()
|
||||||
|
|
||||||
def stop(self):
|
def stop(self):
|
||||||
@ -243,11 +249,13 @@ class Browser:
|
|||||||
class Chrome:
|
class Chrome:
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||||
|
|
||||||
def __init__(self, port, executable, user_home_dir, user_data_dir):
|
def __init__(self, port, executable, user_home_dir, user_data_dir, proxy_server=None, ignore_cert_errors=False):
|
||||||
self.port = port
|
self.port = port
|
||||||
self.executable = executable
|
self.executable = executable
|
||||||
self.user_home_dir = user_home_dir
|
self.user_home_dir = user_home_dir
|
||||||
self.user_data_dir = user_data_dir
|
self.user_data_dir = user_data_dir
|
||||||
|
self.proxy_server = proxy_server
|
||||||
|
self.ignore_cert_errors = ignore_cert_errors
|
||||||
|
|
||||||
# returns websocket url to chrome window with about:blank loaded
|
# returns websocket url to chrome window with about:blank loaded
|
||||||
def __enter__(self):
|
def __enter__(self):
|
||||||
@ -269,9 +277,13 @@ class Chrome:
|
|||||||
"--window-size=1100,900", "--no-default-browser-check",
|
"--window-size=1100,900", "--no-default-browser-check",
|
||||||
"--disable-first-run-ui", "--no-first-run",
|
"--disable-first-run-ui", "--no-first-run",
|
||||||
"--homepage=about:blank", "--disable-direct-npapi-requests",
|
"--homepage=about:blank", "--disable-direct-npapi-requests",
|
||||||
"--disable-web-security",
|
"--disable-web-security"]
|
||||||
"about:blank"]
|
if self.ignore_cert_errors:
|
||||||
self.logger.info("running {}".format(chrome_args))
|
chrome_args.append("--ignore-certificate-errors")
|
||||||
|
if self.proxy_server:
|
||||||
|
chrome_args.append("--proxy-server={}".format(self.proxy_server))
|
||||||
|
chrome_args.append("about:blank")
|
||||||
|
self.logger.info("running: {}".format(" ".join(chrome_args)))
|
||||||
self.chrome_process = subprocess.Popen(chrome_args, env=new_env, start_new_session=True)
|
self.chrome_process = subprocess.Popen(chrome_args, env=new_env, start_new_session=True)
|
||||||
self.logger.info("chrome running, pid {}".format(self.chrome_process.pid))
|
self.logger.info("chrome running, pid {}".format(self.chrome_process.pid))
|
||||||
self._start = time.time() # member variable just so that kill -QUIT reports it
|
self._start = time.time() # member variable just so that kill -QUIT reports it
|
||||||
|
Loading…
x
Reference in New Issue
Block a user