This commit is contained in:
Anderson Martínez 2022-11-28 05:16:06 -08:00 committed by GitHub
commit e4ddb79a25
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 103 additions and 48 deletions

1
.gitignore vendored
View file

@ -2,3 +2,4 @@
*.diff
.*.sw*
/brozzler.egg-info/
venv

View file

@ -288,20 +288,23 @@ class Browser:
'''
logger = logging.getLogger(__module__ + '.' + __qualname__)
def __init__(self, **kwargs):
def __init__(self, chrome_exe, browserless_port, **kwargs):
'''
Initializes the Browser.
Args:
**kwargs: arguments for Chrome(...)
'''
self.chrome = Chrome(**kwargs)
self.websock_url = None
self.websock = None
self.websock_thread = None
self.is_browsing = False
self._command_id = Counter()
self._wait_interval = 0.5
self.browse_port = browserless_port
self.is_browserless = chrome_exe == 'browserless'
self.chrome = Chrome(chrome_exe=chrome_exe, browserless_port=browserless_port,
is_browserless=self.is_browserless, **kwargs)
self._command_id = Counter()
def __enter__(self):
self.start()
@ -343,6 +346,14 @@ class Browser:
**kwargs: arguments for self.chrome.start(...)
'''
if not self.is_running():
if self.is_browserless:
# Open a ws to create a browser on demand
args = self.chrome._browserless_args()
self.browserless_ws = websocket.create_connection(
"ws://localhost:" + str(self.browse_port) + "?" + args
)
self.websock_url = self.chrome.start(**kwargs)
self.websock = websocket.WebSocketApp(self.websock_url)
self.websock_thread = WebsockReceiverThread(

View file

@ -30,6 +30,7 @@ import sqlite3
import json
import tempfile
import sys
import functools
def check_version(chrome_exe):
'''
@ -62,7 +63,8 @@ def check_version(chrome_exe):
class Chrome:
logger = logging.getLogger(__module__ + '.' + __qualname__)
def __init__(self, chrome_exe, port=9222, ignore_cert_errors=False):
def __init__(self, chrome_exe, browserless_port, is_browserless,
port=9222, ignore_cert_errors=False):
'''
Initializes instance of this class.
@ -74,15 +76,25 @@ class Chrome:
ignore_cert_errors: configure chrome to accept all certs (default
False)
'''
if chrome_exe == 'browserless':
# init browserless here maybe
pass
self.is_browserless = is_browserless
self.browserless_port = browserless_port
if self.is_browserless:
# browserless isn't attached to a PID
self.chrome_exe = None
self.port = None
else:
# use a local browser
self.port = port
self.chrome_exe = chrome_exe
self.ignore_cert_errors = ignore_cert_errors
self._shutdown = threading.Event()
self._home_tmpdir = tempfile.TemporaryDirectory()
self._chrome_user_data_dir = os.path.join(
self._home_tmpdir.name, 'chrome-user-data')
self.chrome_process = None
def __enter__(self):
@ -139,38 +151,11 @@ class Chrome:
cookie_location, exc_info=True)
return cookie_db
def start(self, proxy=None, cookie_db=None, disk_cache_dir=None,
disk_cache_size=None, websocket_timeout=60):
'''
Starts chrome/chromium process.
Args:
proxy: http proxy 'host:port' (default None)
cookie_db: raw bytes of chrome/chromium sqlite3 cookies database,
which, if supplied, will be written to
{chrome_user_data_dir}/Default/Cookies before running the
browser (default None)
disk_cache_dir: use directory for disk cache. The default location
is inside `self._home_tmpdir` (default None).
disk_cache_size: Forces the maximum disk space to be used by the disk
cache, in bytes. (default None)
websocket_timeout: websocket timeout, in seconds
Returns:
websocket url to chrome window with about:blank loaded
'''
# these can raise exceptions
self._home_tmpdir = tempfile.TemporaryDirectory()
self._chrome_user_data_dir = os.path.join(
self._home_tmpdir.name, 'chrome-user-data')
if cookie_db:
self._init_cookie_db(cookie_db)
self._shutdown.clear()
new_env = os.environ.copy()
new_env['HOME'] = self._home_tmpdir.name
def _chrome_args(self, disk_cache_dir=None, disk_cache_size=None,
proxy=None):
chrome_args = [
self.chrome_exe,
'--remote-debugging-port=%s' % self.port,
'--remote-debugging-port=%s' % self.port or self.browserless_port,
'--use-mock-keychain', # mac thing
'--user-data-dir=%s' % self._chrome_user_data_dir,
'--disable-background-networking', '--disable-breakpad',
@ -196,6 +181,42 @@ class Chrome:
if proxy:
chrome_args.append('--proxy-server=%s' % proxy)
chrome_args.append('about:blank')
return chrome_args
def start(self, proxy=None, cookie_db=None,
disk_cache_dir=None, disk_cache_size=None, websocket_timeout=60):
'''
Starts chrome/chromium process.
Args:
proxy: http proxy 'host:port' (default None)
cookie_db: raw bytes of chrome/chromium sqlite3 cookies database,
which, if supplied, will be written to
{chrome_user_data_dir}/Default/Cookies before running the
browser (default None)
disk_cache_dir: use directory for disk cache. The default location
is inside `self._home_tmpdir` (default None).
disk_cache_size: Forces the maximum disk space to be used by the disk
cache, in bytes. (default None)
websocket_timeout: websocket timeout, in seconds
Returns:
websocket url to chrome window with about:blank loaded
'''
# these can raise exceptions
if cookie_db:
self._init_cookie_db(cookie_db)
self._shutdown.clear()
new_env = os.environ.copy()
new_env['HOME'] = self._home_tmpdir.name
chrome_args = self._chrome_args(disk_cache_dir=disk_cache_dir, disk_cache_size=disk_cache_size,
proxy=proxy)
if self.is_browserless:
return self.start_browserless()
self.logger.info('running: %r', subprocess.list2cmdline(chrome_args))
# start_new_session - new process group so we can kill the whole group
self.chrome_process = subprocess.Popen(
@ -209,7 +230,25 @@ class Chrome:
return self._websocket_url(timeout_sec=websocket_timeout)
def _websocket_url(self, timeout_sec = 60):
def _browserless_args(self):
chrome_args = self._chrome_args()
chrome_args.pop(0)
chrome_args.pop(0)
return functools.reduce(lambda a, b: a + "&" + b, chrome_args)
def start_browserless(self):
    '''
    Looks up the browser session exposed by the Browserless instance.

    Fetches the session list from the `/sessions` endpoint on
    `self.browserless_port` and uses the first entry. As a side effect,
    sets `self.port` to the port reported for that session.

    Returns:
        websocket debugger url of the first session

    Raises:
        IndexError: if Browserless reports no sessions
    '''
    json_url = 'http://localhost:%s/sessions' % self.browserless_port
    # close the http response deterministically instead of leaking it
    with urllib.request.urlopen(json_url, timeout=30) as response:
        sessions = json.loads(response.read())
    if not sessions:
        # same exception type that indexing the empty list would raise,
        # but with a message that says what actually went wrong
        raise IndexError('no browserless sessions found at %s' % json_url)
    session = sessions[0]
    ws_url = session['webSocketDebuggerUrl']
    self.logger.info(
        'got chrome websocket debug url %s from Browserless at %s',
        ws_url, json_url)
    self.port = session['port']
    return ws_url
def _websocket_url(self, timeout_sec=60):
json_url = 'http://localhost:%s/json' % self.port
# make this a member variable so that kill -QUIT reports it
self._start = time.time()

View file

@ -174,6 +174,9 @@ def brozzle_page(argv=None):
'--skip-browserless', dest='skip_browserless', action='store_true')
arg_parser.add_argument(
'--simpler404', dest='simpler404', action='store_true')
arg_parser.add_argument(
'--browserless-port', dest='browserless_port', default='3000',
help='port on which the browserless instance is')
add_common_options(arg_parser, argv)
args = arg_parser.parse_args(args=argv[1:])
@ -210,7 +213,8 @@ def brozzle_page(argv=None):
f.write(screenshot_jpeg)
logging.info('wrote screenshot to %s', filename)
browser = brozzler.Browser(chrome_exe=args.chrome_exe)
browser = brozzler.Browser(chrome_exe=args.chrome_exe,
browserless_port=args.browserless_port)
try:
browser.start(proxy=args.proxy)
outlinks = worker.brozzle_page(