mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
290 lines
12 KiB
Python
290 lines
12 KiB
Python
'''
brozzler/chrome.py - manages the chrome/chromium browser for brozzler

Copyright (C) 2014-2016 Internet Archive

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
|
|
|
|
import logging
|
|
import urllib.request
|
|
import time
|
|
import threading
|
|
import subprocess
|
|
import os
|
|
import brozzler
|
|
import select
|
|
import re
|
|
import signal
|
|
import sqlite3
|
|
import json
|
|
import tempfile
|
|
|
|
class Chrome:
    '''
    Runs and supervises one chrome/chromium process with the remote
    debugging protocol enabled.

    Usable as a context manager: entering calls start() and yields the
    websocket debugger url, exiting calls stop().
    '''
    logger = logging.getLogger(__module__ + '.' + __qualname__)

    # Browser output lines matching this pattern are logged at trace level
    # instead of debug level (see _read_stderr_stdout).
    _NOISE_RE = re.compile(
            b'Xlib: extension|'
            b'CERT_PKIXVerifyCert for [^ ]* failed|'
            b'^ALSA lib|ERROR:gl_surface_glx.cc|'
            b'ERROR:gpu_child_thread.cc')

    def __init__(self, chrome_exe, port=9222, ignore_cert_errors=False):
        '''
        Initializes instance of this class.

        Doesn't start the browser, start() does that.

        Args:
            chrome_exe: filesystem path to chrome/chromium executable
            port: chrome debugging protocol port (default 9222)
            ignore_cert_errors: configure chrome to accept all certs (default
                False)
        '''
        self.port = port
        self.chrome_exe = chrome_exe
        self.ignore_cert_errors = ignore_cert_errors
        # set by stop(), cleared by start(); also tells the output reader
        # thread to exit
        self._shutdown = threading.Event()
        self.chrome_process = None

    def __enter__(self):
        '''
        Returns websocket url to chrome window with about:blank loaded.
        '''
        return self.start()

    def __exit__(self, *args):
        self.stop()

    def _init_cookie_db(self, cookie_db):
        '''
        Writes `cookie_db` to {chrome_user_data_dir}/Default/Cookies.

        Failure to write the file is logged, not raised.

        Args:
            cookie_db: raw bytes of a chrome/chromium sqlite3 cookies
                database
        '''
        cookie_dir = os.path.join(self._chrome_user_data_dir, 'Default')
        cookie_location = os.path.join(cookie_dir, 'Cookies')
        self.logger.debug('cookie DB provided, writing to %s', cookie_location)
        os.makedirs(cookie_dir, exist_ok=True)

        try:
            with open(cookie_location, 'wb') as cookie_file:
                cookie_file.write(cookie_db)
        except OSError:
            self.logger.error(
                    'exception writing cookie file at %s',
                    cookie_location, exc_info=True)

    def persist_and_read_cookie_db(self):
        '''
        Marks every cookie in the browser's cookie database persistent, then
        reads the database file into memory.

        Errors from sqlite and from reading the file are logged, not raised.

        Returns:
            raw bytes of {chrome_user_data_dir}/Default/Cookies, or None if
            the file could not be read
        '''
        cookie_location = os.path.join(
                self._chrome_user_data_dir, 'Default', 'Cookies')
        self.logger.debug(
                'marking cookies persistent then reading file into memory: %s',
                cookie_location)
        try:
            conn = sqlite3.connect(cookie_location)
            try:
                # "with conn" only commits or rolls back the transaction, it
                # does not close the connection, hence the explicit close()
                with conn:
                    conn.execute('UPDATE cookies SET persistent = 1')
            finally:
                conn.close()
        except sqlite3.Error:
            self.logger.error('exception updating cookie DB', exc_info=True)

        cookie_db = None
        try:
            with open(cookie_location, 'rb') as cookie_file:
                cookie_db = cookie_file.read()
        except OSError:
            self.logger.error(
                    'exception reading from cookie DB file %s',
                    cookie_location, exc_info=True)
        return cookie_db

    def start(self, proxy=None, cookie_db=None):
        '''
        Starts chrome/chromium process.

        Args:
            proxy: http proxy 'host:port' (default None)
            cookie_db: raw bytes of chrome/chromium sqlite3 cookies database,
                which, if supplied, will be written to
                {chrome_user_data_dir}/Default/Cookies before running the
                browser (default None)

        Returns:
            websocket url to chrome window with about:blank loaded
        '''
        # these can raise exceptions
        self._home_tmpdir = tempfile.TemporaryDirectory()
        self._chrome_user_data_dir = os.path.join(
                self._home_tmpdir.name, 'chrome-user-data')
        if cookie_db:
            self._init_cookie_db(cookie_db)
        self._shutdown.clear()

        # the browser gets the temporary directory as its HOME
        new_env = os.environ.copy()
        new_env['HOME'] = self._home_tmpdir.name
        chrome_args = [
                self.chrome_exe,
                '--remote-debugging-port=%s' % self.port,
                '--use-mock-keychain',  # mac thing
                '--user-data-dir=%s' % self._chrome_user_data_dir,
                '--disable-web-sockets', '--disable-cache',
                '--window-size=1100,900', '--no-default-browser-check',
                '--disable-first-run-ui', '--no-first-run',
                '--homepage=about:blank', '--disable-direct-npapi-requests',
                '--disable-web-security', '--disable-notifications',
                '--disable-extensions', '--disable-save-password-bubble']
        if self.ignore_cert_errors:
            chrome_args.append('--ignore-certificate-errors')
        if proxy:
            chrome_args.append('--proxy-server=%s' % proxy)
        chrome_args.append('about:blank')
        self.logger.info('running: %r', subprocess.list2cmdline(chrome_args))
        # start_new_session - new process group so we can kill the whole group
        self.chrome_process = subprocess.Popen(
                chrome_args, env=new_env, start_new_session=True,
                stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0)
        self._out_reader_thread = threading.Thread(
                target=self._read_stderr_stdout,
                name='ChromeOutReaderThread:%s' % self.port, daemon=True)
        self._out_reader_thread.start()
        self.logger.info('chrome running, pid %s', self.chrome_process.pid)

        return self._websocket_url()

    def _websocket_url(self):
        '''
        Polls chrome's http://localhost:{port}/json endpoint until it lists
        an about:blank window that has a websocket debugger url.

        Returns:
            the websocket debugger url of the about:blank window

        Raises:
            brozzler.ShutdownRequested: passed through if raised while
                polling
            Exception: if no url was obtained within 600 seconds; the browser
                is stopped before raising
        '''
        timeout_sec = 600
        json_url = 'http://localhost:%s/json' % self.port
        # make this a member variable so that kill -QUIT reports it
        self._start = time.time()
        self._last_warning = self._start
        while True:
            try:
                raw_json = urllib.request.urlopen(json_url, timeout=30).read()
                all_debug_info = json.loads(raw_json.decode('utf-8'))
                debug_info = [x for x in all_debug_info
                              if x['url'] == 'about:blank']

                if debug_info and 'webSocketDebuggerUrl' in debug_info[0]:
                    self.logger.debug('%s returned %s', json_url, raw_json)
                    url = debug_info[0]['webSocketDebuggerUrl']
                    self.logger.info(
                            'got chrome window websocket debug url %s from %s',
                            url, json_url)
                    return url
            except brozzler.ShutdownRequested:
                raise
            except Exception as e:
                # warn at most once every 30 seconds while retrying
                if time.time() - self._last_warning > 30:
                    self.logger.warning(
                            'problem with %s (will keep trying until timeout '
                            'of %d seconds): %s', json_url, timeout_sec, e)
                    self._last_warning = time.time()

            # Reached only when this attempt did not produce a url. Success
            # and ShutdownRequested leave the loop above without sleeping and
            # without being subject to the timeout.
            elapsed = time.time() - self._start
            if elapsed > timeout_sec:
                self.logger.error(
                        'killing chrome, failed to retrieve %s after %.1f '
                        'seconds', json_url, elapsed)
                self.stop()
                raise Exception(
                        'killed chrome, failed to retrieve %s after %s '
                        'seconds' % (json_url, elapsed))
            time.sleep(0.5)

    def _read_stderr_stdout(self):
        '''
        Thread target that logs the browser's stdout and stderr line by line.

        Lines matching _NOISE_RE are logged with self.logger.trace(), all
        others at debug level. Runs until shutdown is signalled or both
        pipes have reached end of file. Any exception is logged, not raised.
        '''
        # NOTE(review): Logger.trace() is not part of the standard logging
        # module; presumably the brozzler package installs it - confirm.

        # XXX select doesn't work on windows
        def readline_nonblock(f):
            '''
            Reads up to and including the next newline from `f` without
            blocking for more than 0.5 seconds at a time.

            Returns (buf, eof): buf is the possibly empty, possibly partial
            line read so far, eof is True if end of file was reached.
            '''
            buf = b''
            while (not self._shutdown.is_set()
                    and not buf.endswith(b'\n')
                    and select.select([f], [], [], 0.5)[0]):
                byte = f.read(1)
                if not byte:
                    # A pipe at end of file is always "readable" and always
                    # returns b''; report it so the caller stops polling
                    # instead of spinning.
                    return buf, True
                buf += byte
            return buf, False

        try:
            pid = self.chrome_process.pid
            streams = [
                    ('STDOUT', self.chrome_process.stdout),
                    ('STDERR', self.chrome_process.stderr)]
            while streams and not self._shutdown.is_set():
                for entry in list(streams):
                    label, f = entry
                    buf, eof = readline_nonblock(f)
                    if buf:
                        if self._NOISE_RE.search(buf):
                            self.logger.trace(
                                    'chrome pid %s %s %s', pid, label, buf)
                        else:
                            self.logger.debug(
                                    'chrome pid %s %s %s', pid, label, buf)
                    if eof:
                        streams.remove(entry)
        except Exception:
            self.logger.error('unexpected exception', exc_info=True)

    def stop(self):
        '''
        Stops the browser, if it is running.

        Sends SIGTERM to the browser's process group and waits up to 300
        seconds for the process to exit, then falls back to SIGKILL. In all
        cases the output reader thread is joined, the stdout/stderr pipes
        are closed and the temporary home directory is deleted.

        Does nothing if the browser was never started or stop() has already
        been called since the last start().
        '''
        if not self.chrome_process or self._shutdown.is_set():
            return
        self._shutdown.set()

        timeout_sec = 300
        pid = self.chrome_process.pid
        if self.chrome_process.poll() is None:
            self.logger.info('terminating chrome pgid %s', pid)
            os.killpg(pid, signal.SIGTERM)
        t0 = time.time()

        try:
            while time.time() - t0 < timeout_sec:
                status = self.chrome_process.poll()
                if status is not None:
                    if status == 0:
                        self.logger.info(
                                'chrome pid %s exited normally', pid)
                    else:
                        self.logger.warning(
                                'chrome pid %s exited with nonzero status %s',
                                pid, status)

                    # XXX I would like to forcefully kill the process group
                    # here to guarantee no orphaned chromium subprocesses hang
                    # around, but there's a chance I suppose that some other
                    # process could have started with the same pgid
                    return
                time.sleep(0.5)

            self.logger.warning(
                    'chrome pid %s still alive %.1f seconds after sending '
                    'SIGTERM, sending SIGKILL', pid, time.time() - t0)
            os.killpg(pid, signal.SIGKILL)
            status = self.chrome_process.wait()
            self.logger.warning(
                    'chrome pid %s reaped (status=%s) after killing with '
                    'SIGKILL', pid, status)
        finally:
            # join the reader before closing the pipes it is reading from
            self._out_reader_thread.join()
            for pipe in (self.chrome_process.stdout,
                         self.chrome_process.stderr):
                if pipe is not None:
                    pipe.close()
            # runs on every exit path, including the early return above
            try:
                self._home_tmpdir.cleanup()
            except Exception:
                self.logger.error(
                        'exception deleting %s', self._home_tmpdir,
                        exc_info=True)
            self.chrome_process = None