split Chrome class into its own module

This commit is contained in:
Noah Levitt 2016-12-06 12:50:38 -08:00
parent 3c43fdaced
commit 74009852d6
3 changed files with 219 additions and 179 deletions

View File

@ -1,6 +1,5 @@
'''
brozzler/browser.py - classes responsible for running web browsers
(chromium/chromium) and browsing web pages in them
brozzler/browser.py - manages the browsers for brozzler
Copyright (C) 2014-2016 Internet Archive
@ -19,23 +18,19 @@ limitations under the License.
import logging
import json
import urllib.request
import itertools
import websocket
import time
import threading
import subprocess
import tempfile
import os
import random
import brozzler
from brozzler.chrome import Chrome
from brozzler.behaviors import Behavior
from requests.structures import CaseInsensitiveDict
import select
import re
import base64
import psutil
import signal
import sqlite3
import datetime
@ -233,7 +228,7 @@ class Browser:
on_request=None, on_response=None, on_screenshot=None,
on_url_change=None):
"""
Synchronously loads a page, takes a screenshot, and runs behaviors.
Synchronously loads a page, runs behaviors, and takes a screenshot.
Raises BrowsingException if browsing the page fails in a non-critical
way.
@ -263,10 +258,10 @@ class Browser:
self._websocket_url, on_open=self._visit_page,
on_message=self._wrap_handle_message)
threadName = "WebsockThread:{}-{:%Y%m%d%H%M%S}".format(
thread_name = "WebsockThread:{}-{:%Y%m%d%H%M%S}".format(
self.chrome_port, datetime.datetime.utcnow())
websock_thread = threading.Thread(
target=self._websock.run_forever, name=threadName,
target=self._websock.run_forever, name=thread_name,
kwargs={'ping_timeout':0.5})
websock_thread.start()
self._start = time.time()
@ -570,171 +565,3 @@ __brzl_compileOutlinks(window).join('\n');
# else:
# self.logger.debug("%s", json_message)
class Chrome:
logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self, port, executable, user_home_dir, user_data_dir, proxy=None, ignore_cert_errors=False):
self.port = port
self.executable = executable
self.user_home_dir = user_home_dir
self.user_data_dir = user_data_dir
self.proxy = proxy
self.ignore_cert_errors = ignore_cert_errors
self._shutdown = threading.Event()
def __enter__(self):
'''
Returns websocket url to chrome window with about:blank loaded.
'''
return self.start()
def __exit__(self, *args):
self.stop()
def start(self):
'''
Returns websocket url to chrome window with about:blank loaded.
'''
timeout_sec = 600
new_env = os.environ.copy()
new_env["HOME"] = self.user_home_dir
chrome_args = [
self.executable, "--use-mock-keychain", # mac thing
"--user-data-dir={}".format(self.user_data_dir),
"--remote-debugging-port={}".format(self.port),
"--disable-web-sockets", "--disable-cache",
"--window-size=1100,900", "--no-default-browser-check",
"--disable-first-run-ui", "--no-first-run",
"--homepage=about:blank", "--disable-direct-npapi-requests",
"--disable-web-security", "--disable-notifications",
"--disable-extensions",
"--disable-save-password-bubble"]
if self.ignore_cert_errors:
chrome_args.append("--ignore-certificate-errors")
if self.proxy:
chrome_args.append("--proxy-server={}".format(self.proxy))
chrome_args.append("about:blank")
self.logger.info("running: {}".format(" ".join(chrome_args)))
# start_new_session - new process group so we can kill the whole group
self.chrome_process = subprocess.Popen(chrome_args, env=new_env,
stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0,
start_new_session=True)
self._out_reader_thread = threading.Thread(target=self._read_stderr_stdout,
name="ChromeOutReaderThread(pid={})".format(self.chrome_process.pid))
self._out_reader_thread.start()
self.logger.info("chrome running, pid {}".format(self.chrome_process.pid))
self._start = time.time() # member variable just so that kill -QUIT reports it
json_url = "http://localhost:%s/json" % self.port
while True:
try:
raw_json = urllib.request.urlopen(json_url, timeout=30).read()
all_debug_info = json.loads(raw_json.decode('utf-8'))
debug_info = [x for x in all_debug_info if x['url'] == 'about:blank']
if debug_info and 'webSocketDebuggerUrl' in debug_info[0]:
self.logger.debug("{} returned {}".format(json_url, raw_json))
url = debug_info[0]['webSocketDebuggerUrl']
self.logger.info('got chrome window websocket debug url {} from {}'.format(url, json_url))
return url
except BaseException as e:
if int(time.time() - self._start) % 10 == 5:
self.logger.warn("problem with %s (will keep trying until timeout of %d seconds): %s", json_url, timeout_sec, e)
pass
finally:
if time.time() - self._start > timeout_sec:
self.logger.error("killing chrome, failed to retrieve %s after %s seconds", json_url, time.time() - self._start)
self.stop()
raise Exception("killed chrome, failed to retrieve {} after {} seconds".format(json_url, time.time() - self._start))
else:
time.sleep(0.5)
def _read_stderr_stdout(self):
# XXX select doesn't work on windows
def readline_nonblock(f):
buf = b""
while not self._shutdown.is_set() and (
len(buf) == 0 or buf[-1] != 0xa) and select.select(
[f],[],[],0.5)[0]:
buf += f.read(1)
return buf
try:
while not self._shutdown.is_set():
buf = readline_nonblock(self.chrome_process.stdout)
if buf:
if re.search(
b"Xlib: extension|"
b"CERT_PKIXVerifyCert for [^ ]* failed|"
b"^ALSA lib|ERROR:gl_surface_glx.cc|"
b"ERROR:gpu_child_thread.cc", buf):
logging.log(
brozzler.TRACE, "chrome pid %s STDOUT %s",
self.chrome_process.pid, buf)
else:
logging.debug(
"chrome pid %s STDOUT %s",
self.chrome_process.pid, buf)
buf = readline_nonblock(self.chrome_process.stderr)
if buf:
if re.search(
b"Xlib: extension|"
b"CERT_PKIXVerifyCert for [^ ]* failed|"
b"^ALSA lib|ERROR:gl_surface_glx.cc|"
b"ERROR:gpu_child_thread.cc", buf):
logging.log(
brozzler.TRACE, "chrome pid %s STDOUT %s",
self.chrome_process.pid, buf)
else:
logging.debug(
"chrome pid %s STDERR %s",
self.chrome_process.pid, buf)
except:
logging.error("unexpected exception", exc_info=True)
def stop(self):
if not self.chrome_process or self._shutdown.is_set():
return
timeout_sec = 300
self._shutdown.set()
self.logger.info("terminating chrome pgid %s" % self.chrome_process.pid)
os.killpg(self.chrome_process.pid, signal.SIGTERM)
first_sigterm = time.time()
try:
while time.time() - first_sigterm < timeout_sec:
time.sleep(0.5)
status = self.chrome_process.poll()
if status is not None:
if status == 0:
self.logger.info(
"chrome pid %s exited normally",
self.chrome_process.pid)
else:
self.logger.warn(
"chrome pid %s exited with nonzero status %s",
self.chrome_process.pid, status)
# XXX I would like to forcefully kill the process group
# here to guarantee no orphaned chromium subprocesses hang
# around, but there's a chance I suppose that some other
# process could have started with the same pgid
return
self.logger.warn(
"chrome pid %s still alive %.1f seconds after sending "
"SIGTERM, sending SIGKILL", self.chrome_process.pid,
time.time() - first_sigterm)
os.killpg(self.chrome_process.pid, signal.SIGKILL)
status = self.chrome_process.wait()
self.logger.warn(
"chrome pid %s reaped (status=%s) after killing with "
"SIGKILL", self.chrome_process.pid, status)
finally:
self._out_reader_thread.join()
self.chrome_process = None

213
brozzler/chrome.py Normal file
View File

@ -0,0 +1,213 @@
'''
brozzler/chrome.py - manages the chrome/chromium browser for brozzler
Copyright (C) 2014-2016 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
import logging
import urllib.request
import time
import threading
import subprocess
import os
import brozzler
import select
import re
import signal
import sqlite3
import datetime
import json
class Chrome:
logger = logging.getLogger(__module__ + '.' + __qualname__)
def __init__(
self, port, executable, user_home_dir, user_data_dir, proxy=None,
ignore_cert_errors=False):
self.port = port
self.executable = executable
self.user_home_dir = user_home_dir
self.user_data_dir = user_data_dir
self.proxy = proxy
self.ignore_cert_errors = ignore_cert_errors
self._shutdown = threading.Event()
def __enter__(self):
'''
Returns websocket url to chrome window with about:blank loaded.
'''
return self.start()
def __exit__(self, *args):
self.stop()
def start(self):
'''
Returns websocket url to chrome window with about:blank loaded.
'''
timeout_sec = 600
new_env = os.environ.copy()
new_env['HOME'] = self.user_home_dir
chrome_args = [
self.executable, '--use-mock-keychain', # mac thing
'--user-data-dir=%s' % self.user_data_dir,
'--remote-debugging-port=%s' % self.port,
'--disable-web-sockets', '--disable-cache',
'--window-size=1100,900', '--no-default-browser-check',
'--disable-first-run-ui', '--no-first-run',
'--homepage=about:blank', '--disable-direct-npapi-requests',
'--disable-web-security', '--disable-notifications',
'--disable-extensions', '--disable-save-password-bubble']
if self.ignore_cert_errors:
chrome_args.append('--ignore-certificate-errors')
if self.proxy:
chrome_args.append('--proxy-server=%s' % self.proxy)
chrome_args.append('about:blank')
self.logger.info(
'running: %s' % repr(subprocess.list2cmdline(chrome_args)))
# start_new_session - new process group so we can kill the whole group
self.chrome_process = subprocess.Popen(
chrome_args, env=new_env, start_new_session=True,
stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0)
self._out_reader_thread = threading.Thread(
target=self._read_stderr_stdout,
name='ChromeOutReaderThread(pid=%s)' % self.chrome_process.pid)
self._out_reader_thread.start()
self.logger.info('chrome running, pid %s' % self.chrome_process.pid)
# make this a member variable so that kill -QUIT reports it
self._start = time.time()
json_url = 'http://localhost:%s/json' % self.port
while True:
try:
raw_json = urllib.request.urlopen(json_url, timeout=30).read()
all_debug_info = json.loads(raw_json.decode('utf-8'))
debug_info = [x for x in all_debug_info
if x['url'] == 'about:blank']
if debug_info and 'webSocketDebuggerUrl' in debug_info[0]:
self.logger.debug('%s returned %s', json_url, raw_json)
url = debug_info[0]['webSocketDebuggerUrl']
self.logger.info(
'got chrome window websocket debug url %s from %s',
url, json_url)
return url
except BaseException as e:
if int(time.time() - self._start) % 10 == 5:
self.logger.warn(
'problem with %s (will keep trying until timeout '
'of %d seconds): %s', json_url, timeout_sec, e)
pass
finally:
if time.time() - self._start > timeout_sec:
self.logger.error(
'killing chrome, failed to retrieve %s after %s '
'seconds', json_url, time.time() - self._start)
self.stop()
raise Exception(
'killed chrome, failed to retrieve %s after %s '
'seconds' % (json_url, time.time() - self._start))
else:
time.sleep(0.5)
def _read_stderr_stdout(self):
# XXX select doesn't work on windows
def readline_nonblock(f):
buf = b''
while not self._shutdown.is_set() and (
len(buf) == 0 or buf[-1] != 0xa) and select.select(
[f],[],[],0.5)[0]:
buf += f.read(1)
return buf
try:
while not self._shutdown.is_set():
buf = readline_nonblock(self.chrome_process.stdout)
if buf:
if re.search(
b'Xlib: extension|'
b'CERT_PKIXVerifyCert for [^ ]* failed|'
b'^ALSA lib|ERROR:gl_surface_glx.cc|'
b'ERROR:gpu_child_thread.cc', buf):
logging.log(
brozzler.TRACE, 'chrome pid %s STDOUT %s',
self.chrome_process.pid, buf)
else:
logging.debug(
'chrome pid %s STDOUT %s',
self.chrome_process.pid, buf)
buf = readline_nonblock(self.chrome_process.stderr)
if buf:
if re.search(
b'Xlib: extension|'
b'CERT_PKIXVerifyCert for [^ ]* failed|'
b'^ALSA lib|ERROR:gl_surface_glx.cc|'
b'ERROR:gpu_child_thread.cc', buf):
logging.log(
brozzler.TRACE, 'chrome pid %s STDOUT %s',
self.chrome_process.pid, buf)
else:
logging.debug(
'chrome pid %s STDERR %s',
self.chrome_process.pid, buf)
except:
logging.error('unexpected exception', exc_info=True)
def stop(self):
if not self.chrome_process or self._shutdown.is_set():
return
timeout_sec = 300
self._shutdown.set()
self.logger.info('terminating chrome pgid %s' % self.chrome_process.pid)
os.killpg(self.chrome_process.pid, signal.SIGTERM)
first_sigterm = time.time()
try:
while time.time() - first_sigterm < timeout_sec:
time.sleep(0.5)
status = self.chrome_process.poll()
if status is not None:
if status == 0:
self.logger.info(
'chrome pid %s exited normally',
self.chrome_process.pid)
else:
self.logger.warn(
'chrome pid %s exited with nonzero status %s',
self.chrome_process.pid, status)
# XXX I would like to forcefully kill the process group
# here to guarantee no orphaned chromium subprocesses hang
# around, but there's a chance I suppose that some other
# process could have started with the same pgid
return
self.logger.warn(
'chrome pid %s still alive %.1f seconds after sending '
'SIGTERM, sending SIGKILL', self.chrome_process.pid,
time.time() - first_sigterm)
os.killpg(self.chrome_process.pid, signal.SIGKILL)
status = self.chrome_process.wait()
self.logger.warn(
'chrome pid %s reaped (status=%s) after killing with '
'SIGKILL', self.chrome_process.pid, status)
finally:
self._out_reader_thread.join()
self.chrome_process = None

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b8.dev130',
version='1.1b8.dev131',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',