diff --git a/brozzler/browser.py b/brozzler/browser.py index a616e21..62ef813 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -1,6 +1,5 @@ ''' -brozzler/browser.py - classes responsible for running web browsers -(chromium/chromium) and browsing web pages in them +brozzler/browser.py - manages the browsers for brozzler Copyright (C) 2014-2016 Internet Archive @@ -19,23 +18,19 @@ limitations under the License. import logging import json -import urllib.request import itertools import websocket import time import threading -import subprocess import tempfile import os import random import brozzler +from brozzler.chrome import Chrome from brozzler.behaviors import Behavior from requests.structures import CaseInsensitiveDict -import select -import re import base64 import psutil -import signal import sqlite3 import datetime @@ -233,7 +228,7 @@ class Browser: on_request=None, on_response=None, on_screenshot=None, on_url_change=None): """ - Synchronously loads a page, takes a screenshot, and runs behaviors. + Synchronously loads a page, runs behaviors, and takes a screenshot. Raises BrowsingException if browsing the page fails in a non-critical way. @@ -263,10 +258,10 @@ class Browser: self._websocket_url, on_open=self._visit_page, on_message=self._wrap_handle_message) - threadName = "WebsockThread:{}-{:%Y%m%d%H%M%S}".format( + thread_name = "WebsockThread:{}-{:%Y%m%d%H%M%S}".format( self.chrome_port, datetime.datetime.utcnow()) websock_thread = threading.Thread( - target=self._websock.run_forever, name=threadName, + target=self._websock.run_forever, name=thread_name, kwargs={'ping_timeout':0.5}) websock_thread.start() self._start = time.time() @@ -570,171 +565,3 @@ __brzl_compileOutlinks(window).join('\n'); # else: # self.logger.debug("%s", json_message) -class Chrome: - logger = logging.getLogger(__module__ + "." + __qualname__) - - def __init__(self, port, executable, user_home_dir, user_data_dir, proxy=None, ignore_cert_errors=False): - self.port = port - self.executable = executable - self.user_home_dir = user_home_dir - self.user_data_dir = user_data_dir - self.proxy = proxy - self.ignore_cert_errors = ignore_cert_errors - self._shutdown = threading.Event() - - def __enter__(self): - ''' - Returns websocket url to chrome window with about:blank loaded. - ''' - return self.start() - - def __exit__(self, *args): - self.stop() - - def start(self): - ''' - Returns websocket url to chrome window with about:blank loaded. - ''' - timeout_sec = 600 - new_env = os.environ.copy() - new_env["HOME"] = self.user_home_dir - chrome_args = [ - self.executable, "--use-mock-keychain", # mac thing - "--user-data-dir={}".format(self.user_data_dir), - "--remote-debugging-port={}".format(self.port), - "--disable-web-sockets", "--disable-cache", - "--window-size=1100,900", "--no-default-browser-check", - "--disable-first-run-ui", "--no-first-run", - "--homepage=about:blank", "--disable-direct-npapi-requests", - "--disable-web-security", "--disable-notifications", - "--disable-extensions", - "--disable-save-password-bubble"] - if self.ignore_cert_errors: - chrome_args.append("--ignore-certificate-errors") - if self.proxy: - chrome_args.append("--proxy-server={}".format(self.proxy)) - chrome_args.append("about:blank") - self.logger.info("running: {}".format(" ".join(chrome_args))) - # start_new_session - new process group so we can kill the whole group - self.chrome_process = subprocess.Popen(chrome_args, env=new_env, - stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0, - start_new_session=True) - self._out_reader_thread = threading.Thread(target=self._read_stderr_stdout, - name="ChromeOutReaderThread(pid={})".format(self.chrome_process.pid)) - self._out_reader_thread.start() - self.logger.info("chrome running, pid {}".format(self.chrome_process.pid)) - self._start = time.time() # member variable just so that kill -QUIT reports it - - json_url = "http://localhost:%s/json" % self.port - - while True: - try: - raw_json = urllib.request.urlopen(json_url, timeout=30).read() - all_debug_info = json.loads(raw_json.decode('utf-8')) - debug_info = [x for x in all_debug_info if x['url'] == 'about:blank'] - - if debug_info and 'webSocketDebuggerUrl' in debug_info[0]: - self.logger.debug("{} returned {}".format(json_url, raw_json)) - url = debug_info[0]['webSocketDebuggerUrl'] - self.logger.info('got chrome window websocket debug url {} from {}'.format(url, json_url)) - return url - except BaseException as e: - if int(time.time() - self._start) % 10 == 5: - self.logger.warn("problem with %s (will keep trying until timeout of %d seconds): %s", json_url, timeout_sec, e) - pass - finally: - if time.time() - self._start > timeout_sec: - self.logger.error("killing chrome, failed to retrieve %s after %s seconds", json_url, time.time() - self._start) - self.stop() - raise Exception("killed chrome, failed to retrieve {} after {} seconds".format(json_url, time.time() - self._start)) - else: - time.sleep(0.5) - - def _read_stderr_stdout(self): - # XXX select doesn't work on windows - def readline_nonblock(f): - buf = b"" - while not self._shutdown.is_set() and ( - len(buf) == 0 or buf[-1] != 0xa) and select.select( - [f],[],[],0.5)[0]: - buf += f.read(1) - return buf - - try: - while not self._shutdown.is_set(): - buf = readline_nonblock(self.chrome_process.stdout) - if buf: - if re.search( - b"Xlib: extension|" - b"CERT_PKIXVerifyCert for [^ ]* failed|" - b"^ALSA lib|ERROR:gl_surface_glx.cc|" - b"ERROR:gpu_child_thread.cc", buf): - logging.log( - brozzler.TRACE, "chrome pid %s STDOUT %s", - self.chrome_process.pid, buf) - else: - logging.debug( - "chrome pid %s STDOUT %s", - self.chrome_process.pid, buf) - - buf = readline_nonblock(self.chrome_process.stderr) - if buf: - if re.search( - b"Xlib: extension|" - b"CERT_PKIXVerifyCert for [^ ]* failed|" - b"^ALSA lib|ERROR:gl_surface_glx.cc|" - b"ERROR:gpu_child_thread.cc", buf): - logging.log( - brozzler.TRACE, "chrome pid %s STDOUT %s", - self.chrome_process.pid, buf) - else: - logging.debug( - "chrome pid %s STDERR %s", - self.chrome_process.pid, buf) - except: - logging.error("unexpected exception", exc_info=True) - - def stop(self): - if not self.chrome_process or self._shutdown.is_set(): - return - - timeout_sec = 300 - self._shutdown.set() - self.logger.info("terminating chrome pgid %s" % self.chrome_process.pid) - - os.killpg(self.chrome_process.pid, signal.SIGTERM) - first_sigterm = time.time() - - try: - while time.time() - first_sigterm < timeout_sec: - time.sleep(0.5) - - status = self.chrome_process.poll() - if status is not None: - if status == 0: - self.logger.info( - "chrome pid %s exited normally", - self.chrome_process.pid) - else: - self.logger.warn( - "chrome pid %s exited with nonzero status %s", - self.chrome_process.pid, status) - - # XXX I would like to forcefully kill the process group - # here to guarantee no orphaned chromium subprocesses hang - # around, but there's a chance I suppose that some other - # process could have started with the same pgid - return - - self.logger.warn( - "chrome pid %s still alive %.1f seconds after sending " - "SIGTERM, sending SIGKILL", self.chrome_process.pid, - time.time() - first_sigterm) - os.killpg(self.chrome_process.pid, signal.SIGKILL) - status = self.chrome_process.wait() - self.logger.warn( - "chrome pid %s reaped (status=%s) after killing with " - "SIGKILL", self.chrome_process.pid, status) - finally: - self._out_reader_thread.join() - self.chrome_process = None diff --git a/brozzler/chrome.py b/brozzler/chrome.py new file mode 100644 index 0000000..9b4d816 --- /dev/null +++ b/brozzler/chrome.py @@ -0,0 +1,213 @@ +''' +brozzler/chrome.py - manages the chrome/chromium browser for brozzler + +Copyright (C) 2014-2016 Internet Archive + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +import logging +import urllib.request +import time +import threading +import subprocess +import os +import brozzler +import select +import re +import signal +import sqlite3 +import datetime +import json + +class Chrome: + logger = logging.getLogger(__module__ + '.' + __qualname__) + + def __init__( + self, port, executable, user_home_dir, user_data_dir, proxy=None, + ignore_cert_errors=False): + self.port = port + self.executable = executable + self.user_home_dir = user_home_dir + self.user_data_dir = user_data_dir + self.proxy = proxy + self.ignore_cert_errors = ignore_cert_errors + self._shutdown = threading.Event() + + def __enter__(self): + ''' + Returns websocket url to chrome window with about:blank loaded. + ''' + return self.start() + + def __exit__(self, *args): + self.stop() + + def start(self): + ''' + Returns websocket url to chrome window with about:blank loaded. + ''' + timeout_sec = 600 + new_env = os.environ.copy() + new_env['HOME'] = self.user_home_dir + chrome_args = [ + self.executable, '--use-mock-keychain', # mac thing + '--user-data-dir=%s' % self.user_data_dir, + '--remote-debugging-port=%s' % self.port, + '--disable-web-sockets', '--disable-cache', + '--window-size=1100,900', '--no-default-browser-check', + '--disable-first-run-ui', '--no-first-run', + '--homepage=about:blank', '--disable-direct-npapi-requests', + '--disable-web-security', '--disable-notifications', + '--disable-extensions', '--disable-save-password-bubble'] + if self.ignore_cert_errors: + chrome_args.append('--ignore-certificate-errors') + if self.proxy: + chrome_args.append('--proxy-server=%s' % self.proxy) + chrome_args.append('about:blank') + self.logger.info( + 'running: %s' % repr(subprocess.list2cmdline(chrome_args))) + # start_new_session - new process group so we can kill the whole group + self.chrome_process = subprocess.Popen( + chrome_args, env=new_env, start_new_session=True, + stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0) + self._out_reader_thread = threading.Thread( + target=self._read_stderr_stdout, + name='ChromeOutReaderThread(pid=%s)' % self.chrome_process.pid) + self._out_reader_thread.start() + self.logger.info('chrome running, pid %s' % self.chrome_process.pid) + # make this a member variable so that kill -QUIT reports it + self._start = time.time() + + json_url = 'http://localhost:%s/json' % self.port + + while True: + try: + raw_json = urllib.request.urlopen(json_url, timeout=30).read() + all_debug_info = json.loads(raw_json.decode('utf-8')) + debug_info = [x for x in all_debug_info + if x['url'] == 'about:blank'] + + if debug_info and 'webSocketDebuggerUrl' in debug_info[0]: + self.logger.debug('%s returned %s', json_url, raw_json) + url = debug_info[0]['webSocketDebuggerUrl'] + self.logger.info( + 'got chrome window websocket debug url %s from %s', + url, json_url) + return url + except BaseException as e: + if int(time.time() - self._start) % 10 == 5: + self.logger.warn( + 'problem with %s (will keep trying until timeout ' + 'of %d seconds): %s', json_url, timeout_sec, e) + pass + finally: + if time.time() - self._start > timeout_sec: + self.logger.error( + 'killing chrome, failed to retrieve %s after %s ' + 'seconds', json_url, time.time() - self._start) + self.stop() + raise Exception( + 'killed chrome, failed to retrieve %s after %s ' + 'seconds' % (json_url, time.time() - self._start)) + else: + time.sleep(0.5) + + def _read_stderr_stdout(self): + # XXX select doesn't work on windows + def readline_nonblock(f): + buf = b'' + while not self._shutdown.is_set() and ( + len(buf) == 0 or buf[-1] != 0xa) and select.select( + [f],[],[],0.5)[0]: + buf += f.read(1) + return buf + + try: + while not self._shutdown.is_set(): + buf = readline_nonblock(self.chrome_process.stdout) + if buf: + if re.search( + b'Xlib: extension|' + b'CERT_PKIXVerifyCert for [^ ]* failed|' + b'^ALSA lib|ERROR:gl_surface_glx.cc|' + b'ERROR:gpu_child_thread.cc', buf): + logging.log( + brozzler.TRACE, 'chrome pid %s STDOUT %s', + self.chrome_process.pid, buf) + else: + logging.debug( + 'chrome pid %s STDOUT %s', + self.chrome_process.pid, buf) + + buf = readline_nonblock(self.chrome_process.stderr) + if buf: + if re.search( + b'Xlib: extension|' + b'CERT_PKIXVerifyCert for [^ ]* failed|' + b'^ALSA lib|ERROR:gl_surface_glx.cc|' + b'ERROR:gpu_child_thread.cc', buf): + logging.log( + brozzler.TRACE, 'chrome pid %s STDOUT %s', + self.chrome_process.pid, buf) + else: + logging.debug( + 'chrome pid %s STDERR %s', + self.chrome_process.pid, buf) + except: + logging.error('unexpected exception', exc_info=True) + + def stop(self): + if not self.chrome_process or self._shutdown.is_set(): + return + + timeout_sec = 300 + self._shutdown.set() + self.logger.info('terminating chrome pgid %s' % self.chrome_process.pid) + + os.killpg(self.chrome_process.pid, signal.SIGTERM) + first_sigterm = time.time() + + try: + while time.time() - first_sigterm < timeout_sec: + time.sleep(0.5) + + status = self.chrome_process.poll() + if status is not None: + if status == 0: + self.logger.info( + 'chrome pid %s exited normally', + self.chrome_process.pid) + else: + self.logger.warn( + 'chrome pid %s exited with nonzero status %s', + self.chrome_process.pid, status) + + # XXX I would like to forcefully kill the process group + # here to guarantee no orphaned chromium subprocesses hang + # around, but there's a chance I suppose that some other + # process could have started with the same pgid + return + + self.logger.warn( + 'chrome pid %s still alive %.1f seconds after sending ' + 'SIGTERM, sending SIGKILL', self.chrome_process.pid, + time.time() - first_sigterm) + os.killpg(self.chrome_process.pid, signal.SIGKILL) + status = self.chrome_process.wait() + self.logger.warn( + 'chrome pid %s reaped (status=%s) after killing with ' + 'SIGKILL', self.chrome_process.pid, status) + finally: + self._out_reader_thread.join() + self.chrome_process = None diff --git a/setup.py b/setup.py index 6cfb1a7..545f560 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b8.dev130', + version='1.1b8.dev131', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',