From 3c43fdaced98e1375e7eab93a740e9d3fb0df10a Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 30 Nov 2016 00:52:14 +0000 Subject: [PATCH 01/12] new utility brozzler-list-captures for looking up entries in the "captures" table --- brozzler/cli.py | 66 +++++++++++++++++++++++++++++++++++++++++++++++++ setup.py | 3 ++- 2 files changed, 68 insertions(+), 1 deletion(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index a57396f..b022771 100644 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -350,3 +350,69 @@ def brozzler_ensure_tables(): # sites, pages, jobs tables brozzler.frontier.RethinkDbFrontier(r) + +def brozzler_list_captures(): + ''' + Handy utility for looking up entries in the rethinkdb "captures" table by + url or sha1. + ''' + import surt + import rethinkdb + + arg_parser = argparse.ArgumentParser( + prog=os.path.basename(sys.argv[0]), + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + _add_rethinkdb_options(arg_parser) + _add_common_options(arg_parser) + arg_parser.add_argument( + 'url_or_sha1', metavar='URL_or_SHA1', + help='url or sha1 to look up in captures table') + + args = arg_parser.parse_args(args=sys.argv[1:]) + _configure_logging(args) + + r = rethinkstuff.Rethinker( + args.rethinkdb_servers.split(','), args.rethinkdb_db) + + class Jsonner(json.JSONEncoder): + def default(self, o): + if isinstance(o, datetime.datetime): + return o.isoformat() + return json.JSONEncoder.default(self, o) + + if args.url_or_sha1[:5] == 'sha1:': + raise Exception('not implemented') + # def find_response_by_digest(self, algo, raw_digest, bucket="__unspecified__"): + # if algo != "sha1": + # raise Exception( + # "digest type is %s but big captures table is indexed by " + # "sha1" % algo) + # sha1base32 = base64.b32encode(raw_digest).decode("utf-8") + # results_iter = self.r.table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type").run() + # results = list(results_iter) + # if len(results) > 0: + # if len(results) > 1: + # self.logger.debug("expected 0 or 1 but found %s results for sha1base32=%s bucket=%s (will use first result)", len(results), sha1base32, bucket) + # result = results[0] + # else: + # result = None + # self.logger.debug("returning %s for sha1base32=%s bucket=%s", + # result, sha1base32, bucket) + # return result + else: + key = surt.surt( + args.url_or_sha1, trailing_comma=True, host_massage=False, + with_scheme=True) + reql = r.table('captures').between( + [key[:150], rethinkdb.minval], + [key[:150]+'!', rethinkdb.maxval], + index='abbr_canon_surt_timestamp') + reql = reql.order_by(index='abbr_canon_surt_timestamp') + reql = reql.filter( + lambda capture: (capture['canon_surt'] >= key) + & (capture['canon_surt'] <= key)) + logging.debug('rethinkdb query: %s', reql) + results = reql.run() + for result in results: + print(json.dumps(result, cls=Jsonner, indent=2)) + diff --git a/setup.py b/setup.py index a613be4..6cfb1a7 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b8.dev129', + version='1.1b8.dev130', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', @@ -51,6 +51,7 @@ setuptools.setup( 'brozzler-new-site=brozzler.cli:brozzler_new_site', 'brozzler-worker=brozzler.cli:brozzler_worker', 'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables', + 'brozzler-list-captures=brozzler.cli:brozzler_list_captures', 'brozzler-dashboard=brozzler.dashboard:main', 'brozzler-easy=brozzler.easy:main', 'brozzler-wayback=brozzler.pywb:main', From 74009852d687f2be1a6dd93817c92f7f445d26d2 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 6 Dec 2016 12:50:38 -0800 Subject: [PATCH 02/12] split Chrome class into its own module --- brozzler/browser.py | 183 ++----------------------------------- brozzler/chrome.py | 213 ++++++++++++++++++++++++++++++++++++++++++++ setup.py | 2 +- 3 files changed, 219 insertions(+), 179 deletions(-) create mode 100644 brozzler/chrome.py diff --git a/brozzler/browser.py b/brozzler/browser.py index a616e21..62ef813 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -1,6 +1,5 @@ ''' -brozzler/browser.py - classes responsible for running web browsers -(chromium/chromium) and browsing web pages in them +brozzler/browser.py - manages the browsers for brozzler Copyright (C) 2014-2016 Internet Archive @@ -19,23 +18,19 @@ limitations under the License. import logging import json -import urllib.request import itertools import websocket import time import threading -import subprocess import tempfile import os import random import brozzler +from brozzler.chrome import Chrome from brozzler.behaviors import Behavior from requests.structures import CaseInsensitiveDict -import select -import re import base64 import psutil -import signal import sqlite3 import datetime @@ -233,7 +228,7 @@ class Browser: on_request=None, on_response=None, on_screenshot=None, on_url_change=None): """ - Synchronously loads a page, takes a screenshot, and runs behaviors. + Synchronously loads a page, runs behaviors, and takes a screenshot. Raises BrowsingException if browsing the page fails in a non-critical way. @@ -263,10 +258,10 @@ class Browser: self._websocket_url, on_open=self._visit_page, on_message=self._wrap_handle_message) - threadName = "WebsockThread:{}-{:%Y%m%d%H%M%S}".format( + thread_name = "WebsockThread:{}-{:%Y%m%d%H%M%S}".format( self.chrome_port, datetime.datetime.utcnow()) websock_thread = threading.Thread( - target=self._websock.run_forever, name=threadName, + target=self._websock.run_forever, name=thread_name, kwargs={'ping_timeout':0.5}) websock_thread.start() self._start = time.time() @@ -570,171 +565,3 @@ __brzl_compileOutlinks(window).join('\n'); # else: # self.logger.debug("%s", json_message) -class Chrome: - logger = logging.getLogger(__module__ + "." + __qualname__) - - def __init__(self, port, executable, user_home_dir, user_data_dir, proxy=None, ignore_cert_errors=False): - self.port = port - self.executable = executable - self.user_home_dir = user_home_dir - self.user_data_dir = user_data_dir - self.proxy = proxy - self.ignore_cert_errors = ignore_cert_errors - self._shutdown = threading.Event() - - def __enter__(self): - ''' - Returns websocket url to chrome window with about:blank loaded. - ''' - return self.start() - - def __exit__(self, *args): - self.stop() - - def start(self): - ''' - Returns websocket url to chrome window with about:blank loaded. - ''' - timeout_sec = 600 - new_env = os.environ.copy() - new_env["HOME"] = self.user_home_dir - chrome_args = [ - self.executable, "--use-mock-keychain", # mac thing - "--user-data-dir={}".format(self.user_data_dir), - "--remote-debugging-port={}".format(self.port), - "--disable-web-sockets", "--disable-cache", - "--window-size=1100,900", "--no-default-browser-check", - "--disable-first-run-ui", "--no-first-run", - "--homepage=about:blank", "--disable-direct-npapi-requests", - "--disable-web-security", "--disable-notifications", - "--disable-extensions", - "--disable-save-password-bubble"] - if self.ignore_cert_errors: - chrome_args.append("--ignore-certificate-errors") - if self.proxy: - chrome_args.append("--proxy-server={}".format(self.proxy)) - chrome_args.append("about:blank") - self.logger.info("running: {}".format(" ".join(chrome_args))) - # start_new_session - new process group so we can kill the whole group - self.chrome_process = subprocess.Popen(chrome_args, env=new_env, - stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0, - start_new_session=True) - self._out_reader_thread = threading.Thread(target=self._read_stderr_stdout, - name="ChromeOutReaderThread(pid={})".format(self.chrome_process.pid)) - self._out_reader_thread.start() - self.logger.info("chrome running, pid {}".format(self.chrome_process.pid)) - self._start = time.time() # member variable just so that kill -QUIT reports it - - json_url = "http://localhost:%s/json" % self.port - - while True: - try: - raw_json = urllib.request.urlopen(json_url, timeout=30).read() - all_debug_info = json.loads(raw_json.decode('utf-8')) - debug_info = [x for x in all_debug_info if x['url'] == 'about:blank'] - - if debug_info and 'webSocketDebuggerUrl' in debug_info[0]: - self.logger.debug("{} returned {}".format(json_url, raw_json)) - url = debug_info[0]['webSocketDebuggerUrl'] - self.logger.info('got chrome window websocket debug url {} from {}'.format(url, json_url)) - return url - except BaseException as e: - if int(time.time() - self._start) % 10 == 5: - self.logger.warn("problem with %s (will keep trying until timeout of %d seconds): %s", json_url, timeout_sec, e) - pass - finally: - if time.time() - self._start > timeout_sec: - self.logger.error("killing chrome, failed to retrieve %s after %s seconds", json_url, time.time() - self._start) - self.stop() - raise Exception("killed chrome, failed to retrieve {} after {} seconds".format(json_url, time.time() - self._start)) - else: - time.sleep(0.5) - - def _read_stderr_stdout(self): - # XXX select doesn't work on windows - def readline_nonblock(f): - buf = b"" - while not self._shutdown.is_set() and ( - len(buf) == 0 or buf[-1] != 0xa) and select.select( - [f],[],[],0.5)[0]: - buf += f.read(1) - return buf - - try: - while not self._shutdown.is_set(): - buf = readline_nonblock(self.chrome_process.stdout) - if buf: - if re.search( - b"Xlib: extension|" - b"CERT_PKIXVerifyCert for [^ ]* failed|" - b"^ALSA lib|ERROR:gl_surface_glx.cc|" - b"ERROR:gpu_child_thread.cc", buf): - logging.log( - brozzler.TRACE, "chrome pid %s STDOUT %s", - self.chrome_process.pid, buf) - else: - logging.debug( - "chrome pid %s STDOUT %s", - self.chrome_process.pid, buf) - - buf = readline_nonblock(self.chrome_process.stderr) - if buf: - if re.search( - b"Xlib: extension|" - b"CERT_PKIXVerifyCert for [^ ]* failed|" - b"^ALSA lib|ERROR:gl_surface_glx.cc|" - b"ERROR:gpu_child_thread.cc", buf): - logging.log( - brozzler.TRACE, "chrome pid %s STDOUT %s", - self.chrome_process.pid, buf) - else: - logging.debug( - "chrome pid %s STDERR %s", - self.chrome_process.pid, buf) - except: - logging.error("unexpected exception", exc_info=True) - - def stop(self): - if not self.chrome_process or self._shutdown.is_set(): - return - - timeout_sec = 300 - self._shutdown.set() - self.logger.info("terminating chrome pgid %s" % self.chrome_process.pid) - - os.killpg(self.chrome_process.pid, signal.SIGTERM) - first_sigterm = time.time() - - try: - while time.time() - first_sigterm < timeout_sec: - time.sleep(0.5) - - status = self.chrome_process.poll() - if status is not None: - if status == 0: - self.logger.info( - "chrome pid %s exited normally", - self.chrome_process.pid) - else: - self.logger.warn( - "chrome pid %s exited with nonzero status %s", - self.chrome_process.pid, status) - - # XXX I would like to forcefully kill the process group - # here to guarantee no orphaned chromium subprocesses hang - # around, but there's a chance I suppose that some other - # process could have started with the same pgid - return - - self.logger.warn( - "chrome pid %s still alive %.1f seconds after sending " - "SIGTERM, sending SIGKILL", self.chrome_process.pid, - time.time() - first_sigterm) - os.killpg(self.chrome_process.pid, signal.SIGKILL) - status = self.chrome_process.wait() - self.logger.warn( - "chrome pid %s reaped (status=%s) after killing with " - "SIGKILL", self.chrome_process.pid, status) - finally: - self._out_reader_thread.join() - self.chrome_process = None diff --git a/brozzler/chrome.py b/brozzler/chrome.py new file mode 100644 index 0000000..9b4d816 --- /dev/null +++ b/brozzler/chrome.py @@ -0,0 +1,213 @@ +''' +brozzler/chrome.py - manages the chrome/chromium browser for brozzler + +Copyright (C) 2014-2016 Internet Archive + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +import logging +import urllib.request +import time +import threading +import subprocess +import os +import brozzler +import select +import re +import signal +import sqlite3 +import datetime +import json + +class Chrome: + logger = logging.getLogger(__module__ + '.' + __qualname__) + + def __init__( + self, port, executable, user_home_dir, user_data_dir, proxy=None, + ignore_cert_errors=False): + self.port = port + self.executable = executable + self.user_home_dir = user_home_dir + self.user_data_dir = user_data_dir + self.proxy = proxy + self.ignore_cert_errors = ignore_cert_errors + self._shutdown = threading.Event() + + def __enter__(self): + ''' + Returns websocket url to chrome window with about:blank loaded. + ''' + return self.start() + + def __exit__(self, *args): + self.stop() + + def start(self): + ''' + Returns websocket url to chrome window with about:blank loaded. + ''' + timeout_sec = 600 + new_env = os.environ.copy() + new_env['HOME'] = self.user_home_dir + chrome_args = [ + self.executable, '--use-mock-keychain', # mac thing + '--user-data-dir=%s' % self.user_data_dir, + '--remote-debugging-port=%s' % self.port, + '--disable-web-sockets', '--disable-cache', + '--window-size=1100,900', '--no-default-browser-check', + '--disable-first-run-ui', '--no-first-run', + '--homepage=about:blank', '--disable-direct-npapi-requests', + '--disable-web-security', '--disable-notifications', + '--disable-extensions', '--disable-save-password-bubble'] + if self.ignore_cert_errors: + chrome_args.append('--ignore-certificate-errors') + if self.proxy: + chrome_args.append('--proxy-server=%s' % self.proxy) + chrome_args.append('about:blank') + self.logger.info( + 'running: %s' % repr(subprocess.list2cmdline(chrome_args))) + # start_new_session - new process group so we can kill the whole group + self.chrome_process = subprocess.Popen( + chrome_args, env=new_env, start_new_session=True, + stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0) + self._out_reader_thread = threading.Thread( + target=self._read_stderr_stdout, + name='ChromeOutReaderThread(pid=%s)' % self.chrome_process.pid) + self._out_reader_thread.start() + self.logger.info('chrome running, pid %s' % self.chrome_process.pid) + # make this a member variable so that kill -QUIT reports it + self._start = time.time() + + json_url = 'http://localhost:%s/json' % self.port + + while True: + try: + raw_json = urllib.request.urlopen(json_url, timeout=30).read() + all_debug_info = json.loads(raw_json.decode('utf-8')) + debug_info = [x for x in all_debug_info + if x['url'] == 'about:blank'] + + if debug_info and 'webSocketDebuggerUrl' in debug_info[0]: + self.logger.debug('%s returned %s', json_url, raw_json) + url = debug_info[0]['webSocketDebuggerUrl'] + self.logger.info( + 'got chrome window websocket debug url %s from %s', + url, json_url) + return url + except BaseException as e: + if int(time.time() - self._start) % 10 == 5: + self.logger.warn( + 'problem with %s (will keep trying until timeout ' + 'of %d seconds): %s', json_url, timeout_sec, e) + pass + finally: + if time.time() - self._start > timeout_sec: + self.logger.error( + 'killing chrome, failed to retrieve %s after %s ' + 'seconds', json_url, time.time() - self._start) + self.stop() + raise Exception( + 'killed chrome, failed to retrieve %s after %s ' + 'seconds' % (json_url, time.time() - self._start)) + else: + time.sleep(0.5) + + def _read_stderr_stdout(self): + # XXX select doesn't work on windows + def readline_nonblock(f): + buf = b'' + while not self._shutdown.is_set() and ( + len(buf) == 0 or buf[-1] != 0xa) and select.select( + [f],[],[],0.5)[0]: + buf += f.read(1) + return buf + + try: + while not self._shutdown.is_set(): + buf = readline_nonblock(self.chrome_process.stdout) + if buf: + if re.search( + b'Xlib: extension|' + b'CERT_PKIXVerifyCert for [^ ]* failed|' + b'^ALSA lib|ERROR:gl_surface_glx.cc|' + b'ERROR:gpu_child_thread.cc', buf): + logging.log( + brozzler.TRACE, 'chrome pid %s STDOUT %s', + self.chrome_process.pid, buf) + else: + logging.debug( + 'chrome pid %s STDOUT %s', + self.chrome_process.pid, buf) + + buf = readline_nonblock(self.chrome_process.stderr) + if buf: + if re.search( + b'Xlib: extension|' + b'CERT_PKIXVerifyCert for [^ ]* failed|' + b'^ALSA lib|ERROR:gl_surface_glx.cc|' + b'ERROR:gpu_child_thread.cc', buf): + logging.log( + brozzler.TRACE, 'chrome pid %s STDOUT %s', + self.chrome_process.pid, buf) + else: + logging.debug( + 'chrome pid %s STDERR %s', + self.chrome_process.pid, buf) + except: + logging.error('unexpected exception', exc_info=True) + + def stop(self): + if not self.chrome_process or self._shutdown.is_set(): + return + + timeout_sec = 300 + self._shutdown.set() + self.logger.info('terminating chrome pgid %s' % self.chrome_process.pid) + + os.killpg(self.chrome_process.pid, signal.SIGTERM) + first_sigterm = time.time() + + try: + while time.time() - first_sigterm < timeout_sec: + time.sleep(0.5) + + status = self.chrome_process.poll() + if status is not None: + if status == 0: + self.logger.info( + 'chrome pid %s exited normally', + self.chrome_process.pid) + else: + self.logger.warn( + 'chrome pid %s exited with nonzero status %s', + self.chrome_process.pid, status) + + # XXX I would like to forcefully kill the process group + # here to guarantee no orphaned chromium subprocesses hang + # around, but there's a chance I suppose that some other + # process could have started with the same pgid + return + + self.logger.warn( + 'chrome pid %s still alive %.1f seconds after sending ' + 'SIGTERM, sending SIGKILL', self.chrome_process.pid, + time.time() - first_sigterm) + os.killpg(self.chrome_process.pid, signal.SIGKILL) + status = self.chrome_process.wait() + self.logger.warn( + 'chrome pid %s reaped (status=%s) after killing with ' + 'SIGKILL', self.chrome_process.pid, status) + finally: + self._out_reader_thread.join() + self.chrome_process = None diff --git a/setup.py b/setup.py index 6cfb1a7..545f560 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b8.dev130', + version='1.1b8.dev131', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', From ce03381b92495d38a1ae9706c1d4f9be3b71950d Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 6 Dec 2016 17:12:20 -0800 Subject: [PATCH 03/12] move _find_available_ports to chrome.py, changing the way it works so that browser:9200 doesn't get stuck at 9201 forever, which pushes 9201 to 9202 etc, and add a unit test --- brozzler/browser.py | 23 +++-------------------- brozzler/chrome.py | 20 ++++++++++++++++++++ setup.py | 2 +- tests/test_units.py | 18 ++++++++++++++++++ 4 files changed, 42 insertions(+), 21 deletions(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index 62ef813..a8cdd1f 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -30,7 +30,6 @@ from brozzler.chrome import Chrome from brozzler.behaviors import Behavior from requests.structures import CaseInsensitiveDict import base64 -import psutil import sqlite3 import datetime @@ -103,7 +102,9 @@ class Browser: HARD_TIMEOUT_SECONDS = 20 * 60 - def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', proxy=None, ignore_cert_errors=False): + def __init__( + self, chrome_port=9222, chrome_exe='chromium-browser', proxy=None, + ignore_cert_errors=False): self.command_id = itertools.count(1) self.chrome_port = chrome_port self.chrome_exe = chrome_exe @@ -130,7 +131,6 @@ class Browser: def start(self, proxy=None, cookie_db=None): if not self._chrome_instance: # these can raise exceptions - self.chrome_port = self._find_available_port() self._work_dir = tempfile.TemporaryDirectory() if cookie_db is not None: cookie_dir = os.path.join( @@ -199,23 +199,6 @@ class Browser: cookie_location, exc_info=True) return cookie_db - def _find_available_port(self): - port_available = False - port = self.chrome_port - - try: - conns = psutil.net_connections(kind="tcp") - except psutil.AccessDenied: - return port - - for p in range(port, 65535): - if any(connection.laddr[1] == p for connection in conns): - self.logger.warn("port %s already open, will try %s", p, p+1) - else: - port = p - break - return port - def is_running(self): return bool(self._websocket_url) diff --git a/brozzler/chrome.py b/brozzler/chrome.py index 9b4d816..96ae9cd 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -29,6 +29,7 @@ import signal import sqlite3 import datetime import json +import psutil class Chrome: logger = logging.getLogger(__module__ + '.' + __qualname__) @@ -53,6 +54,24 @@ class Chrome: def __exit__(self, *args): self.stop() + def _find_available_port(self, default_port=9200): + try: + conns = psutil.net_connections(kind='tcp') + except psutil.AccessDenied: + return default_port + + if any(conn.laddr[1] == default_port for conn in conns): + return default_port + + for p in range(9999,8999,-1): + if not any(conn.laddr[1] == p for conn in conns): + self.logger.warn( + 'port %s already in use, using %s instead', + default_port, p) + return p + + return default_port + def start(self): ''' Returns websocket url to chrome window with about:blank loaded. @@ -60,6 +79,7 @@ class Chrome: timeout_sec = 600 new_env = os.environ.copy() new_env['HOME'] = self.user_home_dir + self.port = self._find_available_port(self.port) chrome_args = [ self.executable, '--use-mock-keychain', # mac thing '--user-data-dir=%s' % self.user_data_dir, diff --git a/setup.py b/setup.py index 545f560..08748a2 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b8.dev131', + version='1.1b8.dev132', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', diff --git a/tests/test_units.py b/tests/test_units.py index 2fee049..acd5fd0 100644 --- a/tests/test_units.py +++ b/tests/test_units.py @@ -22,6 +22,9 @@ import http.server import threading import os import brozzler +import brozzler.chrome +import socket +import logging @pytest.fixture(scope='module') def httpd(request): @@ -52,3 +55,18 @@ def test_robots(httpd): site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh') assert not brozzler.is_permitted_by_robots(site, url) +def test_find_available_port(): + try: + psutil.net_connections(kind='tcp') + except psutil.AccessDenied: + logging.warn( + 'skipping _find_available_port() test because ' + 'psutil.net_connections(kind="tcp") raised AccessDenied') + return + assert brozzler.chrome.Chrome._find_available_port(None, 9800) == 9800 + sock = socket.socket() + sock.bind(('localhost', 9800)) + assert brozzler.chrome.Chrome._find_available_port(None, 9800) == 9999 + sock.close() + assert brozzler.chrome.Chrome._find_available_port(None, 9800) == 9800 + From d3063fbd2b6fbb771e8f96218dc673aeab9dc08f Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 6 Dec 2016 18:04:51 -0800 Subject: [PATCH 04/12] move cookie db management code into chrome.py --- brozzler/browser.py | 59 ++++----------------------------- brozzler/chrome.py | 79 +++++++++++++++++++++++++++++++++++++-------- setup.py | 2 +- 3 files changed, 74 insertions(+), 66 deletions(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index a8cdd1f..c575c20 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -22,7 +22,6 @@ import itertools import websocket import time import threading -import tempfile import os import random import brozzler @@ -130,31 +129,10 @@ class Browser: def start(self, proxy=None, cookie_db=None): if not self._chrome_instance: - # these can raise exceptions - self._work_dir = tempfile.TemporaryDirectory() - if cookie_db is not None: - cookie_dir = os.path.join( - self._work_dir.name, "chrome-user-data", "Default") - cookie_location = os.path.join(cookie_dir, "Cookies") - self.logger.debug( - "cookie DB provided, writing to %s", cookie_location) - os.makedirs(cookie_dir, exist_ok=True) - - try: - with open(cookie_location, 'wb') as cookie_file: - cookie_file.write(cookie_db) - except OSError: - self.logger.error( - "exception writing cookie file at %s", - cookie_location, exc_info=True) - self._chrome_instance = Chrome( port=self.chrome_port, executable=self.chrome_exe, - user_home_dir=self._work_dir.name, - user_data_dir=os.sep.join([ - self._work_dir.name, "chrome-user-data"]), ignore_cert_errors=self.ignore_cert_errors, - proxy=proxy or self.proxy) + proxy=proxy or self.proxy, cookie_db=None) try: self._websocket_url = self._chrome_instance.start() except: @@ -166,45 +144,22 @@ class Browser: if self.is_running(): self._chrome_instance.stop() self._chrome_instance = None - try: - self._work_dir.cleanup() - except: - self.logger.error("exception deleting %s", self._work_dir, - exc_info=True) - self._work_dir = None self._websocket_url = None except: self.logger.error("problem stopping", exc_info=True) - def persist_and_read_cookie_db(self): - cookie_location = os.path.join( - self._work_dir.name, "chrome-user-data", "Default", "Cookies") - self.logger.debug( - "marking cookies persistent then reading file into memory: %s", - cookie_location) - try: - with sqlite3.connect(cookie_location) as conn: - cur = conn.cursor() - cur.execute("UPDATE cookies SET persistent = 1") - except sqlite3.Error: - self.logger.error("exception updating cookie DB", exc_info=True) - - cookie_db=None - try: - with open(cookie_location, "rb") as cookie_file: - cookie_db = cookie_file.read() - except OSError: - self.logger.error( - "exception reading from cookie DB file %s", - cookie_location, exc_info=True) - return cookie_db - def is_running(self): return bool(self._websocket_url) def abort_browse_page(self): self._abort_browse_page = True + def persist_and_read_cookie_db(self): + if self._chrome_instance: + return self._chrome_instance.persist_and_read_cookie_db() + else: + return None + def browse_page( self, url, extra_headers=None, behavior_parameters=None, user_agent=None, diff --git a/brozzler/chrome.py b/brozzler/chrome.py index 96ae9cd..e9d3ef0 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -27,22 +27,21 @@ import select import re import signal import sqlite3 -import datetime import json import psutil +import tempfile class Chrome: logger = logging.getLogger(__module__ + '.' + __qualname__) def __init__( - self, port, executable, user_home_dir, user_data_dir, proxy=None, - ignore_cert_errors=False): + self, port, executable, proxy=None, ignore_cert_errors=False, + cookie_db=None): self.port = port self.executable = executable - self.user_home_dir = user_home_dir - self.user_data_dir = user_data_dir self.proxy = proxy self.ignore_cert_errors = ignore_cert_errors + self.cookie_db = cookie_db self._shutdown = threading.Event() def __enter__(self): @@ -72,17 +71,61 @@ class Chrome: return default_port + def _init_cookie_db(self): + if self.cookie_db is not None: + cookie_dir = os.path.join(self._chrome_user_data_dir, 'Default') + cookie_location = os.path.join(cookie_dir, 'Cookies') + self.logger.debug( + 'cookie DB provided, writing to %s', cookie_location) + os.makedirs(cookie_dir, exist_ok=True) + + try: + with open(cookie_location, 'wb') as cookie_file: + cookie_file.write(self.cookie_db) + except OSError: + self.logger.error( + 'exception writing cookie file at %s', + cookie_location, exc_info=True) + + def persist_and_read_cookie_db(self): + cookie_location = os.path.join( + self._chrome_user_data_dir, 'Default', 'Cookies') + self.logger.debug( + 'marking cookies persistent then reading file into memory: %s', + cookie_location) + try: + with sqlite3.connect(cookie_location) as conn: + cur = conn.cursor() + cur.execute('UPDATE cookies SET persistent = 1') + except sqlite3.Error: + self.logger.error('exception updating cookie DB', exc_info=True) + + cookie_db = None + try: + with open(cookie_location, 'rb') as cookie_file: + cookie_db = cookie_file.read() + except OSError: + self.logger.error( + 'exception reading from cookie DB file %s', + cookie_location, exc_info=True) + return cookie_db + def start(self): ''' Returns websocket url to chrome window with about:blank loaded. ''' - timeout_sec = 600 + # these can raise exceptions + self._home_tmpdir = tempfile.TemporaryDirectory() + self._chrome_user_data_dir = os.path.join( + self._home_tmpdir.name, 'chrome-user-data'), + self._init_cookie_db() + new_env = os.environ.copy() - new_env['HOME'] = self.user_home_dir + new_env['HOME'] = self._home_tmpdir.name self.port = self._find_available_port(self.port) chrome_args = [ self.executable, '--use-mock-keychain', # mac thing - '--user-data-dir=%s' % self.user_data_dir, + '--user-data-dir=%s' % self._chrome_user_data_dir, '--remote-debugging-port=%s' % self.port, '--disable-web-sockets', '--disable-cache', '--window-size=1100,900', '--no-default-browser-check', @@ -96,7 +139,7 @@ class Chrome: chrome_args.append('--proxy-server=%s' % self.proxy) chrome_args.append('about:blank') self.logger.info( - 'running: %s' % repr(subprocess.list2cmdline(chrome_args))) + 'running: %s', repr(subprocess.list2cmdline(chrome_args))) # start_new_session - new process group so we can kill the whole group self.chrome_process = subprocess.Popen( chrome_args, env=new_env, start_new_session=True, @@ -106,11 +149,14 @@ class Chrome: name='ChromeOutReaderThread(pid=%s)' % self.chrome_process.pid) self._out_reader_thread.start() self.logger.info('chrome running, pid %s' % self.chrome_process.pid) + + return self._websocket_url() + + def _websocket_url(self): + timeout_sec = 600 + json_url = 'http://localhost:%s/json' % self.port # make this a member variable so that kill -QUIT reports it self._start = time.time() - - json_url = 'http://localhost:%s/json' % self.port - while True: try: raw_json = urllib.request.urlopen(json_url, timeout=30).read() @@ -134,7 +180,7 @@ class Chrome: finally: if time.time() - self._start > timeout_sec: self.logger.error( - 'killing chrome, failed to retrieve %s after %s ' + 'killing chrome, failed to retrieve %s after % ' 'seconds', json_url, time.time() - self._start) self.stop() raise Exception( @@ -228,6 +274,13 @@ class Chrome: self.logger.warn( 'chrome pid %s reaped (status=%s) after killing with ' 'SIGKILL', self.chrome_process.pid, status) + + try: + self._home_tmpdir.cleanup() + except: + self.logger.error( + "exception deleting %s", self._home_tmpdir, + exc_info=True) finally: self._out_reader_thread.join() self.chrome_process = None diff --git a/setup.py b/setup.py index 08748a2..2f6723e 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b8.dev132', + version='1.1b8.dev133', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', From e250c4ca893edeb34f659342ca13e8d860609747 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 7 Dec 2016 09:33:06 -0800 Subject: [PATCH 05/12] wrong branch of warcprox in ansible install --- ansible/roles/warcprox/tasks/main.yml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/roles/warcprox/tasks/main.yml b/ansible/roles/warcprox/tasks/main.yml index d1724d4..be53dc5 100644 --- a/ansible/roles/warcprox/tasks/main.yml +++ b/ansible/roles/warcprox/tasks/main.yml @@ -14,7 +14,7 @@ become: true file: path={{venv_root}}/warcprox-ve34 state=directory owner={{user}} - name: install warcprox in virtualenv - pip: name=git+https://github.com/internetarchive/warcprox.git@2.x#egg=warcprox + pip: name=git+https://github.com/internetarchive/warcprox.git#egg=warcprox virtualenv={{venv_root}}/warcprox-ve34 virtualenv_python=python3.4 extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache' diff --git a/setup.py b/setup.py index 2f6723e..63aeb20 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b8.dev133', + version='1.1b8.dev134', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', From 0b6c5346bd5d9fe320d720bd9af2d1f1ee9656d1 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 7 Dec 2016 11:18:41 -0800 Subject: [PATCH 06/12] avoid broken version of websocket-client to fix https://github.com/internetarchive/brozzler/issues/28 --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 63aeb20..2fd868a 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b8.dev134', + version='1.1b8.dev135', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', @@ -62,7 +62,7 @@ setuptools.setup( 'youtube-dl', 'reppy==0.3.4', 'requests', - 'websocket-client', + 'websocket-client!=0.39.0', 'pillow==3.3.0', 'surt>=0.3.0', 'rethinkstuff>=0.1.5', From eed8b9ec30739c6b249dd8f7f3152dc327df2d46 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 7 Dec 2016 11:20:10 -0800 Subject: [PATCH 07/12] little fixes --- brozzler/chrome.py | 4 ++-- brozzler/pywb.py | 2 +- setup.py | 2 +- tests/test_units.py | 1 + 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/brozzler/chrome.py b/brozzler/chrome.py index e9d3ef0..f72ff8d 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -117,7 +117,7 @@ class Chrome: # these can raise exceptions self._home_tmpdir = tempfile.TemporaryDirectory() self._chrome_user_data_dir = os.path.join( - self._home_tmpdir.name, 'chrome-user-data'), + self._home_tmpdir.name, 'chrome-user-data') self._init_cookie_db() new_env = os.environ.copy() @@ -279,7 +279,7 @@ class Chrome: self._home_tmpdir.cleanup() except: self.logger.error( - "exception deleting %s", self._home_tmpdir, + 'exception deleting %s', self._home_tmpdir, exc_info=True) finally: self._out_reader_thread.join() diff --git a/brozzler/pywb.py b/brozzler/pywb.py index 9b0a0f7..7b0b95a 100644 --- a/brozzler/pywb.py +++ b/brozzler/pywb.py @@ -70,7 +70,7 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource): 'filename': record['filename'], } if record['warc_type'] != 'revisit': - blob['mime'] = record['content_type'] + blob['mime'] = record['content_type'] or '-' else: blob['mime'] = 'warc/revisit' # b'org,archive)/ 20160427215530 {"url": "https://archive.org/", "mime": "text/html", "status": "200", "digest": "VILUFXZD232SLUA6XROZQIMEVUPW6EIE", "length": "16001", "offset": "90144", "filename": "ARCHIVEIT-261-ONE_TIME-JOB209607-20160427215508135-00000.warc.gz"}' diff --git a/setup.py b/setup.py index 2fd868a..7fd25c6 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b8.dev135', + version='1.1b8.dev137', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', diff --git a/tests/test_units.py b/tests/test_units.py index acd5fd0..f8042f0 100644 --- a/tests/test_units.py +++ b/tests/test_units.py @@ -25,6 +25,7 @@ import brozzler import brozzler.chrome import socket import logging +import psutil @pytest.fixture(scope='module') def httpd(request): From 9bcec54f4b62e34d26a50230049ff8ccc48a3b12 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 7 Dec 2016 14:08:34 -0800 Subject: [PATCH 08/12] fix _find_available_port and its unit test --- brozzler/chrome.py | 2 +- setup.py | 2 +- tests/test_units.py | 8 +++++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/brozzler/chrome.py b/brozzler/chrome.py index f72ff8d..5208442 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -59,7 +59,7 @@ class Chrome: except psutil.AccessDenied: return default_port - if any(conn.laddr[1] == default_port for conn in conns): + if not any(conn.laddr[1] == default_port for conn in conns): return default_port for p in range(9999,8999,-1): diff --git a/setup.py b/setup.py index 7fd25c6..9dce99d 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b8.dev137', + version='1.1b8.dev138', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', diff --git a/tests/test_units.py b/tests/test_units.py index f8042f0..58b929f 100644 --- a/tests/test_units.py +++ b/tests/test_units.py @@ -57,6 +57,7 @@ def test_robots(httpd): assert not brozzler.is_permitted_by_robots(site, url) def test_find_available_port(): + x = brozzler.chrome.Chrome(None, None) try: psutil.net_connections(kind='tcp') except psutil.AccessDenied: @@ -64,10 +65,11 @@ def test_find_available_port(): 'skipping _find_available_port() test because ' 'psutil.net_connections(kind="tcp") raised AccessDenied') return - assert brozzler.chrome.Chrome._find_available_port(None, 9800) == 9800 + assert x._find_available_port(9800) == 9800 sock = socket.socket() sock.bind(('localhost', 9800)) - assert brozzler.chrome.Chrome._find_available_port(None, 9800) == 9999 + sock.listen(0) + assert x._find_available_port(9800) == 9999 sock.close() - assert brozzler.chrome.Chrome._find_available_port(None, 9800) == 9800 + assert x._find_available_port(9800) == 9800 From 40b4d9bfe8a5c4468533e789b5dc47c69d111868 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 7 Dec 2016 14:46:29 -0800 Subject: [PATCH 09/12] travis-ci slack integration --- .travis.yml | 4 +++- setup.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 552bf6b..3d745c6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,4 +14,6 @@ after_failure: - sudo cat /var/log/upstart/warcprox.log - sudo cat /var/log/upstart/brozzler-worker.log - sudo cat /var/log/upstart/pywb.log - +notifications: + slack: + secure: KPPXSscXnmSEQ2NXBZFKrzDEYHg067Kv1WR7RTRUH8EIlSS9MHTyErRa7HkaRPmqOllj4vvPbplNU2ALnCfhP4cqW+MvF0xv3GuEGXQ7Om2sBvVUQ3w0JJ5rLq9ferAfGdSnQFeViqfDix5LA3fMNZGouUHQdUHq7iO8E9n9jntvkKO9Jff7Dyo0K5KvOZOJfM9KsqFZLlFO5zoNB6Y9jubIT7+Ulk3EDto/Kny34VPIyJIm7y0cHHlYLEq780AweY0EIwMyMg/VPSRrVAsbLSrilO0YRgsQpjPC9Ci/rAWNWooaOk0eA+bwv1uHQnGtH0z446XUMXr3UZ2QlD4DE/uoP2okkl8EtqvlmEyjV8eO86TqYFDRgKfYpvlK6hHtb7SAHX28QeXQjbKNc5f7KpKO5PtZqaoBRL7acLlKyS8xQGiRtonTPFSBTFR2A+s6dZmKO9dDboglptiHk4dvL1ZD4S8qLJn1JjTJqvIU6tpCY3BpNErn4n1MkDjN5nqdXf7Q9Vmui8vRetwnMf1oXcsKj9FEt2utNfDqFNXcFsN+Mnr9rhXQ1++gt/7Zo844OowiARcxqZTNy5LqSD01WgGCvNMy3Odf+FTQ8PcDOF+001+g8La1R99U0o9/hT/gy+WYk2prYneWru4pQHF/a6goZgkLTwkskcaPVpDJtDs= diff --git a/setup.py b/setup.py index 9dce99d..fac887d 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b8.dev138', + version='1.1b8.dev139', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', From f6a25aa4f03719ba9f428a6cc4607cdc80bf5092 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 8 Dec 2016 15:16:02 -0800 Subject: [PATCH 10/12] brozzler logo svg with small default size --- brozzler/dashboard/static/brozzler-icon.svg | 262 ++++++++++++++++++++ setup.py | 2 +- 2 files changed, 263 insertions(+), 1 deletion(-) create mode 100644 brozzler/dashboard/static/brozzler-icon.svg diff --git a/brozzler/dashboard/static/brozzler-icon.svg b/brozzler/dashboard/static/brozzler-icon.svg new file mode 100644 index 0000000..c8fecdc --- /dev/null +++ b/brozzler/dashboard/static/brozzler-icon.svg @@ -0,0 +1,262 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/setup.py b/setup.py index fac887d..3c5468d 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b8.dev139', + version='1.1b8.dev140', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', From af1e1c75ecbaa564c1b0d44869dddeb2362aad33 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 9 Dec 2016 14:16:27 -0800 Subject: [PATCH 11/12] avoid infinite loop in case youtube-dl encounters redirect loop (which can be ok if cookies have been set or something) --- brozzler/worker.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index 21aa22e..436c08b 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -78,7 +78,7 @@ class YoutubeDLSpy(urllib.request.BaseHandler): final_url = url while final_url in redirects: - final_url = redirects[final_url].response.headers['location'] + final_url = redirects.pop(final_url).response.headers['location'] final_bounces = [] for txn in self.transactions: diff --git a/setup.py b/setup.py index 3c5468d..0778064 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b8.dev140', + version='1.1b8.dev141', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', From d68053764c097dc2db39613302776ac46a1cb3c7 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 9 Dec 2016 16:43:23 -0800 Subject: [PATCH 12/12] fix bug handling page with zero outlinks --- brozzler/browser.py | 7 +++++-- setup.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index c575c20..f545f5f 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -318,8 +318,11 @@ __brzl_compileOutlinks(window).join('\n'); chain = [] def set_outlinks(message): - self._outlinks = frozenset( - message["result"]["result"]["value"].split("\n")) + if message["result"]["result"]["value"]: + self._outlinks = frozenset( + message["result"]["result"]["value"].split("\n")) + else: + self._outlinks = frozenset() chain.append({ "info": "retrieving outlinks", diff --git a/setup.py b/setup.py index 0778064..f4f6b10 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b8.dev141', + version='1.1b8.dev142', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',