diff --git a/.travis.yml b/.travis.yml index 552bf6b..3d745c6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,4 +14,6 @@ after_failure: - sudo cat /var/log/upstart/warcprox.log - sudo cat /var/log/upstart/brozzler-worker.log - sudo cat /var/log/upstart/pywb.log - +notifications: + slack: + secure: KPPXSscXnmSEQ2NXBZFKrzDEYHg067Kv1WR7RTRUH8EIlSS9MHTyErRa7HkaRPmqOllj4vvPbplNU2ALnCfhP4cqW+MvF0xv3GuEGXQ7Om2sBvVUQ3w0JJ5rLq9ferAfGdSnQFeViqfDix5LA3fMNZGouUHQdUHq7iO8E9n9jntvkKO9Jff7Dyo0K5KvOZOJfM9KsqFZLlFO5zoNB6Y9jubIT7+Ulk3EDto/Kny34VPIyJIm7y0cHHlYLEq780AweY0EIwMyMg/VPSRrVAsbLSrilO0YRgsQpjPC9Ci/rAWNWooaOk0eA+bwv1uHQnGtH0z446XUMXr3UZ2QlD4DE/uoP2okkl8EtqvlmEyjV8eO86TqYFDRgKfYpvlK6hHtb7SAHX28QeXQjbKNc5f7KpKO5PtZqaoBRL7acLlKyS8xQGiRtonTPFSBTFR2A+s6dZmKO9dDboglptiHk4dvL1ZD4S8qLJn1JjTJqvIU6tpCY3BpNErn4n1MkDjN5nqdXf7Q9Vmui8vRetwnMf1oXcsKj9FEt2utNfDqFNXcFsN+Mnr9rhXQ1++gt/7Zo844OowiARcxqZTNy5LqSD01WgGCvNMy3Odf+FTQ8PcDOF+001+g8La1R99U0o9/hT/gy+WYk2prYneWru4pQHF/a6goZgkLTwkskcaPVpDJtDs= diff --git a/ansible/roles/warcprox/tasks/main.yml b/ansible/roles/warcprox/tasks/main.yml index d1724d4..be53dc5 100644 --- a/ansible/roles/warcprox/tasks/main.yml +++ b/ansible/roles/warcprox/tasks/main.yml @@ -14,7 +14,7 @@ become: true file: path={{venv_root}}/warcprox-ve34 state=directory owner={{user}} - name: install warcprox in virtualenv - pip: name=git+https://github.com/internetarchive/warcprox.git@2.x#egg=warcprox + pip: name=git+https://github.com/internetarchive/warcprox.git#egg=warcprox virtualenv={{venv_root}}/warcprox-ve34 virtualenv_python=python3.4 extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache' diff --git a/brozzler/browser.py b/brozzler/browser.py index 2f977b5..a9c0ad4 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -1,6 +1,5 @@ ''' -brozzler/browser.py - classes responsible for running web browsers -(chromium/chromium) and browsing web pages in them +brozzler/browser.py - manages the browsers for brozzler Copyright (C) 2014-2016 Internet 
Archive @@ -19,23 +18,17 @@ limitations under the License. import logging import json -import urllib.request import itertools import websocket import time import threading -import subprocess -import tempfile import os import random import brozzler +from brozzler.chrome import Chrome from brozzler.behaviors import Behavior from requests.structures import CaseInsensitiveDict -import select -import re import base64 -import psutil -import signal import sqlite3 import datetime @@ -108,7 +101,9 @@ class Browser: HARD_TIMEOUT_SECONDS = 20 * 60 - def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', proxy=None, ignore_cert_errors=False): + def __init__( + self, chrome_port=9222, chrome_exe='chromium-browser', proxy=None, + ignore_cert_errors=False): self.command_id = itertools.count(1) self.chrome_port = chrome_port self.chrome_exe = chrome_exe @@ -134,33 +129,10 @@ class Browser: def start(self, proxy=None, cookie_db=None): if not self._chrome_instance: - # these can raise exceptions - self.chrome_port = self._find_available_port() - self._work_dir = tempfile.TemporaryDirectory() - data_dir = os.path.join(self._work_dir.name, "chrome-user-data") - os.makedirs(data_dir, exist_ok=True) - if cookie_db is not None: - cookie_dir = os.path.join(data_dir, "Default") - cookie_location = os.path.join(cookie_dir, "Cookies") - self.logger.debug( - "cookie DB provided, writing to %s", cookie_location) - os.makedirs(cookie_dir, exist_ok=True) - - try: - with open(cookie_location, 'wb') as cookie_file: - cookie_file.write(cookie_db) - except OSError: - self.logger.error( - "exception writing cookie file at %s", - cookie_location, exc_info=True) - self._chrome_instance = Chrome( port=self.chrome_port, executable=self.chrome_exe, - user_home_dir=self._work_dir.name, - user_data_dir=os.sep.join([ - self._work_dir.name, "chrome-user-data"]), ignore_cert_errors=self.ignore_cert_errors, - proxy=proxy or self.proxy) + proxy=proxy or self.proxy, cookie_db=cookie_db) try: 
self._websocket_url = self._chrome_instance.start() except: @@ -172,69 +144,29 @@ class Browser: if self.is_running(): self._chrome_instance.stop() self._chrome_instance = None - try: - self._work_dir.cleanup() - except: - self.logger.error("exception deleting %s", self._work_dir, - exc_info=True) - self._work_dir = None self._websocket_url = None except: self.logger.error("problem stopping", exc_info=True) - def persist_and_read_cookie_db(self): - cookie_location = os.path.join( - self._work_dir.name, "chrome-user-data", "Default", "Cookies") - self.logger.debug( - "marking cookies persistent then reading file into memory: %s", - cookie_location) - try: - with sqlite3.connect(cookie_location) as conn: - cur = conn.cursor() - cur.execute("UPDATE cookies SET persistent = 1") - except sqlite3.Error: - self.logger.error("exception updating cookie DB", exc_info=True) - - cookie_db=None - try: - with open(cookie_location, "rb") as cookie_file: - cookie_db = cookie_file.read() - except OSError: - self.logger.error( - "exception reading from cookie DB file %s", - cookie_location, exc_info=True) - return cookie_db - - def _find_available_port(self): - port_available = False - port = self.chrome_port - - try: - conns = psutil.net_connections(kind="tcp") - except psutil.AccessDenied: - return port - - for p in range(port, 65535): - if any(connection.laddr[1] == p for connection in conns): - self.logger.warn("port %s already open, will try %s", p, p+1) - else: - port = p - break - return port - def is_running(self): return bool(self._websocket_url) def abort_browse_page(self): self._abort_browse_page = True + def persist_and_read_cookie_db(self): + if self._chrome_instance: + return self._chrome_instance.persist_and_read_cookie_db() + else: + return None + def browse_page( self, url, extra_headers=None, behavior_parameters=None, user_agent=None, on_request=None, on_response=None, on_screenshot=None, on_url_change=None): """ - Synchronously loads a page, takes a screenshot, 
and runs behaviors. + Synchronously loads a page, runs behaviors, and takes a screenshot. Raises BrowsingException if browsing the page fails in a non-critical way. @@ -264,10 +196,10 @@ class Browser: self._websocket_url, on_open=self._visit_page, on_message=self._wrap_handle_message) - threadName = "WebsockThread:{}-{:%Y%m%d%H%M%S}".format( + thread_name = "WebsockThread:{}-{:%Y%m%d%H%M%S}".format( self.chrome_port, datetime.datetime.utcnow()) websock_thread = threading.Thread( - target=self._websock.run_forever, name=threadName, + target=self._websock.run_forever, name=thread_name, kwargs={'ping_timeout':0.5}) websock_thread.start() self._start = time.time() @@ -386,8 +318,11 @@ __brzl_compileOutlinks(window).join('\n'); chain = [] def set_outlinks(message): - self._outlinks = frozenset( - message["result"]["result"]["value"].split("\n")) + if message["result"]["result"]["value"]: + self._outlinks = frozenset( + message["result"]["result"]["value"].split("\n")) + else: + self._outlinks = frozenset() chain.append({ "info": "retrieving outlinks", @@ -571,171 +506,3 @@ __brzl_compileOutlinks(window).join('\n'); # else: # self.logger.debug("%s", json_message) -class Chrome: - logger = logging.getLogger(__module__ + "." + __qualname__) - - def __init__(self, port, executable, user_home_dir, user_data_dir, proxy=None, ignore_cert_errors=False): - self.port = port - self.executable = executable - self.user_home_dir = user_home_dir - self.user_data_dir = user_data_dir - self.proxy = proxy - self.ignore_cert_errors = ignore_cert_errors - self._shutdown = threading.Event() - - def __enter__(self): - ''' - Returns websocket url to chrome window with about:blank loaded. - ''' - return self.start() - - def __exit__(self, *args): - self.stop() - - def start(self): - ''' - Returns websocket url to chrome window with about:blank loaded. 
- ''' - timeout_sec = 600 - new_env = os.environ.copy() - new_env["HOME"] = self.user_home_dir - chrome_args = [ - self.executable, "--use-mock-keychain", # mac thing - "--user-data-dir={}".format(self.user_data_dir), - "--remote-debugging-port={}".format(self.port), - "--disable-web-sockets", "--disable-cache", - "--window-size=1100,900", "--no-default-browser-check", - "--disable-first-run-ui", "--no-first-run", - "--homepage=about:blank", "--disable-direct-npapi-requests", - "--disable-web-security", "--disable-notifications", - "--disable-extensions", - "--disable-save-password-bubble"] - if self.ignore_cert_errors: - chrome_args.append("--ignore-certificate-errors") - if self.proxy: - chrome_args.append("--proxy-server={}".format(self.proxy)) - chrome_args.append("about:blank") - self.logger.info("running: {}".format(" ".join(chrome_args))) - # start_new_session - new process group so we can kill the whole group - self.chrome_process = subprocess.Popen(chrome_args, env=new_env, - stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0, - start_new_session=True) - self._out_reader_thread = threading.Thread(target=self._read_stderr_stdout, - name="ChromeOutReaderThread(pid={})".format(self.chrome_process.pid)) - self._out_reader_thread.start() - self.logger.info("chrome running, pid {}".format(self.chrome_process.pid)) - self._start = time.time() # member variable just so that kill -QUIT reports it - - json_url = "http://localhost:%s/json" % self.port - - while True: - try: - raw_json = urllib.request.urlopen(json_url, timeout=30).read() - all_debug_info = json.loads(raw_json.decode('utf-8')) - debug_info = [x for x in all_debug_info if x['url'] == 'about:blank'] - - if debug_info and 'webSocketDebuggerUrl' in debug_info[0]: - self.logger.debug("{} returned {}".format(json_url, raw_json)) - url = debug_info[0]['webSocketDebuggerUrl'] - self.logger.info('got chrome window websocket debug url {} from {}'.format(url, json_url)) - return url - except BaseException 
as e: - if int(time.time() - self._start) % 10 == 5: - self.logger.warn("problem with %s (will keep trying until timeout of %d seconds): %s", json_url, timeout_sec, e) - pass - finally: - if time.time() - self._start > timeout_sec: - self.logger.error("killing chrome, failed to retrieve %s after %s seconds", json_url, time.time() - self._start) - self.stop() - raise Exception("killed chrome, failed to retrieve {} after {} seconds".format(json_url, time.time() - self._start)) - else: - time.sleep(0.5) - - def _read_stderr_stdout(self): - # XXX select doesn't work on windows - def readline_nonblock(f): - buf = b"" - while not self._shutdown.is_set() and ( - len(buf) == 0 or buf[-1] != 0xa) and select.select( - [f],[],[],0.5)[0]: - buf += f.read(1) - return buf - - try: - while not self._shutdown.is_set(): - buf = readline_nonblock(self.chrome_process.stdout) - if buf: - if re.search( - b"Xlib: extension|" - b"CERT_PKIXVerifyCert for [^ ]* failed|" - b"^ALSA lib|ERROR:gl_surface_glx.cc|" - b"ERROR:gpu_child_thread.cc", buf): - logging.log( - brozzler.TRACE, "chrome pid %s STDOUT %s", - self.chrome_process.pid, buf) - else: - logging.debug( - "chrome pid %s STDOUT %s", - self.chrome_process.pid, buf) - - buf = readline_nonblock(self.chrome_process.stderr) - if buf: - if re.search( - b"Xlib: extension|" - b"CERT_PKIXVerifyCert for [^ ]* failed|" - b"^ALSA lib|ERROR:gl_surface_glx.cc|" - b"ERROR:gpu_child_thread.cc", buf): - logging.log( - brozzler.TRACE, "chrome pid %s STDOUT %s", - self.chrome_process.pid, buf) - else: - logging.debug( - "chrome pid %s STDERR %s", - self.chrome_process.pid, buf) - except: - logging.error("unexpected exception", exc_info=True) - - def stop(self): - if not self.chrome_process or self._shutdown.is_set(): - return - - timeout_sec = 300 - self._shutdown.set() - self.logger.info("terminating chrome pgid %s" % self.chrome_process.pid) - - os.killpg(self.chrome_process.pid, signal.SIGTERM) - first_sigterm = time.time() - - try: - while 
time.time() - first_sigterm < timeout_sec: - time.sleep(0.5) - - status = self.chrome_process.poll() - if status is not None: - if status == 0: - self.logger.info( - "chrome pid %s exited normally", - self.chrome_process.pid) - else: - self.logger.warn( - "chrome pid %s exited with nonzero status %s", - self.chrome_process.pid, status) - - # XXX I would like to forcefully kill the process group - # here to guarantee no orphaned chromium subprocesses hang - # around, but there's a chance I suppose that some other - # process could have started with the same pgid - return - - self.logger.warn( - "chrome pid %s still alive %.1f seconds after sending " - "SIGTERM, sending SIGKILL", self.chrome_process.pid, - time.time() - first_sigterm) - os.killpg(self.chrome_process.pid, signal.SIGKILL) - status = self.chrome_process.wait() - self.logger.warn( - "chrome pid %s reaped (status=%s) after killing with " - "SIGKILL", self.chrome_process.pid, status) - finally: - self._out_reader_thread.join() - self.chrome_process = None diff --git a/brozzler/chrome.py b/brozzler/chrome.py new file mode 100644 index 0000000..5208442 --- /dev/null +++ b/brozzler/chrome.py @@ -0,0 +1,286 @@ +''' +brozzler/chrome.py - manages the chrome/chromium browser for brozzler + +Copyright (C) 2014-2016 Internet Archive + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+''' + +import logging +import urllib.request +import time +import threading +import subprocess +import os +import brozzler +import select +import re +import signal +import sqlite3 +import json +import psutil +import tempfile + +class Chrome: + logger = logging.getLogger(__module__ + '.' + __qualname__) + + def __init__( + self, port, executable, proxy=None, ignore_cert_errors=False, + cookie_db=None): + self.port = port + self.executable = executable + self.proxy = proxy + self.ignore_cert_errors = ignore_cert_errors + self.cookie_db = cookie_db + self._shutdown = threading.Event() + + def __enter__(self): + ''' + Returns websocket url to chrome window with about:blank loaded. + ''' + return self.start() + + def __exit__(self, *args): + self.stop() + + def _find_available_port(self, default_port=9200): + try: + conns = psutil.net_connections(kind='tcp') + except psutil.AccessDenied: + return default_port + + if not any(conn.laddr[1] == default_port for conn in conns): + return default_port + + for p in range(9999,8999,-1): + if not any(conn.laddr[1] == p for conn in conns): + self.logger.warn( + 'port %s already in use, using %s instead', + default_port, p) + return p + + return default_port + + def _init_cookie_db(self): + if self.cookie_db is not None: + cookie_dir = os.path.join(self._chrome_user_data_dir, 'Default') + cookie_location = os.path.join(cookie_dir, 'Cookies') + self.logger.debug( + 'cookie DB provided, writing to %s', cookie_location) + os.makedirs(cookie_dir, exist_ok=True) + + try: + with open(cookie_location, 'wb') as cookie_file: + cookie_file.write(self.cookie_db) + except OSError: + self.logger.error( + 'exception writing cookie file at %s', + cookie_location, exc_info=True) + + def persist_and_read_cookie_db(self): + cookie_location = os.path.join( + self._chrome_user_data_dir, 'Default', 'Cookies') + self.logger.debug( + 'marking cookies persistent then reading file into memory: %s', + cookie_location) + try: + with 
sqlite3.connect(cookie_location) as conn: + cur = conn.cursor() + cur.execute('UPDATE cookies SET persistent = 1') + except sqlite3.Error: + self.logger.error('exception updating cookie DB', exc_info=True) + + cookie_db = None + try: + with open(cookie_location, 'rb') as cookie_file: + cookie_db = cookie_file.read() + except OSError: + self.logger.error( + 'exception reading from cookie DB file %s', + cookie_location, exc_info=True) + return cookie_db + + def start(self): + ''' + Returns websocket url to chrome window with about:blank loaded. + ''' + # these can raise exceptions + self._home_tmpdir = tempfile.TemporaryDirectory() + self._chrome_user_data_dir = os.path.join( + self._home_tmpdir.name, 'chrome-user-data') + self._init_cookie_db() + + new_env = os.environ.copy() + new_env['HOME'] = self._home_tmpdir.name + self.port = self._find_available_port(self.port) + chrome_args = [ + self.executable, '--use-mock-keychain', # mac thing + '--user-data-dir=%s' % self._chrome_user_data_dir, + '--remote-debugging-port=%s' % self.port, + '--disable-web-sockets', '--disable-cache', + '--window-size=1100,900', '--no-default-browser-check', + '--disable-first-run-ui', '--no-first-run', + '--homepage=about:blank', '--disable-direct-npapi-requests', + '--disable-web-security', '--disable-notifications', + '--disable-extensions', '--disable-save-password-bubble'] + if self.ignore_cert_errors: + chrome_args.append('--ignore-certificate-errors') + if self.proxy: + chrome_args.append('--proxy-server=%s' % self.proxy) + chrome_args.append('about:blank') + self.logger.info( + 'running: %s', repr(subprocess.list2cmdline(chrome_args))) + # start_new_session - new process group so we can kill the whole group + self.chrome_process = subprocess.Popen( + chrome_args, env=new_env, start_new_session=True, + stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0) + self._out_reader_thread = threading.Thread( + target=self._read_stderr_stdout, + name='ChromeOutReaderThread(pid=%s)' % 
self.chrome_process.pid) + self._out_reader_thread.start() + self.logger.info('chrome running, pid %s' % self.chrome_process.pid) + + return self._websocket_url() + + def _websocket_url(self): + timeout_sec = 600 + json_url = 'http://localhost:%s/json' % self.port + # make this a member variable so that kill -QUIT reports it + self._start = time.time() + while True: + try: + raw_json = urllib.request.urlopen(json_url, timeout=30).read() + all_debug_info = json.loads(raw_json.decode('utf-8')) + debug_info = [x for x in all_debug_info + if x['url'] == 'about:blank'] + + if debug_info and 'webSocketDebuggerUrl' in debug_info[0]: + self.logger.debug('%s returned %s', json_url, raw_json) + url = debug_info[0]['webSocketDebuggerUrl'] + self.logger.info( + 'got chrome window websocket debug url %s from %s', + url, json_url) + return url + except BaseException as e: + if int(time.time() - self._start) % 10 == 5: + self.logger.warn( + 'problem with %s (will keep trying until timeout ' + 'of %d seconds): %s', json_url, timeout_sec, e) + pass + finally: + if time.time() - self._start > timeout_sec: + self.logger.error( + 'killing chrome, failed to retrieve %s after %s ' + 'seconds', json_url, time.time() - self._start) + self.stop() + raise Exception( + 'killed chrome, failed to retrieve %s after %s ' + 'seconds' % (json_url, time.time() - self._start)) + else: + time.sleep(0.5) + + def _read_stderr_stdout(self): + # XXX select doesn't work on windows + def readline_nonblock(f): + buf = b'' + while not self._shutdown.is_set() and ( + len(buf) == 0 or buf[-1] != 0xa) and select.select( + [f],[],[],0.5)[0]: + buf += f.read(1) + return buf + + try: + while not self._shutdown.is_set(): + buf = readline_nonblock(self.chrome_process.stdout) + if buf: + if re.search( + b'Xlib: extension|' + b'CERT_PKIXVerifyCert for [^ ]* failed|' + b'^ALSA lib|ERROR:gl_surface_glx.cc|' + b'ERROR:gpu_child_thread.cc', buf): + logging.log( + brozzler.TRACE, 'chrome pid %s STDOUT %s', + 
self.chrome_process.pid, buf) + else: + logging.debug( + 'chrome pid %s STDOUT %s', + self.chrome_process.pid, buf) + + buf = readline_nonblock(self.chrome_process.stderr) + if buf: + if re.search( + b'Xlib: extension|' + b'CERT_PKIXVerifyCert for [^ ]* failed|' + b'^ALSA lib|ERROR:gl_surface_glx.cc|' + b'ERROR:gpu_child_thread.cc', buf): + logging.log( + brozzler.TRACE, 'chrome pid %s STDOUT %s', + self.chrome_process.pid, buf) + else: + logging.debug( + 'chrome pid %s STDERR %s', + self.chrome_process.pid, buf) + except: + logging.error('unexpected exception', exc_info=True) + + def stop(self): + if not self.chrome_process or self._shutdown.is_set(): + return + + timeout_sec = 300 + self._shutdown.set() + self.logger.info('terminating chrome pgid %s' % self.chrome_process.pid) + + os.killpg(self.chrome_process.pid, signal.SIGTERM) + first_sigterm = time.time() + + try: + while time.time() - first_sigterm < timeout_sec: + time.sleep(0.5) + + status = self.chrome_process.poll() + if status is not None: + if status == 0: + self.logger.info( + 'chrome pid %s exited normally', + self.chrome_process.pid) + else: + self.logger.warn( + 'chrome pid %s exited with nonzero status %s', + self.chrome_process.pid, status) + + # XXX I would like to forcefully kill the process group + # here to guarantee no orphaned chromium subprocesses hang + # around, but there's a chance I suppose that some other + # process could have started with the same pgid + return + + self.logger.warn( + 'chrome pid %s still alive %.1f seconds after sending ' + 'SIGTERM, sending SIGKILL', self.chrome_process.pid, + time.time() - first_sigterm) + os.killpg(self.chrome_process.pid, signal.SIGKILL) + status = self.chrome_process.wait() + self.logger.warn( + 'chrome pid %s reaped (status=%s) after killing with ' + 'SIGKILL', self.chrome_process.pid, status) + + try: + self._home_tmpdir.cleanup() + except: + self.logger.error( + 'exception deleting %s', self._home_tmpdir, + exc_info=True) + finally: + 
self._out_reader_thread.join() + self.chrome_process = None diff --git a/brozzler/cli.py b/brozzler/cli.py index a57396f..b022771 100644 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -350,3 +350,69 @@ def brozzler_ensure_tables(): # sites, pages, jobs tables brozzler.frontier.RethinkDbFrontier(r) + +def brozzler_list_captures(): + ''' + Handy utility for looking up entries in the rethinkdb "captures" table by + url or sha1. + ''' + import surt + import rethinkdb + + arg_parser = argparse.ArgumentParser( + prog=os.path.basename(sys.argv[0]), + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + _add_rethinkdb_options(arg_parser) + _add_common_options(arg_parser) + arg_parser.add_argument( + 'url_or_sha1', metavar='URL_or_SHA1', + help='url or sha1 to look up in captures table') + + args = arg_parser.parse_args(args=sys.argv[1:]) + _configure_logging(args) + + r = rethinkstuff.Rethinker( + args.rethinkdb_servers.split(','), args.rethinkdb_db) + + class Jsonner(json.JSONEncoder): + def default(self, o): + if isinstance(o, datetime.datetime): + return o.isoformat() + return json.JSONEncoder.default(self, o) + + if args.url_or_sha1[:5] == 'sha1:': + raise Exception('not implemented') + # def find_response_by_digest(self, algo, raw_digest, bucket="__unspecified__"): + # if algo != "sha1": + # raise Exception( + # "digest type is %s but big captures table is indexed by " + # "sha1" % algo) + # sha1base32 = base64.b32encode(raw_digest).decode("utf-8") + # results_iter = self.r.table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type").run() + # results = list(results_iter) + # if len(results) > 0: + # if len(results) > 1: + # self.logger.debug("expected 0 or 1 but found %s results for sha1base32=%s bucket=%s (will use first result)", len(results), sha1base32, bucket) + # result = results[0] + # else: + # result = None + # self.logger.debug("returning %s for sha1base32=%s bucket=%s", + # result, sha1base32, bucket) + # return result + else: + 
key = surt.surt( + args.url_or_sha1, trailing_comma=True, host_massage=False, + with_scheme=True) + reql = r.table('captures').between( + [key[:150], rethinkdb.minval], + [key[:150]+'!', rethinkdb.maxval], + index='abbr_canon_surt_timestamp') + reql = reql.order_by(index='abbr_canon_surt_timestamp') + reql = reql.filter( + lambda capture: (capture['canon_surt'] >= key) + & (capture['canon_surt'] <= key)) + logging.debug('rethinkdb query: %s', reql) + results = reql.run() + for result in results: + print(json.dumps(result, cls=Jsonner, indent=2)) + diff --git a/brozzler/dashboard/static/brozzler-icon.svg b/brozzler/dashboard/static/brozzler-icon.svg new file mode 100644 index 0000000..c8fecdc --- /dev/null +++ b/brozzler/dashboard/static/brozzler-icon.svg @@ -0,0 +1,262 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/brozzler/pywb.py b/brozzler/pywb.py index 9b0a0f7..7b0b95a 100644 --- a/brozzler/pywb.py +++ b/brozzler/pywb.py @@ -70,7 +70,7 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource): 'filename': record['filename'], } if record['warc_type'] != 'revisit': - blob['mime'] = record['content_type'] + blob['mime'] = record['content_type'] or '-' else: blob['mime'] = 'warc/revisit' # b'org,archive)/ 20160427215530 {"url": "https://archive.org/", "mime": "text/html", "status": "200", "digest": "VILUFXZD232SLUA6XROZQIMEVUPW6EIE", "length": "16001", "offset": "90144", "filename": "ARCHIVEIT-261-ONE_TIME-JOB209607-20160427215508135-00000.warc.gz"}' diff --git a/brozzler/worker.py b/brozzler/worker.py index 21aa22e..436c08b 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -78,7 +78,7 @@ class YoutubeDLSpy(urllib.request.BaseHandler): final_url = url while final_url in redirects: - final_url = redirects[final_url].response.headers['location'] + final_url = redirects.pop(final_url).response.headers['location'] final_bounces = [] for txn in self.transactions: diff 
--git a/setup.py b/setup.py index a613be4..f4f6b10 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b8.dev129', + version='1.1b8.dev142', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', @@ -51,6 +51,7 @@ setuptools.setup( 'brozzler-new-site=brozzler.cli:brozzler_new_site', 'brozzler-worker=brozzler.cli:brozzler_worker', 'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables', + 'brozzler-list-captures=brozzler.cli:brozzler_list_captures', 'brozzler-dashboard=brozzler.dashboard:main', 'brozzler-easy=brozzler.easy:main', 'brozzler-wayback=brozzler.pywb:main', @@ -61,7 +62,7 @@ setuptools.setup( 'youtube-dl', 'reppy==0.3.4', 'requests', - 'websocket-client', + 'websocket-client!=0.39.0', 'pillow==3.3.0', 'surt>=0.3.0', 'rethinkstuff>=0.1.5', diff --git a/tests/test_units.py b/tests/test_units.py index 2fee049..58b929f 100644 --- a/tests/test_units.py +++ b/tests/test_units.py @@ -22,6 +22,10 @@ import http.server import threading import os import brozzler +import brozzler.chrome +import socket +import logging +import psutil @pytest.fixture(scope='module') def httpd(request): @@ -52,3 +56,20 @@ def test_robots(httpd): site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh') assert not brozzler.is_permitted_by_robots(site, url) +def test_find_available_port(): + x = brozzler.chrome.Chrome(None, None) + try: + psutil.net_connections(kind='tcp') + except psutil.AccessDenied: + logging.warn( + 'skipping _find_available_port() test because ' + 'psutil.net_connections(kind="tcp") raised AccessDenied') + return + assert x._find_available_port(9800) == 9800 + sock = socket.socket() + sock.bind(('localhost', 9800)) + sock.listen(0) + assert x._find_available_port(9800) == 9999 + sock.close() + assert x._find_available_port(9800) == 9800 +