Merge branch 'master' into qa

* master:
  fix bug handling page with zero outlinks
  avoid infinite loop in case youtube-dl encounters redirect loop (which can be ok if cookies have been set or something)
  brozzler logo svg with small default size
  travis-ci slack integration
  fix _find_available_port and its unit test
  little fixes
  avoid broken version of websocket-client to fix https://github.com/internetarchive/brozzler/issues/28
  wrong branch of warcprox in ansible install
  move cookie db management code into chrome.py
  move _find_available_ports to chrome.py, changing the way it works so that browser:9200 doesn't get stuck at 9201 forever, which pushes 9201 to 9202 etc, and add a unit test
  split Chrome class into its own module
  new utility brozzler-list-captures for looking up entries in the "captures" table
This commit is contained in:
Noah Levitt 2016-12-15 12:07:29 -08:00
commit 4186869bf9
10 changed files with 664 additions and 259 deletions

View File

@ -14,4 +14,6 @@ after_failure:
- sudo cat /var/log/upstart/warcprox.log
- sudo cat /var/log/upstart/brozzler-worker.log
- sudo cat /var/log/upstart/pywb.log
notifications:
slack:
secure: KPPXSscXnmSEQ2NXBZFKrzDEYHg067Kv1WR7RTRUH8EIlSS9MHTyErRa7HkaRPmqOllj4vvPbplNU2ALnCfhP4cqW+MvF0xv3GuEGXQ7Om2sBvVUQ3w0JJ5rLq9ferAfGdSnQFeViqfDix5LA3fMNZGouUHQdUHq7iO8E9n9jntvkKO9Jff7Dyo0K5KvOZOJfM9KsqFZLlFO5zoNB6Y9jubIT7+Ulk3EDto/Kny34VPIyJIm7y0cHHlYLEq780AweY0EIwMyMg/VPSRrVAsbLSrilO0YRgsQpjPC9Ci/rAWNWooaOk0eA+bwv1uHQnGtH0z446XUMXr3UZ2QlD4DE/uoP2okkl8EtqvlmEyjV8eO86TqYFDRgKfYpvlK6hHtb7SAHX28QeXQjbKNc5f7KpKO5PtZqaoBRL7acLlKyS8xQGiRtonTPFSBTFR2A+s6dZmKO9dDboglptiHk4dvL1ZD4S8qLJn1JjTJqvIU6tpCY3BpNErn4n1MkDjN5nqdXf7Q9Vmui8vRetwnMf1oXcsKj9FEt2utNfDqFNXcFsN+Mnr9rhXQ1++gt/7Zo844OowiARcxqZTNy5LqSD01WgGCvNMy3Odf+FTQ8PcDOF+001+g8La1R99U0o9/hT/gy+WYk2prYneWru4pQHF/a6goZgkLTwkskcaPVpDJtDs=

View File

@ -14,7 +14,7 @@
become: true
file: path={{venv_root}}/warcprox-ve34 state=directory owner={{user}}
- name: install warcprox in virtualenv
pip: name=git+https://github.com/internetarchive/warcprox.git@2.x#egg=warcprox
pip: name=git+https://github.com/internetarchive/warcprox.git#egg=warcprox
virtualenv={{venv_root}}/warcprox-ve34
virtualenv_python=python3.4
extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'

View File

@ -1,6 +1,5 @@
'''
brozzler/browser.py - classes responsible for running web browsers
(chromium/chromium) and browsing web pages in them
brozzler/browser.py - manages the browsers for brozzler
Copyright (C) 2014-2016 Internet Archive
@ -19,23 +18,17 @@ limitations under the License.
import logging
import json
import urllib.request
import itertools
import websocket
import time
import threading
import subprocess
import tempfile
import os
import random
import brozzler
from brozzler.chrome import Chrome
from brozzler.behaviors import Behavior
from requests.structures import CaseInsensitiveDict
import select
import re
import base64
import psutil
import signal
import sqlite3
import datetime
@ -108,7 +101,9 @@ class Browser:
HARD_TIMEOUT_SECONDS = 20 * 60
def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', proxy=None, ignore_cert_errors=False):
def __init__(
self, chrome_port=9222, chrome_exe='chromium-browser', proxy=None,
ignore_cert_errors=False):
self.command_id = itertools.count(1)
self.chrome_port = chrome_port
self.chrome_exe = chrome_exe
@ -134,33 +129,10 @@ class Browser:
def start(self, proxy=None, cookie_db=None):
if not self._chrome_instance:
# these can raise exceptions
self.chrome_port = self._find_available_port()
self._work_dir = tempfile.TemporaryDirectory()
data_dir = os.path.join(self._work_dir.name, "chrome-user-data")
os.makedirs(data_dir, exist_ok=True)
if cookie_db is not None:
cookie_dir = os.path.join(data_dir, "Default")
cookie_location = os.path.join(cookie_dir, "Cookies")
self.logger.debug(
"cookie DB provided, writing to %s", cookie_location)
os.makedirs(cookie_dir, exist_ok=True)
try:
with open(cookie_location, 'wb') as cookie_file:
cookie_file.write(cookie_db)
except OSError:
self.logger.error(
"exception writing cookie file at %s",
cookie_location, exc_info=True)
self._chrome_instance = Chrome(
port=self.chrome_port, executable=self.chrome_exe,
user_home_dir=self._work_dir.name,
user_data_dir=os.sep.join([
self._work_dir.name, "chrome-user-data"]),
ignore_cert_errors=self.ignore_cert_errors,
proxy=proxy or self.proxy)
proxy=proxy or self.proxy, cookie_db=None)
try:
self._websocket_url = self._chrome_instance.start()
except:
@ -172,69 +144,29 @@ class Browser:
if self.is_running():
self._chrome_instance.stop()
self._chrome_instance = None
try:
self._work_dir.cleanup()
except:
self.logger.error("exception deleting %s", self._work_dir,
exc_info=True)
self._work_dir = None
self._websocket_url = None
except:
self.logger.error("problem stopping", exc_info=True)
def persist_and_read_cookie_db(self):
cookie_location = os.path.join(
self._work_dir.name, "chrome-user-data", "Default", "Cookies")
self.logger.debug(
"marking cookies persistent then reading file into memory: %s",
cookie_location)
try:
with sqlite3.connect(cookie_location) as conn:
cur = conn.cursor()
cur.execute("UPDATE cookies SET persistent = 1")
except sqlite3.Error:
self.logger.error("exception updating cookie DB", exc_info=True)
cookie_db=None
try:
with open(cookie_location, "rb") as cookie_file:
cookie_db = cookie_file.read()
except OSError:
self.logger.error(
"exception reading from cookie DB file %s",
cookie_location, exc_info=True)
return cookie_db
def _find_available_port(self):
port_available = False
port = self.chrome_port
try:
conns = psutil.net_connections(kind="tcp")
except psutil.AccessDenied:
return port
for p in range(port, 65535):
if any(connection.laddr[1] == p for connection in conns):
self.logger.warn("port %s already open, will try %s", p, p+1)
else:
port = p
break
return port
def is_running(self):
return bool(self._websocket_url)
def abort_browse_page(self):
self._abort_browse_page = True
def persist_and_read_cookie_db(self):
if self._chrome_instance:
return self._chrome_instance.persist_and_read_cookie_db()
else:
return None
def browse_page(
self, url, extra_headers=None, behavior_parameters=None,
user_agent=None,
on_request=None, on_response=None, on_screenshot=None,
on_url_change=None):
"""
Synchronously loads a page, takes a screenshot, and runs behaviors.
Synchronously loads a page, runs behaviors, and takes a screenshot.
Raises BrowsingException if browsing the page fails in a non-critical
way.
@ -264,10 +196,10 @@ class Browser:
self._websocket_url, on_open=self._visit_page,
on_message=self._wrap_handle_message)
threadName = "WebsockThread:{}-{:%Y%m%d%H%M%S}".format(
thread_name = "WebsockThread:{}-{:%Y%m%d%H%M%S}".format(
self.chrome_port, datetime.datetime.utcnow())
websock_thread = threading.Thread(
target=self._websock.run_forever, name=threadName,
target=self._websock.run_forever, name=thread_name,
kwargs={'ping_timeout':0.5})
websock_thread.start()
self._start = time.time()
@ -386,8 +318,11 @@ __brzl_compileOutlinks(window).join('\n');
chain = []
def set_outlinks(message):
self._outlinks = frozenset(
message["result"]["result"]["value"].split("\n"))
if message["result"]["result"]["value"]:
self._outlinks = frozenset(
message["result"]["result"]["value"].split("\n"))
else:
self._outlinks = frozenset()
chain.append({
"info": "retrieving outlinks",
@ -571,171 +506,3 @@ __brzl_compileOutlinks(window).join('\n');
# else:
# self.logger.debug("%s", json_message)
class Chrome:
logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self, port, executable, user_home_dir, user_data_dir, proxy=None, ignore_cert_errors=False):
self.port = port
self.executable = executable
self.user_home_dir = user_home_dir
self.user_data_dir = user_data_dir
self.proxy = proxy
self.ignore_cert_errors = ignore_cert_errors
self._shutdown = threading.Event()
def __enter__(self):
'''
Returns websocket url to chrome window with about:blank loaded.
'''
return self.start()
def __exit__(self, *args):
self.stop()
def start(self):
'''
Returns websocket url to chrome window with about:blank loaded.
'''
timeout_sec = 600
new_env = os.environ.copy()
new_env["HOME"] = self.user_home_dir
chrome_args = [
self.executable, "--use-mock-keychain", # mac thing
"--user-data-dir={}".format(self.user_data_dir),
"--remote-debugging-port={}".format(self.port),
"--disable-web-sockets", "--disable-cache",
"--window-size=1100,900", "--no-default-browser-check",
"--disable-first-run-ui", "--no-first-run",
"--homepage=about:blank", "--disable-direct-npapi-requests",
"--disable-web-security", "--disable-notifications",
"--disable-extensions",
"--disable-save-password-bubble"]
if self.ignore_cert_errors:
chrome_args.append("--ignore-certificate-errors")
if self.proxy:
chrome_args.append("--proxy-server={}".format(self.proxy))
chrome_args.append("about:blank")
self.logger.info("running: {}".format(" ".join(chrome_args)))
# start_new_session - new process group so we can kill the whole group
self.chrome_process = subprocess.Popen(chrome_args, env=new_env,
stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0,
start_new_session=True)
self._out_reader_thread = threading.Thread(target=self._read_stderr_stdout,
name="ChromeOutReaderThread(pid={})".format(self.chrome_process.pid))
self._out_reader_thread.start()
self.logger.info("chrome running, pid {}".format(self.chrome_process.pid))
self._start = time.time() # member variable just so that kill -QUIT reports it
json_url = "http://localhost:%s/json" % self.port
while True:
try:
raw_json = urllib.request.urlopen(json_url, timeout=30).read()
all_debug_info = json.loads(raw_json.decode('utf-8'))
debug_info = [x for x in all_debug_info if x['url'] == 'about:blank']
if debug_info and 'webSocketDebuggerUrl' in debug_info[0]:
self.logger.debug("{} returned {}".format(json_url, raw_json))
url = debug_info[0]['webSocketDebuggerUrl']
self.logger.info('got chrome window websocket debug url {} from {}'.format(url, json_url))
return url
except BaseException as e:
if int(time.time() - self._start) % 10 == 5:
self.logger.warn("problem with %s (will keep trying until timeout of %d seconds): %s", json_url, timeout_sec, e)
pass
finally:
if time.time() - self._start > timeout_sec:
self.logger.error("killing chrome, failed to retrieve %s after %s seconds", json_url, time.time() - self._start)
self.stop()
raise Exception("killed chrome, failed to retrieve {} after {} seconds".format(json_url, time.time() - self._start))
else:
time.sleep(0.5)
def _read_stderr_stdout(self):
# XXX select doesn't work on windows
def readline_nonblock(f):
buf = b""
while not self._shutdown.is_set() and (
len(buf) == 0 or buf[-1] != 0xa) and select.select(
[f],[],[],0.5)[0]:
buf += f.read(1)
return buf
try:
while not self._shutdown.is_set():
buf = readline_nonblock(self.chrome_process.stdout)
if buf:
if re.search(
b"Xlib: extension|"
b"CERT_PKIXVerifyCert for [^ ]* failed|"
b"^ALSA lib|ERROR:gl_surface_glx.cc|"
b"ERROR:gpu_child_thread.cc", buf):
logging.log(
brozzler.TRACE, "chrome pid %s STDOUT %s",
self.chrome_process.pid, buf)
else:
logging.debug(
"chrome pid %s STDOUT %s",
self.chrome_process.pid, buf)
buf = readline_nonblock(self.chrome_process.stderr)
if buf:
if re.search(
b"Xlib: extension|"
b"CERT_PKIXVerifyCert for [^ ]* failed|"
b"^ALSA lib|ERROR:gl_surface_glx.cc|"
b"ERROR:gpu_child_thread.cc", buf):
logging.log(
brozzler.TRACE, "chrome pid %s STDOUT %s",
self.chrome_process.pid, buf)
else:
logging.debug(
"chrome pid %s STDERR %s",
self.chrome_process.pid, buf)
except:
logging.error("unexpected exception", exc_info=True)
def stop(self):
if not self.chrome_process or self._shutdown.is_set():
return
timeout_sec = 300
self._shutdown.set()
self.logger.info("terminating chrome pgid %s" % self.chrome_process.pid)
os.killpg(self.chrome_process.pid, signal.SIGTERM)
first_sigterm = time.time()
try:
while time.time() - first_sigterm < timeout_sec:
time.sleep(0.5)
status = self.chrome_process.poll()
if status is not None:
if status == 0:
self.logger.info(
"chrome pid %s exited normally",
self.chrome_process.pid)
else:
self.logger.warn(
"chrome pid %s exited with nonzero status %s",
self.chrome_process.pid, status)
# XXX I would like to forcefully kill the process group
# here to guarantee no orphaned chromium subprocesses hang
# around, but there's a chance I suppose that some other
# process could have started with the same pgid
return
self.logger.warn(
"chrome pid %s still alive %.1f seconds after sending "
"SIGTERM, sending SIGKILL", self.chrome_process.pid,
time.time() - first_sigterm)
os.killpg(self.chrome_process.pid, signal.SIGKILL)
status = self.chrome_process.wait()
self.logger.warn(
"chrome pid %s reaped (status=%s) after killing with "
"SIGKILL", self.chrome_process.pid, status)
finally:
self._out_reader_thread.join()
self.chrome_process = None

286
brozzler/chrome.py Normal file
View File

@ -0,0 +1,286 @@
'''
brozzler/chrome.py - manages the chrome/chromium browser for brozzler
Copyright (C) 2014-2016 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
import logging
import urllib.request
import time
import threading
import subprocess
import os
import brozzler
import select
import re
import signal
import sqlite3
import json
import psutil
import tempfile
class Chrome:
    '''
    Manages a chrome/chromium subprocess for brozzler: starts it with a
    remote debugging port and a throwaway profile/home directory, discovers
    the websocket debugger url, optionally seeds and persists the cookie
    database, and shuts the process group down cleanly.
    '''
    logger = logging.getLogger(__module__ + '.' + __qualname__)

    def __init__(
            self, port, executable, proxy=None, ignore_cert_errors=False,
            cookie_db=None):
        '''
        Args:
            port: preferred remote debugging port; may be adjusted by
                _find_available_port() at start() time if already in use
            executable: name or path of the chrome/chromium executable
            proxy: http proxy "host:port" for chrome to use (optional)
            ignore_cert_errors: tell chrome to ignore certificate errors
            cookie_db: raw bytes of a chrome "Cookies" sqlite database to
                seed the browser profile with (optional)
        '''
        self.port = port
        self.executable = executable
        self.proxy = proxy
        self.ignore_cert_errors = ignore_cert_errors
        self.cookie_db = cookie_db
        self._shutdown = threading.Event()

    def __enter__(self):
        '''
        Returns websocket url to chrome window with about:blank loaded.
        '''
        return self.start()

    def __exit__(self, *args):
        self.stop()

    def _find_available_port(self, default_port=9200):
        '''
        Returns default_port if it is not already bound, otherwise the
        first free port scanning downward from 9999 to 9000. Falls back to
        default_port if psutil cannot inspect connections or nothing in
        the scan range is free.
        '''
        try:
            conns = psutil.net_connections(kind='tcp')
        except psutil.AccessDenied:
            # can't inspect sockets (e.g. not root on some platforms);
            # just hope default_port works
            return default_port
        if not any(conn.laddr[1] == default_port for conn in conns):
            return default_port
        for p in range(9999, 8999, -1):
            if not any(conn.laddr[1] == p for conn in conns):
                self.logger.warn(
                        'port %s already in use, using %s instead',
                        default_port, p)
                return p
        return default_port

    def _init_cookie_db(self):
        # seed the chrome profile with a preexisting cookie database so the
        # browser starts out with the cookies from a previous session
        if self.cookie_db is not None:
            cookie_dir = os.path.join(self._chrome_user_data_dir, 'Default')
            cookie_location = os.path.join(cookie_dir, 'Cookies')
            self.logger.debug(
                    'cookie DB provided, writing to %s', cookie_location)
            os.makedirs(cookie_dir, exist_ok=True)
            try:
                with open(cookie_location, 'wb') as cookie_file:
                    cookie_file.write(self.cookie_db)
            except OSError:
                # best-effort: log and carry on without seeded cookies
                self.logger.error(
                        'exception writing cookie file at %s',
                        cookie_location, exc_info=True)

    def persist_and_read_cookie_db(self):
        '''
        Marks all cookies in the profile's sqlite database persistent, then
        returns the raw bytes of the database file, or None if it could not
        be read.
        '''
        cookie_location = os.path.join(
                self._chrome_user_data_dir, 'Default', 'Cookies')
        self.logger.debug(
                'marking cookies persistent then reading file into memory: %s',
                cookie_location)
        try:
            conn = sqlite3.connect(cookie_location)
            try:
                # "with conn" commits/rolls back but does NOT close the
                # connection, so close explicitly to release the file handle
                with conn:
                    conn.execute('UPDATE cookies SET persistent = 1')
            finally:
                conn.close()
        except sqlite3.Error:
            self.logger.error('exception updating cookie DB', exc_info=True)
        cookie_db = None
        try:
            with open(cookie_location, 'rb') as cookie_file:
                cookie_db = cookie_file.read()
        except OSError:
            self.logger.error(
                    'exception reading from cookie DB file %s',
                    cookie_location, exc_info=True)
        return cookie_db

    def start(self):
        '''
        Starts chrome with a fresh temporary home/profile directory and
        blocks until the remote debugging interface is up.

        Returns websocket url to chrome window with about:blank loaded.
        '''
        # these can raise exceptions
        self._home_tmpdir = tempfile.TemporaryDirectory()
        self._chrome_user_data_dir = os.path.join(
                self._home_tmpdir.name, 'chrome-user-data')
        self._init_cookie_db()

        new_env = os.environ.copy()
        new_env['HOME'] = self._home_tmpdir.name
        self.port = self._find_available_port(self.port)
        chrome_args = [
                self.executable, '--use-mock-keychain', # mac thing
                '--user-data-dir=%s' % self._chrome_user_data_dir,
                '--remote-debugging-port=%s' % self.port,
                '--disable-web-sockets', '--disable-cache',
                '--window-size=1100,900', '--no-default-browser-check',
                '--disable-first-run-ui', '--no-first-run',
                '--homepage=about:blank', '--disable-direct-npapi-requests',
                '--disable-web-security', '--disable-notifications',
                '--disable-extensions', '--disable-save-password-bubble']
        if self.ignore_cert_errors:
            chrome_args.append('--ignore-certificate-errors')
        if self.proxy:
            chrome_args.append('--proxy-server=%s' % self.proxy)
        chrome_args.append('about:blank')
        self.logger.info(
                'running: %s', repr(subprocess.list2cmdline(chrome_args)))
        # start_new_session - new process group so we can kill the whole group
        self.chrome_process = subprocess.Popen(
                chrome_args, env=new_env, start_new_session=True,
                stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0)
        self._out_reader_thread = threading.Thread(
                target=self._read_stderr_stdout,
                name='ChromeOutReaderThread(pid=%s)' % self.chrome_process.pid)
        self._out_reader_thread.start()
        self.logger.info('chrome running, pid %s' % self.chrome_process.pid)

        return self._websocket_url()

    def _websocket_url(self):
        '''
        Polls chrome's remote debugging json endpoint until the about:blank
        window's websocket debugger url is available. Raises Exception (and
        stops chrome) if it is not available within timeout_sec.
        '''
        timeout_sec = 600
        json_url = 'http://localhost:%s/json' % self.port
        # make this a member variable so that kill -QUIT reports it
        self._start = time.time()
        while True:
            try:
                raw_json = urllib.request.urlopen(json_url, timeout=30).read()
                all_debug_info = json.loads(raw_json.decode('utf-8'))
                debug_info = [x for x in all_debug_info
                              if x['url'] == 'about:blank']
                if debug_info and 'webSocketDebuggerUrl' in debug_info[0]:
                    self.logger.debug('%s returned %s', json_url, raw_json)
                    url = debug_info[0]['webSocketDebuggerUrl']
                    self.logger.info(
                            'got chrome window websocket debug url %s from %s',
                            url, json_url)
                    return url
            except BaseException as e:
                # only log every ~10 seconds to avoid flooding while chrome
                # is still coming up
                if int(time.time() - self._start) % 10 == 5:
                    self.logger.warn(
                            'problem with %s (will keep trying until timeout '
                            'of %d seconds): %s', json_url, timeout_sec, e)
            finally:
                if time.time() - self._start > timeout_sec:
                    # fix: format string previously read '... after % '
                    # 'seconds' (missing "s"), which made logging raise a
                    # formatting error instead of logging the message
                    self.logger.error(
                            'killing chrome, failed to retrieve %s after %s '
                            'seconds', json_url, time.time() - self._start)
                    self.stop()
                    raise Exception(
                            'killed chrome, failed to retrieve %s after %s '
                            'seconds' % (json_url, time.time() - self._start))
                else:
                    time.sleep(0.5)

    def _read_stderr_stdout(self):
        # XXX select doesn't work on windows
        def readline_nonblock(f):
            buf = b''
            while not self._shutdown.is_set() and (
                    len(buf) == 0 or buf[-1] != 0xa) and select.select(
                            [f], [], [], 0.5)[0]:
                buf += f.read(1)
            return buf

        def log_buf(buf, stream_name):
            # routine noisy chromium chatter goes to TRACE, anything else
            # to DEBUG (previously this logic was duplicated for stdout and
            # stderr, and the stderr TRACE branch was mislabeled "STDOUT")
            if re.search(
                    b'Xlib: extension|'
                    b'CERT_PKIXVerifyCert for [^ ]* failed|'
                    b'^ALSA lib|ERROR:gl_surface_glx.cc|'
                    b'ERROR:gpu_child_thread.cc', buf):
                logging.log(
                        brozzler.TRACE, 'chrome pid %s %s %s',
                        self.chrome_process.pid, stream_name, buf)
            else:
                logging.debug(
                        'chrome pid %s %s %s',
                        self.chrome_process.pid, stream_name, buf)

        try:
            while not self._shutdown.is_set():
                buf = readline_nonblock(self.chrome_process.stdout)
                if buf:
                    log_buf(buf, 'STDOUT')
                buf = readline_nonblock(self.chrome_process.stderr)
                if buf:
                    log_buf(buf, 'STDERR')
        except:
            logging.error('unexpected exception', exc_info=True)

    def stop(self):
        '''
        Terminates chrome's process group with SIGTERM, escalating to
        SIGKILL after timeout_sec, then cleans up the temporary home and
        profile directory. No-op if chrome is not running or a stop is
        already in progress.
        '''
        if not self.chrome_process or self._shutdown.is_set():
            return
        timeout_sec = 300
        self._shutdown.set()
        self.logger.info('terminating chrome pgid %s' % self.chrome_process.pid)
        os.killpg(self.chrome_process.pid, signal.SIGTERM)
        first_sigterm = time.time()
        try:
            while time.time() - first_sigterm < timeout_sec:
                time.sleep(0.5)
                status = self.chrome_process.poll()
                if status is not None:
                    if status == 0:
                        self.logger.info(
                                'chrome pid %s exited normally',
                                self.chrome_process.pid)
                    else:
                        self.logger.warn(
                                'chrome pid %s exited with nonzero status %s',
                                self.chrome_process.pid, status)
                    # XXX I would like to forcefully kill the process group
                    # here to guarantee no orphaned chromium subprocesses hang
                    # around, but there's a chance I suppose that some other
                    # process could have started with the same pgid
                    return
            self.logger.warn(
                    'chrome pid %s still alive %.1f seconds after sending '
                    'SIGTERM, sending SIGKILL', self.chrome_process.pid,
                    time.time() - first_sigterm)
            os.killpg(self.chrome_process.pid, signal.SIGKILL)
            status = self.chrome_process.wait()
            self.logger.warn(
                    'chrome pid %s reaped (status=%s) after killing with '
                    'SIGKILL', self.chrome_process.pid, status)
        finally:
            # fix: tmpdir cleanup previously sat inside the try AFTER the
            # wait loop, so the early "return" on normal chrome exit skipped
            # it entirely, leaking the temp profile dir; doing it in finally
            # guarantees cleanup on every path
            try:
                self._home_tmpdir.cleanup()
            except:
                self.logger.error(
                        'exception deleting %s', self._home_tmpdir,
                        exc_info=True)
            self._out_reader_thread.join()
            self.chrome_process = None

View File

@ -350,3 +350,69 @@ def brozzler_ensure_tables():
# sites, pages, jobs tables
brozzler.frontier.RethinkDbFrontier(r)
def brozzler_list_captures():
    '''
    Handy utility for looking up entries in the rethinkdb "captures" table by
    url or sha1.

    Reads the url or sha1 from argv; prints each matching capture record as
    indented json on stdout.
    '''
    # local imports: these are only needed by this entry point
    import surt
    import rethinkdb

    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    _add_rethinkdb_options(arg_parser)
    _add_common_options(arg_parser)
    arg_parser.add_argument(
            'url_or_sha1', metavar='URL_or_SHA1',
            help='url or sha1 to look up in captures table')
    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    r = rethinkstuff.Rethinker(
            args.rethinkdb_servers.split(','), args.rethinkdb_db)

    class Jsonner(json.JSONEncoder):
        # serialize datetime values (rethinkdb timestamps) as iso8601 strings
        def default(self, o):
            if isinstance(o, datetime.datetime):
                return o.isoformat()
            return json.JSONEncoder.default(self, o)

    if args.url_or_sha1[:5] == 'sha1:':
        # sha1 lookup not wired up yet; sketch of the implementation kept
        # below for reference
        raise Exception('not implemented')
        # def find_response_by_digest(self, algo, raw_digest, bucket="__unspecified__"):
        #     if algo != "sha1":
        #         raise Exception(
        #                 "digest type is %s but big captures table is indexed by "
        #                 "sha1" % algo)
        #     sha1base32 = base64.b32encode(raw_digest).decode("utf-8")
        #     results_iter = self.r.table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type").run()
        #     results = list(results_iter)
        #     if len(results) > 0:
        #         if len(results) > 1:
        #             self.logger.debug("expected 0 or 1 but found %s results for sha1base32=%s bucket=%s (will use first result)", len(results), sha1base32, bucket)
        #         result = results[0]
        #     else:
        #         result = None
        #     self.logger.debug("returning %s for sha1base32=%s bucket=%s",
        #                       result, sha1base32, bucket)
        #     return result
    else:
        # canonicalize the url the same way the captures table does
        key = surt.surt(
                args.url_or_sha1, trailing_comma=True, host_massage=False,
                with_scheme=True)
        # the index stores the surt abbreviated to 150 chars, so range-scan
        # on the abbreviation, then filter on the full canon_surt
        reql = r.table('captures').between(
                [key[:150], rethinkdb.minval],
                [key[:150]+'!', rethinkdb.maxval],
                index='abbr_canon_surt_timestamp')
        reql = reql.order_by(index='abbr_canon_surt_timestamp')
        reql = reql.filter(
                lambda capture: (capture['canon_surt'] >= key)
                                 & (capture['canon_surt'] <= key))
        logging.debug('rethinkdb query: %s', reql)
        results = reql.run()
        for result in results:
            print(json.dumps(result, cls=Jsonner, indent=2))

View File

@ -0,0 +1,262 @@
<svg width='23' height='17' xmlns='http://www.w3.org/2000/svg' version='1.1'
xmlns:svg="http://www.w3.org/2000/svg"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
>
<!-- make sure glyph is visible within svg window -->
<g fill-rule='nonzero' transform='scale(0.0293) translate(30 0)'>
<!-- draw actual outline using lines and Bezier curves-->
<path fill='#666' stroke='black' stroke-width='0' transform='scale(1.35) translate(-46 517)' d='
M 582,-127
L 582,-229
Q 582,-240 571,-240
L 485,-240
L 410,-323
Q 364,-374 297,-374
Q 237,-374 193,-332
L 42,-193
Q 39,-187 39,-186
Q 39,-181 42,-178
L 110,-110
Q 112,-107 117,-107
Q 123,-107 125,-110
L 230,-214
L 279,-156
Q 315,-117 367,-117
L 571,-117
Q 582,-117 582,-127
Z
'/>
<defs
id="defs3043">
<linearGradient
id="linearGradient3803">
<stop
style="stop-color:#d7def0;stop-opacity:1;"
offset="0"
id="stop3805" />
<stop
id="stop3811"
offset="0.5"
style="stop-color:#ffffff;stop-opacity:1" />
<stop
style="stop-color:#d5def0;stop-opacity:1"
offset="1"
id="stop3807" />
</linearGradient>
<linearGradient
id="linearGradient3776"
inkscape:collect="always">
<stop
id="stop3778"
offset="0"
style="stop-color:#b2cde9;stop-opacity:1" />
<stop
id="stop3780"
offset="1"
style="stop-color:#c4dbee;stop-opacity:1" />
</linearGradient>
<linearGradient
id="linearGradient3750">
<stop
id="stop3752"
offset="0"
style="stop-color:#d0e2f1;stop-opacity:1" />
<stop
style="stop-color:#cadef0;stop-opacity:1"
offset="0.85580856"
id="stop3756" />
<stop
id="stop3754"
offset="1"
style="stop-color:#95bee3;stop-opacity:1" />
</linearGradient>
<linearGradient
id="linearGradient3708">
<stop
style="stop-color:#658db6;stop-opacity:1"
offset="0"
id="stop3710" />
<stop
id="stop3716"
offset="0.76777935"
style="stop-color:#527fab;stop-opacity:1;" />
<stop
style="stop-color:#4071a0;stop-opacity:1"
offset="1"
id="stop3712" />
</linearGradient>
<linearGradient
id="linearGradient3698">
<stop
style="stop-color:#96d0e1;stop-opacity:1"
offset="0"
id="stop3700" />
<stop
id="stop3706"
offset="0.67819428"
style="stop-color:#89b7e1;stop-opacity:1" />
<stop
style="stop-color:#699dd3;stop-opacity:1"
offset="1"
id="stop3702" />
</linearGradient>
<linearGradient
inkscape:collect="always"
id="linearGradient3647">
<stop
style="stop-color:#3b79bc;stop-opacity:1;"
offset="0"
id="stop3649" />
<stop
style="stop-color:#94b8e0;stop-opacity:1"
offset="1"
id="stop3651" />
</linearGradient>
<linearGradient
inkscape:collect="always"
id="linearGradient3588">
<stop
style="stop-color:#ffffff;stop-opacity:1"
offset="0"
id="stop3590" />
<stop
style="stop-color:#000000;stop-opacity:0;"
offset="1"
id="stop3592" />
</linearGradient>
<radialGradient
inkscape:collect="always"
xlink:href="#linearGradient3588"
id="radialGradient3594"
cx="-118.77966"
cy="121.49152"
fx="-118.77966"
fy="121.49152"
r="25.491526"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(0.02177942,-0.95743591,0.97872327,0.02221687,-235.0993,5.0684454)" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient3647"
id="linearGradient3653"
x1="-397.81323"
y1="149.18764"
x2="-397.55933"
y2="51.355946"
gradientUnits="userSpaceOnUse" />
<radialGradient
inkscape:collect="always"
xlink:href="#linearGradient3698"
id="radialGradient3704"
cx="-383.2746"
cy="217.91029"
fx="-383.2746"
fy="217.91029"
r="59.401995"
gradientTransform="matrix(-1.2861568,-0.08596317,0.11453678,-1.7136762,-425.01982,469.50099)"
gradientUnits="userSpaceOnUse" />
<radialGradient
inkscape:collect="always"
xlink:href="#linearGradient3708"
id="radialGradient3714"
cx="-123.5"
cy="-11.570732"
fx="-123.5"
fy="-11.570732"
r="95.627118"
gradientTransform="matrix(-0.00756512,0.55751399,-1.0314585,-0.01398286,113.23967,103.212)"
gradientUnits="userSpaceOnUse" />
<radialGradient
inkscape:collect="always"
xlink:href="#linearGradient3750"
id="radialGradient3748"
cx="-94.87291"
cy="165.27281"
fx="-94.87291"
fy="165.27281"
r="60.481357"
gradientTransform="matrix(0.81293878,1.6998003,-2.1519091,1.0291615,564.39485,118.47915)"
gradientUnits="userSpaceOnUse" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient3776"
id="linearGradient3774"
x1="162.07127"
y1="85.239708"
x2="220.76114"
y2="78.875748"
gradientUnits="userSpaceOnUse"
gradientTransform="translate(3.3917128,7.418629)" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient3803"
id="linearGradient3809"
x1="-382.04123"
y1="37.280548"
x2="-381.39438"
y2="165.56691"
gradientUnits="userSpaceOnUse" />
</defs>
<path
sodipodi:type="arc"
style="fill:url(#radialGradient3594);fill-opacity:1;fill-rule:nonzero;stroke:none"
id="path2814"
sodipodi:cx="-118.23729"
sodipodi:cy="122.57627"
sodipodi:rx="25.491526"
sodipodi:ry="25.491526"
d="m -92.745764,122.57627 a 25.491526,25.491526 0 1 1 -50.983056,0 25.491526,25.491526 0 1 1 50.983056,0 z"
transform="matrix(4.680851,0,0,4.7978723,685.10478,-449.69946)" />
<path
style="fill:url(#linearGradient3774);fill-opacity:1;fill-rule:nonzero;stroke:none"
d="m 232.17258,88.120422 c 0,15.673918 -19.79135,34.931518 -45.84395,34.931518 -26.0526,0 -59.92241,-16.08123 -59.92241,-31.755152 0,-15.673924 21.11981,-28.38015 47.17241,-28.38015 19.90254,0 46.36122,18.293224 56.45971,20.3521 0.79179,1.710571 1.36862,2.925087 2.13424,4.851684 z"
id="path3655"
sodipodi:nodetypes="cssscc" />
<path
style="fill:#2e5c91;fill-opacity:1;fill-rule:nonzero;stroke:none"
d="m 38.822019,65.971523 c 12.38148,-9.610993 35.314514,-1.245318 51.289554,19.334679 15.975027,20.579998 17.694937,51.065068 5.31349,60.676058 -12.38147,9.61099 -34.17571,-5.29155 -50.15074,-25.87156 -12.20392,-15.72181 -4.05062,-41.19089 -8.61646,-50.430553 0.61589,-1.122052 1.381696,-2.456607 2.164156,-3.708624 z"
id="path3655-4-8"
sodipodi:nodetypes="cssscc" />
<path
style="fill:url(#radialGradient3714);fill-opacity:1;fill-rule:nonzero;stroke:none"
d="m 230.04347,83.261765 c -7.0081,-0.03265 -61.07025,0.289575 -107.66568,0.0654 -17.371,5.108098 -31.704627,13.258827 -39.181777,29.154945 -5.33639,-4.54237 -40.74576,-42.215609 -44.40678,-46.440684 31.38983,-41.648805 74.528017,-45.559321 82.915257,-45.559321 8.38724,0 70.64407,-8.631855 108.33898,62.77966 z"
id="path3596"
sodipodi:nodetypes="ccccsc" />
<path
style="fill:#699dd3;fill-opacity:1;fill-rule:nonzero;stroke:none"
d="m 117.12454,243.96815 c -12.49835,-9.45851 -14.5752,-36.93927 1.14635,-57.71356 15.72155,-20.77428 41.03582,-34.94753 53.53417,-25.48904 12.49834,9.4585 7.44792,38.96701 -8.27364,59.74129 -12.01027,15.87024 -35.4911,16.88498 -43.22681,23.69505 -1.23894,-0.0455 -1.95523,-0.0605 -3.18007,-0.23374 z"
id="path3655-4"
sodipodi:nodetypes="cssscc" />
<path
style="fill:url(#radialGradient3748);fill-opacity:1;fill-rule:nonzero;stroke:none"
d="m 120.3032,244.20103 c 3.58354,-6.02268 28.85859,-52.8991 52.69131,-92.9389 4.41104,-17.56095 5.34663,-33.64185 -4.5584,-48.14993 6.62173,-2.29412 58.23852,-13.976353 63.73684,-14.987686 19.9656,48.180076 1.44992,87.338276 -2.80522,94.565966 -4.25515,7.22768 -28.40179,65.25666 -109.06453,61.51055 z"
id="path3596-1"
sodipodi:nodetypes="ccccsc" />
<path
style="fill:url(#radialGradient3704);fill-opacity:1;fill-rule:nonzero;stroke:none"
d="m 36.696853,69.642524 c 3.46858,6.089612 30.72312,52.780196 53.77852,93.272576 13.094367,12.50527 27.684997,19.48512 45.191737,18.03328 -1.2738,6.89113 -16.62898,57.75037 -18.4638,63.03126 -51.756237,-6.42158 -76.669777,-41.85476 -80.854757,-49.1233 -4.18497,-7.26855 -42.7297502,-56.91452 0.3483,-125.213816 z"
id="path3596-1-7"
sodipodi:nodetypes="ccccsc" />
<path
transform="matrix(0.77294737,0,0,0.77619098,435.90647,53.275706)"
style="fill:url(#linearGradient3653);fill-opacity:1;stroke:url(#linearGradient3809);stroke-width:10.07013607;stroke-miterlimit:4;stroke-opacity:1"
d="m -338.44068,101.42373 c 0,32.65032 -26.46832,59.11864 -59.11865,59.11864 -32.65032,0 -59.11864,-26.46832 -59.11864,-59.11864 0,-32.650327 26.46832,-59.118646 59.11864,-59.118646 32.65033,0 59.11865,26.468319 59.11865,59.118646 z"
id="path3645" />
<path
style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
d="m 163.54619,108.89582 c 18.52979,17.09836 16.03302,29.55794 10.0625,44 -3.10892,-22.25001 -2.34478,-32.42697 -10.0625,-44 z"
id="rect3782"
sodipodi:nodetypes="ccc" />
<path
style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
d="m 101.42092,173.63924 c -22.645593,-14.47335 -29.809884,-45.71983 -8.813354,-62.99032 -10.847561,19.77514 -6.225429,32.39863 8.813354,62.99032 z"
id="rect3782-4"
sodipodi:nodetypes="ccc" />
<!-- <text x="400" y="100" fill="black" font-size="70">brozzler</text> -->
</g>
</svg>

After

Width:  |  Height:  |  Size: 9.1 KiB

View File

@ -70,7 +70,7 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
'filename': record['filename'],
}
if record['warc_type'] != 'revisit':
blob['mime'] = record['content_type']
blob['mime'] = record['content_type'] or '-'
else:
blob['mime'] = 'warc/revisit'
# b'org,archive)/ 20160427215530 {"url": "https://archive.org/", "mime": "text/html", "status": "200", "digest": "VILUFXZD232SLUA6XROZQIMEVUPW6EIE", "length": "16001", "offset": "90144", "filename": "ARCHIVEIT-261-ONE_TIME-JOB209607-20160427215508135-00000.warc.gz"}'

View File

@ -78,7 +78,7 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
final_url = url
while final_url in redirects:
final_url = redirects[final_url].response.headers['location']
final_url = redirects.pop(final_url).response.headers['location']
final_bounces = []
for txn in self.transactions:

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b8.dev129',
version='1.1b8.dev142',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',
@ -51,6 +51,7 @@ setuptools.setup(
'brozzler-new-site=brozzler.cli:brozzler_new_site',
'brozzler-worker=brozzler.cli:brozzler_worker',
'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables',
'brozzler-list-captures=brozzler.cli:brozzler_list_captures',
'brozzler-dashboard=brozzler.dashboard:main',
'brozzler-easy=brozzler.easy:main',
'brozzler-wayback=brozzler.pywb:main',
@ -61,7 +62,7 @@ setuptools.setup(
'youtube-dl',
'reppy==0.3.4',
'requests',
'websocket-client',
'websocket-client!=0.39.0',
'pillow==3.3.0',
'surt>=0.3.0',
'rethinkstuff>=0.1.5',

View File

@ -22,6 +22,10 @@ import http.server
import threading
import os
import brozzler
import brozzler.chrome
import socket
import logging
import psutil
@pytest.fixture(scope='module')
def httpd(request):
@ -52,3 +56,20 @@ def test_robots(httpd):
site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh')
assert not brozzler.is_permitted_by_robots(site, url)
def test_find_available_port():
    '''
    Chrome._find_available_port should return the requested port when it is
    free, and fall back to scanning down from 9999 when it is occupied.
    '''
    # port/executable are unused by _find_available_port
    x = brozzler.chrome.Chrome(None, None)
    try:
        psutil.net_connections(kind='tcp')
    except psutil.AccessDenied:
        logging.warn(
                'skipping _find_available_port() test because '
                'psutil.net_connections(kind="tcp") raised AccessDenied')
        return
    # port 9800 is free, so it should be returned as-is
    assert x._find_available_port(9800) == 9800
    # occupy 9800; the fallback scan starts at 9999
    sock = socket.socket()
    sock.bind(('localhost', 9800))
    sock.listen(0)
    assert x._find_available_port(9800) == 9999
    sock.close()
    # 9800 is free again
    assert x._find_available_port(9800) == 9800