mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
Merge branch 'master' into qa
* master:
  - fix bug handling page with zero outlinks
  - avoid infinite loop in case youtube-dl encounters redirect loop (which can be ok if cookies have been set or something)
  - brozzler logo svg with small default size
  - travis-ci slack integration
  - fix _find_available_port and its unit test
  - little fixes
  - avoid broken version of websocket-client to fix https://github.com/internetarchive/brozzler/issues/28
  - wrong branch of warcprox in ansible install
  - move cookie db management code into chrome.py
  - move _find_available_ports to chrome.py, changing the way it works so that browser:9200 doesn't get stuck at 9201 forever, which pushes 9201 to 9202 etc, and add a unit test
  - split Chrome class into its own module
  - new utility brozzler-list-captures for looking up entries in the "captures" table
This commit is contained in:
commit
4186869bf9
@ -14,4 +14,6 @@ after_failure:
|
||||
- sudo cat /var/log/upstart/warcprox.log
|
||||
- sudo cat /var/log/upstart/brozzler-worker.log
|
||||
- sudo cat /var/log/upstart/pywb.log
|
||||
|
||||
notifications:
|
||||
slack:
|
||||
secure: KPPXSscXnmSEQ2NXBZFKrzDEYHg067Kv1WR7RTRUH8EIlSS9MHTyErRa7HkaRPmqOllj4vvPbplNU2ALnCfhP4cqW+MvF0xv3GuEGXQ7Om2sBvVUQ3w0JJ5rLq9ferAfGdSnQFeViqfDix5LA3fMNZGouUHQdUHq7iO8E9n9jntvkKO9Jff7Dyo0K5KvOZOJfM9KsqFZLlFO5zoNB6Y9jubIT7+Ulk3EDto/Kny34VPIyJIm7y0cHHlYLEq780AweY0EIwMyMg/VPSRrVAsbLSrilO0YRgsQpjPC9Ci/rAWNWooaOk0eA+bwv1uHQnGtH0z446XUMXr3UZ2QlD4DE/uoP2okkl8EtqvlmEyjV8eO86TqYFDRgKfYpvlK6hHtb7SAHX28QeXQjbKNc5f7KpKO5PtZqaoBRL7acLlKyS8xQGiRtonTPFSBTFR2A+s6dZmKO9dDboglptiHk4dvL1ZD4S8qLJn1JjTJqvIU6tpCY3BpNErn4n1MkDjN5nqdXf7Q9Vmui8vRetwnMf1oXcsKj9FEt2utNfDqFNXcFsN+Mnr9rhXQ1++gt/7Zo844OowiARcxqZTNy5LqSD01WgGCvNMy3Odf+FTQ8PcDOF+001+g8La1R99U0o9/hT/gy+WYk2prYneWru4pQHF/a6goZgkLTwkskcaPVpDJtDs=
|
||||
|
@ -14,7 +14,7 @@
|
||||
become: true
|
||||
file: path={{venv_root}}/warcprox-ve34 state=directory owner={{user}}
|
||||
- name: install warcprox in virtualenv
|
||||
pip: name=git+https://github.com/internetarchive/warcprox.git@2.x#egg=warcprox
|
||||
pip: name=git+https://github.com/internetarchive/warcprox.git#egg=warcprox
|
||||
virtualenv={{venv_root}}/warcprox-ve34
|
||||
virtualenv_python=python3.4
|
||||
extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
|
||||
|
@ -1,6 +1,5 @@
|
||||
'''
|
||||
brozzler/browser.py - classes responsible for running web browsers
|
||||
(chromium/chromium) and browsing web pages in them
|
||||
brozzler/browser.py - manages the browsers for brozzler
|
||||
|
||||
Copyright (C) 2014-2016 Internet Archive
|
||||
|
||||
@ -19,23 +18,17 @@ limitations under the License.
|
||||
|
||||
import logging
|
||||
import json
|
||||
import urllib.request
|
||||
import itertools
|
||||
import websocket
|
||||
import time
|
||||
import threading
|
||||
import subprocess
|
||||
import tempfile
|
||||
import os
|
||||
import random
|
||||
import brozzler
|
||||
from brozzler.chrome import Chrome
|
||||
from brozzler.behaviors import Behavior
|
||||
from requests.structures import CaseInsensitiveDict
|
||||
import select
|
||||
import re
|
||||
import base64
|
||||
import psutil
|
||||
import signal
|
||||
import sqlite3
|
||||
import datetime
|
||||
|
||||
@ -108,7 +101,9 @@ class Browser:
|
||||
|
||||
HARD_TIMEOUT_SECONDS = 20 * 60
|
||||
|
||||
def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', proxy=None, ignore_cert_errors=False):
|
||||
def __init__(
|
||||
self, chrome_port=9222, chrome_exe='chromium-browser', proxy=None,
|
||||
ignore_cert_errors=False):
|
||||
self.command_id = itertools.count(1)
|
||||
self.chrome_port = chrome_port
|
||||
self.chrome_exe = chrome_exe
|
||||
@ -134,33 +129,10 @@ class Browser:
|
||||
|
||||
def start(self, proxy=None, cookie_db=None):
|
||||
if not self._chrome_instance:
|
||||
# these can raise exceptions
|
||||
self.chrome_port = self._find_available_port()
|
||||
self._work_dir = tempfile.TemporaryDirectory()
|
||||
data_dir = os.path.join(self._work_dir.name, "chrome-user-data")
|
||||
os.makedirs(data_dir, exist_ok=True)
|
||||
if cookie_db is not None:
|
||||
cookie_dir = os.path.join(data_dir, "Default")
|
||||
cookie_location = os.path.join(cookie_dir, "Cookies")
|
||||
self.logger.debug(
|
||||
"cookie DB provided, writing to %s", cookie_location)
|
||||
os.makedirs(cookie_dir, exist_ok=True)
|
||||
|
||||
try:
|
||||
with open(cookie_location, 'wb') as cookie_file:
|
||||
cookie_file.write(cookie_db)
|
||||
except OSError:
|
||||
self.logger.error(
|
||||
"exception writing cookie file at %s",
|
||||
cookie_location, exc_info=True)
|
||||
|
||||
self._chrome_instance = Chrome(
|
||||
port=self.chrome_port, executable=self.chrome_exe,
|
||||
user_home_dir=self._work_dir.name,
|
||||
user_data_dir=os.sep.join([
|
||||
self._work_dir.name, "chrome-user-data"]),
|
||||
ignore_cert_errors=self.ignore_cert_errors,
|
||||
proxy=proxy or self.proxy)
|
||||
proxy=proxy or self.proxy, cookie_db=None)
|
||||
try:
|
||||
self._websocket_url = self._chrome_instance.start()
|
||||
except:
|
||||
@ -172,69 +144,29 @@ class Browser:
|
||||
if self.is_running():
|
||||
self._chrome_instance.stop()
|
||||
self._chrome_instance = None
|
||||
try:
|
||||
self._work_dir.cleanup()
|
||||
except:
|
||||
self.logger.error("exception deleting %s", self._work_dir,
|
||||
exc_info=True)
|
||||
self._work_dir = None
|
||||
self._websocket_url = None
|
||||
except:
|
||||
self.logger.error("problem stopping", exc_info=True)
|
||||
|
||||
def persist_and_read_cookie_db(self):
|
||||
cookie_location = os.path.join(
|
||||
self._work_dir.name, "chrome-user-data", "Default", "Cookies")
|
||||
self.logger.debug(
|
||||
"marking cookies persistent then reading file into memory: %s",
|
||||
cookie_location)
|
||||
try:
|
||||
with sqlite3.connect(cookie_location) as conn:
|
||||
cur = conn.cursor()
|
||||
cur.execute("UPDATE cookies SET persistent = 1")
|
||||
except sqlite3.Error:
|
||||
self.logger.error("exception updating cookie DB", exc_info=True)
|
||||
|
||||
cookie_db=None
|
||||
try:
|
||||
with open(cookie_location, "rb") as cookie_file:
|
||||
cookie_db = cookie_file.read()
|
||||
except OSError:
|
||||
self.logger.error(
|
||||
"exception reading from cookie DB file %s",
|
||||
cookie_location, exc_info=True)
|
||||
return cookie_db
|
||||
|
||||
def _find_available_port(self):
|
||||
port_available = False
|
||||
port = self.chrome_port
|
||||
|
||||
try:
|
||||
conns = psutil.net_connections(kind="tcp")
|
||||
except psutil.AccessDenied:
|
||||
return port
|
||||
|
||||
for p in range(port, 65535):
|
||||
if any(connection.laddr[1] == p for connection in conns):
|
||||
self.logger.warn("port %s already open, will try %s", p, p+1)
|
||||
else:
|
||||
port = p
|
||||
break
|
||||
return port
|
||||
|
||||
def is_running(self):
|
||||
return bool(self._websocket_url)
|
||||
|
||||
def abort_browse_page(self):
|
||||
self._abort_browse_page = True
|
||||
|
||||
def persist_and_read_cookie_db(self):
|
||||
if self._chrome_instance:
|
||||
return self._chrome_instance.persist_and_read_cookie_db()
|
||||
else:
|
||||
return None
|
||||
|
||||
def browse_page(
|
||||
self, url, extra_headers=None, behavior_parameters=None,
|
||||
user_agent=None,
|
||||
on_request=None, on_response=None, on_screenshot=None,
|
||||
on_url_change=None):
|
||||
"""
|
||||
Synchronously loads a page, takes a screenshot, and runs behaviors.
|
||||
Synchronously loads a page, runs behaviors, and takes a screenshot.
|
||||
|
||||
Raises BrowsingException if browsing the page fails in a non-critical
|
||||
way.
|
||||
@ -264,10 +196,10 @@ class Browser:
|
||||
self._websocket_url, on_open=self._visit_page,
|
||||
on_message=self._wrap_handle_message)
|
||||
|
||||
threadName = "WebsockThread:{}-{:%Y%m%d%H%M%S}".format(
|
||||
thread_name = "WebsockThread:{}-{:%Y%m%d%H%M%S}".format(
|
||||
self.chrome_port, datetime.datetime.utcnow())
|
||||
websock_thread = threading.Thread(
|
||||
target=self._websock.run_forever, name=threadName,
|
||||
target=self._websock.run_forever, name=thread_name,
|
||||
kwargs={'ping_timeout':0.5})
|
||||
websock_thread.start()
|
||||
self._start = time.time()
|
||||
@ -386,8 +318,11 @@ __brzl_compileOutlinks(window).join('\n');
|
||||
chain = []
|
||||
|
||||
def set_outlinks(message):
|
||||
self._outlinks = frozenset(
|
||||
message["result"]["result"]["value"].split("\n"))
|
||||
if message["result"]["result"]["value"]:
|
||||
self._outlinks = frozenset(
|
||||
message["result"]["result"]["value"].split("\n"))
|
||||
else:
|
||||
self._outlinks = frozenset()
|
||||
|
||||
chain.append({
|
||||
"info": "retrieving outlinks",
|
||||
@ -571,171 +506,3 @@ __brzl_compileOutlinks(window).join('\n');
|
||||
# else:
|
||||
# self.logger.debug("%s", json_message)
|
||||
|
||||
class Chrome:
|
||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||
|
||||
def __init__(self, port, executable, user_home_dir, user_data_dir, proxy=None, ignore_cert_errors=False):
|
||||
self.port = port
|
||||
self.executable = executable
|
||||
self.user_home_dir = user_home_dir
|
||||
self.user_data_dir = user_data_dir
|
||||
self.proxy = proxy
|
||||
self.ignore_cert_errors = ignore_cert_errors
|
||||
self._shutdown = threading.Event()
|
||||
|
||||
def __enter__(self):
|
||||
'''
|
||||
Returns websocket url to chrome window with about:blank loaded.
|
||||
'''
|
||||
return self.start()
|
||||
|
||||
def __exit__(self, *args):
|
||||
self.stop()
|
||||
|
||||
def start(self):
|
||||
'''
|
||||
Returns websocket url to chrome window with about:blank loaded.
|
||||
'''
|
||||
timeout_sec = 600
|
||||
new_env = os.environ.copy()
|
||||
new_env["HOME"] = self.user_home_dir
|
||||
chrome_args = [
|
||||
self.executable, "--use-mock-keychain", # mac thing
|
||||
"--user-data-dir={}".format(self.user_data_dir),
|
||||
"--remote-debugging-port={}".format(self.port),
|
||||
"--disable-web-sockets", "--disable-cache",
|
||||
"--window-size=1100,900", "--no-default-browser-check",
|
||||
"--disable-first-run-ui", "--no-first-run",
|
||||
"--homepage=about:blank", "--disable-direct-npapi-requests",
|
||||
"--disable-web-security", "--disable-notifications",
|
||||
"--disable-extensions",
|
||||
"--disable-save-password-bubble"]
|
||||
if self.ignore_cert_errors:
|
||||
chrome_args.append("--ignore-certificate-errors")
|
||||
if self.proxy:
|
||||
chrome_args.append("--proxy-server={}".format(self.proxy))
|
||||
chrome_args.append("about:blank")
|
||||
self.logger.info("running: {}".format(" ".join(chrome_args)))
|
||||
# start_new_session - new process group so we can kill the whole group
|
||||
self.chrome_process = subprocess.Popen(chrome_args, env=new_env,
|
||||
stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0,
|
||||
start_new_session=True)
|
||||
self._out_reader_thread = threading.Thread(target=self._read_stderr_stdout,
|
||||
name="ChromeOutReaderThread(pid={})".format(self.chrome_process.pid))
|
||||
self._out_reader_thread.start()
|
||||
self.logger.info("chrome running, pid {}".format(self.chrome_process.pid))
|
||||
self._start = time.time() # member variable just so that kill -QUIT reports it
|
||||
|
||||
json_url = "http://localhost:%s/json" % self.port
|
||||
|
||||
while True:
|
||||
try:
|
||||
raw_json = urllib.request.urlopen(json_url, timeout=30).read()
|
||||
all_debug_info = json.loads(raw_json.decode('utf-8'))
|
||||
debug_info = [x for x in all_debug_info if x['url'] == 'about:blank']
|
||||
|
||||
if debug_info and 'webSocketDebuggerUrl' in debug_info[0]:
|
||||
self.logger.debug("{} returned {}".format(json_url, raw_json))
|
||||
url = debug_info[0]['webSocketDebuggerUrl']
|
||||
self.logger.info('got chrome window websocket debug url {} from {}'.format(url, json_url))
|
||||
return url
|
||||
except BaseException as e:
|
||||
if int(time.time() - self._start) % 10 == 5:
|
||||
self.logger.warn("problem with %s (will keep trying until timeout of %d seconds): %s", json_url, timeout_sec, e)
|
||||
pass
|
||||
finally:
|
||||
if time.time() - self._start > timeout_sec:
|
||||
self.logger.error("killing chrome, failed to retrieve %s after %s seconds", json_url, time.time() - self._start)
|
||||
self.stop()
|
||||
raise Exception("killed chrome, failed to retrieve {} after {} seconds".format(json_url, time.time() - self._start))
|
||||
else:
|
||||
time.sleep(0.5)
|
||||
|
||||
def _read_stderr_stdout(self):
|
||||
# XXX select doesn't work on windows
|
||||
def readline_nonblock(f):
|
||||
buf = b""
|
||||
while not self._shutdown.is_set() and (
|
||||
len(buf) == 0 or buf[-1] != 0xa) and select.select(
|
||||
[f],[],[],0.5)[0]:
|
||||
buf += f.read(1)
|
||||
return buf
|
||||
|
||||
try:
|
||||
while not self._shutdown.is_set():
|
||||
buf = readline_nonblock(self.chrome_process.stdout)
|
||||
if buf:
|
||||
if re.search(
|
||||
b"Xlib: extension|"
|
||||
b"CERT_PKIXVerifyCert for [^ ]* failed|"
|
||||
b"^ALSA lib|ERROR:gl_surface_glx.cc|"
|
||||
b"ERROR:gpu_child_thread.cc", buf):
|
||||
logging.log(
|
||||
brozzler.TRACE, "chrome pid %s STDOUT %s",
|
||||
self.chrome_process.pid, buf)
|
||||
else:
|
||||
logging.debug(
|
||||
"chrome pid %s STDOUT %s",
|
||||
self.chrome_process.pid, buf)
|
||||
|
||||
buf = readline_nonblock(self.chrome_process.stderr)
|
||||
if buf:
|
||||
if re.search(
|
||||
b"Xlib: extension|"
|
||||
b"CERT_PKIXVerifyCert for [^ ]* failed|"
|
||||
b"^ALSA lib|ERROR:gl_surface_glx.cc|"
|
||||
b"ERROR:gpu_child_thread.cc", buf):
|
||||
logging.log(
|
||||
brozzler.TRACE, "chrome pid %s STDOUT %s",
|
||||
self.chrome_process.pid, buf)
|
||||
else:
|
||||
logging.debug(
|
||||
"chrome pid %s STDERR %s",
|
||||
self.chrome_process.pid, buf)
|
||||
except:
|
||||
logging.error("unexpected exception", exc_info=True)
|
||||
|
||||
def stop(self):
|
||||
if not self.chrome_process or self._shutdown.is_set():
|
||||
return
|
||||
|
||||
timeout_sec = 300
|
||||
self._shutdown.set()
|
||||
self.logger.info("terminating chrome pgid %s" % self.chrome_process.pid)
|
||||
|
||||
os.killpg(self.chrome_process.pid, signal.SIGTERM)
|
||||
first_sigterm = time.time()
|
||||
|
||||
try:
|
||||
while time.time() - first_sigterm < timeout_sec:
|
||||
time.sleep(0.5)
|
||||
|
||||
status = self.chrome_process.poll()
|
||||
if status is not None:
|
||||
if status == 0:
|
||||
self.logger.info(
|
||||
"chrome pid %s exited normally",
|
||||
self.chrome_process.pid)
|
||||
else:
|
||||
self.logger.warn(
|
||||
"chrome pid %s exited with nonzero status %s",
|
||||
self.chrome_process.pid, status)
|
||||
|
||||
# XXX I would like to forcefully kill the process group
|
||||
# here to guarantee no orphaned chromium subprocesses hang
|
||||
# around, but there's a chance I suppose that some other
|
||||
# process could have started with the same pgid
|
||||
return
|
||||
|
||||
self.logger.warn(
|
||||
"chrome pid %s still alive %.1f seconds after sending "
|
||||
"SIGTERM, sending SIGKILL", self.chrome_process.pid,
|
||||
time.time() - first_sigterm)
|
||||
os.killpg(self.chrome_process.pid, signal.SIGKILL)
|
||||
status = self.chrome_process.wait()
|
||||
self.logger.warn(
|
||||
"chrome pid %s reaped (status=%s) after killing with "
|
||||
"SIGKILL", self.chrome_process.pid, status)
|
||||
finally:
|
||||
self._out_reader_thread.join()
|
||||
self.chrome_process = None
|
||||
|
286
brozzler/chrome.py
Normal file
286
brozzler/chrome.py
Normal file
@ -0,0 +1,286 @@
|
||||
'''
|
||||
brozzler/chrome.py - manages the chrome/chromium browser for brozzler
|
||||
|
||||
Copyright (C) 2014-2016 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
'''
|
||||
|
||||
import logging
|
||||
import urllib.request
|
||||
import time
|
||||
import threading
|
||||
import subprocess
|
||||
import os
|
||||
import brozzler
|
||||
import select
|
||||
import re
|
||||
import signal
|
||||
import sqlite3
|
||||
import json
|
||||
import psutil
|
||||
import tempfile
|
||||
|
||||
class Chrome:
    '''
    Runs a single chrome/chromium process with remote debugging enabled.

    The process gets a private temporary $HOME and user data dir, optionally
    seeded with a cookie database. Can be used as a context manager, in
    which case entering returns the websocket debugger url of the
    about:blank tab and exiting stops the browser.
    '''
    logger = logging.getLogger(__module__ + '.' + __qualname__)

    # chrome output matching this pattern is known noise and is logged at
    # TRACE level instead of DEBUG
    _NOISY_OUTPUT_RE = re.compile(
            b'Xlib: extension|'
            b'CERT_PKIXVerifyCert for [^ ]* failed|'
            b'^ALSA lib|ERROR:gl_surface_glx.cc|'
            b'ERROR:gpu_child_thread.cc')

    def __init__(
            self, port, executable, proxy=None, ignore_cert_errors=False,
            cookie_db=None):
        '''
        Args:
            port: preferred remote debugging port (see
                _find_available_port for what happens if it is taken)
            executable: chrome/chromium executable to run
            proxy: value for chrome's --proxy-server option, if any
            ignore_cert_errors: if true, pass --ignore-certificate-errors
            cookie_db: raw bytes of a chrome "Cookies" sqlite file to
                install in the profile before starting, or None
        '''
        self.port = port
        self.executable = executable
        self.proxy = proxy
        self.ignore_cert_errors = ignore_cert_errors
        self.cookie_db = cookie_db
        self._shutdown = threading.Event()
        # populated by start(); initialized here so that stop() is a safe
        # no-op on an instance that was never started
        self.chrome_process = None
        self._out_reader_thread = None
        self._home_tmpdir = None
        self._chrome_user_data_dir = None

    def __enter__(self):
        '''
        Returns websocket url to chrome window with about:blank loaded.
        '''
        return self.start()

    def __exit__(self, *args):
        self.stop()

    def _find_available_port(self, default_port=9200):
        '''
        Returns `default_port` if no tcp connection has it as local port, or
        if psutil is denied access to the connection list. Otherwise returns
        the highest port in 9999..9000 that is not in use, falling back to
        `default_port` if every one of those is taken.
        '''
        try:
            conns = psutil.net_connections(kind='tcp')
        except psutil.AccessDenied:
            return default_port

        # build the set once instead of rescanning conns for every candidate
        ports_in_use = {conn.laddr[1] for conn in conns if conn.laddr}

        if default_port not in ports_in_use:
            return default_port

        for p in range(9999, 8999, -1):
            if p not in ports_in_use:
                self.logger.warning(
                        'port %s already in use, using %s instead',
                        default_port, p)
                return p

        return default_port

    def _init_cookie_db(self):
        '''
        Writes self.cookie_db, if set, to Default/Cookies under the chrome
        user data dir. Failure to write the file is logged, not raised.
        '''
        if self.cookie_db is None:
            return

        cookie_dir = os.path.join(self._chrome_user_data_dir, 'Default')
        cookie_location = os.path.join(cookie_dir, 'Cookies')
        self.logger.debug(
                'cookie DB provided, writing to %s', cookie_location)
        os.makedirs(cookie_dir, exist_ok=True)

        try:
            with open(cookie_location, 'wb') as cookie_file:
                cookie_file.write(self.cookie_db)
        except OSError:
            self.logger.error(
                    'exception writing cookie file at %s',
                    cookie_location, exc_info=True)

    def persist_and_read_cookie_db(self):
        '''
        Marks every cookie in the profile's cookie database persistent, then
        returns the raw bytes of the database file, or None if the file
        could not be read. Errors are logged, not raised.
        '''
        cookie_location = os.path.join(
                self._chrome_user_data_dir, 'Default', 'Cookies')
        self.logger.debug(
                'marking cookies persistent then reading file into memory: %s',
                cookie_location)
        try:
            conn = sqlite3.connect(cookie_location)
            try:
                # "with conn" only commits or rolls back the transaction, it
                # does not close the connection, hence the explicit close()
                with conn:
                    conn.execute('UPDATE cookies SET persistent = 1')
            finally:
                conn.close()
        except sqlite3.Error:
            self.logger.error('exception updating cookie DB', exc_info=True)

        cookie_db = None
        try:
            with open(cookie_location, 'rb') as cookie_file:
                cookie_db = cookie_file.read()
        except OSError:
            self.logger.error(
                    'exception reading from cookie DB file %s',
                    cookie_location, exc_info=True)
        return cookie_db

    def start(self):
        '''
        Returns websocket url to chrome window with about:blank loaded.
        '''
        # these can raise exceptions
        self._home_tmpdir = tempfile.TemporaryDirectory()
        self._chrome_user_data_dir = os.path.join(
                self._home_tmpdir.name, 'chrome-user-data')
        self._init_cookie_db()

        new_env = os.environ.copy()
        new_env['HOME'] = self._home_tmpdir.name
        self.port = self._find_available_port(self.port)
        chrome_args = [
                self.executable, '--use-mock-keychain', # mac thing
                '--user-data-dir=%s' % self._chrome_user_data_dir,
                '--remote-debugging-port=%s' % self.port,
                '--disable-web-sockets', '--disable-cache',
                '--window-size=1100,900', '--no-default-browser-check',
                '--disable-first-run-ui', '--no-first-run',
                '--homepage=about:blank', '--disable-direct-npapi-requests',
                '--disable-web-security', '--disable-notifications',
                '--disable-extensions', '--disable-save-password-bubble']
        if self.ignore_cert_errors:
            chrome_args.append('--ignore-certificate-errors')
        if self.proxy:
            chrome_args.append('--proxy-server=%s' % self.proxy)
        chrome_args.append('about:blank')
        self.logger.info(
                'running: %s', repr(subprocess.list2cmdline(chrome_args)))
        # start_new_session - new process group so we can kill the whole group
        self.chrome_process = subprocess.Popen(
                chrome_args, env=new_env, start_new_session=True,
                stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0)
        self._out_reader_thread = threading.Thread(
                target=self._read_stderr_stdout,
                name='ChromeOutReaderThread(pid=%s)' % self.chrome_process.pid)
        self._out_reader_thread.start()
        self.logger.info('chrome running, pid %s', self.chrome_process.pid)

        return self._websocket_url()

    def _websocket_url(self):
        '''
        Polls chrome's http://localhost:<port>/json endpoint until it lists
        an about:blank tab with a webSocketDebuggerUrl, and returns that url.

        Raises:
            Exception: if no url is obtained within 600 seconds; chrome is
                stopped before raising.
        '''
        timeout_sec = 600
        json_url = 'http://localhost:%s/json' % self.port
        # make this a member variable so that kill -QUIT reports it
        self._start = time.time()
        while True:
            try:
                raw_json = urllib.request.urlopen(json_url, timeout=30).read()
                all_debug_info = json.loads(raw_json.decode('utf-8'))
                debug_info = [x for x in all_debug_info
                              if x['url'] == 'about:blank']

                if debug_info and 'webSocketDebuggerUrl' in debug_info[0]:
                    self.logger.debug('%s returned %s', json_url, raw_json)
                    url = debug_info[0]['webSocketDebuggerUrl']
                    self.logger.info(
                            'got chrome window websocket debug url %s from %s',
                            url, json_url)
                    return url
            except Exception as e:
                # Exception rather than BaseException, so that
                # KeyboardInterrupt/SystemExit are not swallowed by the retry
                # loop; warn only about once every 10 seconds
                if int(time.time() - self._start) % 10 == 5:
                    self.logger.warning(
                            'problem with %s (will keep trying until timeout '
                            'of %d seconds): %s', json_url, timeout_sec, e)

            # only reached when no url was obtained on this attempt
            elapsed = time.time() - self._start
            if elapsed > timeout_sec:
                self.logger.error(
                        'killing chrome, failed to retrieve %s after %s '
                        'seconds', json_url, elapsed)
                self.stop()
                raise Exception(
                        'killed chrome, failed to retrieve %s after %s '
                        'seconds' % (json_url, elapsed))
            time.sleep(0.5)

    def _readline_nonblock(self, f):
        '''
        Reads from `f` a byte at a time until a newline has been read,
        shutdown is requested, no data arrives within 0.5 seconds, or end of
        file is reached. Returns the bytes read, possibly empty.
        '''
        buf = b''
        while (not self._shutdown.is_set()
                and (len(buf) == 0 or buf[-1] != 0xa)
                and select.select([f], [], [], 0.5)[0]):
            byte = f.read(1)
            if not byte:
                # end of file: select() keeps reporting the pipe readable,
                # so without this check the loop would spin forever
                break
            buf += byte
        return buf

    def _log_chrome_output(self, stream_name, buf):
        '''
        Logs a chunk of chrome output, labeled with `stream_name`, at TRACE
        level if it is known noise and at DEBUG level otherwise.
        '''
        if self._NOISY_OUTPUT_RE.search(buf):
            self.logger.log(
                    brozzler.TRACE, 'chrome pid %s %s %s',
                    self.chrome_process.pid, stream_name, buf)
        else:
            self.logger.debug(
                    'chrome pid %s %s %s',
                    self.chrome_process.pid, stream_name, buf)

    def _read_stderr_stdout(self):
        '''
        Thread target: logs chrome's stdout and stderr until shutdown.
        '''
        # XXX select doesn't work on windows
        try:
            while not self._shutdown.is_set():
                got_output = False
                streams = (
                        ('STDOUT', self.chrome_process.stdout),
                        ('STDERR', self.chrome_process.stderr))
                for stream_name, stream in streams:
                    buf = self._readline_nonblock(stream)
                    if buf:
                        got_output = True
                        self._log_chrome_output(stream_name, buf)
                if not got_output:
                    # avoids a busy loop when both pipes are at end of file
                    # (chrome exited on its own before stop() was called)
                    self._shutdown.wait(0.1)
        except Exception:
            self.logger.error('unexpected exception', exc_info=True)

    def _cleanup_home_tmpdir(self):
        '''
        Deletes the temporary home directory; failure is logged, not raised.
        '''
        if self._home_tmpdir is None:
            return
        try:
            self._home_tmpdir.cleanup()
        except Exception:
            self.logger.error(
                    'exception deleting %s', self._home_tmpdir,
                    exc_info=True)

    def stop(self):
        '''
        Terminates chrome's process group with SIGTERM, escalating to
        SIGKILL if it is still alive after 300 seconds, then joins the
        output reader thread and deletes the temporary home directory.

        Does nothing if chrome was never started or stop() already ran.
        '''
        if not self.chrome_process or self._shutdown.is_set():
            return

        timeout_sec = 300
        self._shutdown.set()
        self.logger.info('terminating chrome pgid %s', self.chrome_process.pid)

        os.killpg(self.chrome_process.pid, signal.SIGTERM)
        first_sigterm = time.time()

        try:
            while time.time() - first_sigterm < timeout_sec:
                time.sleep(0.5)

                status = self.chrome_process.poll()
                if status is not None:
                    if status == 0:
                        self.logger.info(
                                'chrome pid %s exited normally',
                                self.chrome_process.pid)
                    else:
                        self.logger.warning(
                                'chrome pid %s exited with nonzero status %s',
                                self.chrome_process.pid, status)

                    # XXX I would like to forcefully kill the process group
                    # here to guarantee no orphaned chromium subprocesses hang
                    # around, but there's a chance I suppose that some other
                    # process could have started with the same pgid
                    return

            self.logger.warning(
                    'chrome pid %s still alive %.1f seconds after sending '
                    'SIGTERM, sending SIGKILL', self.chrome_process.pid,
                    time.time() - first_sigterm)
            os.killpg(self.chrome_process.pid, signal.SIGKILL)
            status = self.chrome_process.wait()
            self.logger.warning(
                    'chrome pid %s reaped (status=%s) after killing with '
                    'SIGKILL', self.chrome_process.pid, status)
        finally:
            self._out_reader_thread.join()
            # in the finally clause so that the directory is removed on the
            # normal exit path too, not only after SIGKILL
            self._cleanup_home_tmpdir()
            self.chrome_process = None
|
@ -350,3 +350,69 @@ def brozzler_ensure_tables():
|
||||
|
||||
# sites, pages, jobs tables
|
||||
brozzler.frontier.RethinkDbFrontier(r)
|
||||
|
||||
def brozzler_list_captures():
    '''
    Command line entry point that prints, as indented json, the entries of
    the rethinkdb "captures" table matching the url given on the command
    line.

    An argument starting with "sha1:" is recognized as a digest lookup, but
    that is not implemented and raises an exception.
    '''
    import surt
    import rethinkdb

    parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    _add_rethinkdb_options(parser)
    _add_common_options(parser)
    parser.add_argument(
            'url_or_sha1', metavar='URL_or_SHA1',
            help='url or sha1 to look up in captures table')

    args = parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    servers = args.rethinkdb_servers.split(',')
    r = rethinkstuff.Rethinker(servers, args.rethinkdb_db)

    class Jsonner(json.JSONEncoder):
        # json encoder that serializes datetimes as iso 8601 strings
        def default(self, o):
            if isinstance(o, datetime.datetime):
                return o.isoformat()
            return super().default(o)

    if args.url_or_sha1.startswith('sha1:'):
        # TODO: look up by digest, presumably through the "sha1_warc_type"
        # index with a base32-encoded sha1 -- confirm against the captures
        # table schema before implementing
        raise Exception('not implemented')

    key = surt.surt(
            args.url_or_sha1, trailing_comma=True, host_massage=False,
            with_scheme=True)
    # the index is queried with only the first 150 characters of the surt
    abbr_key = key[:150]
    lower_bound = [abbr_key, rethinkdb.minval]
    upper_bound = [abbr_key + '!', rethinkdb.maxval]

    reql = r.table('captures').between(
            lower_bound, upper_bound, index='abbr_canon_surt_timestamp')
    reql = reql.order_by(index='abbr_canon_surt_timestamp')
    # the index range is bounded by the abbreviated key, so results are
    # filtered again on the full canon_surt
    reql = reql.filter(
            lambda capture: (capture['canon_surt'] >= key)
                    & (capture['canon_surt'] <= key))
    logging.debug('rethinkdb query: %s', reql)

    for result in reql.run():
        print(json.dumps(result, cls=Jsonner, indent=2))
|
||||
|
||||
|
262
brozzler/dashboard/static/brozzler-icon.svg
Normal file
262
brozzler/dashboard/static/brozzler-icon.svg
Normal file
@ -0,0 +1,262 @@
|
||||
<svg width='23' height='17' xmlns='http://www.w3.org/2000/svg' version='1.1'
|
||||
xmlns:svg="http://www.w3.org/2000/svg"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
|
||||
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
|
||||
>
|
||||
|
||||
<!-- make sure glyph is visible within svg window -->
|
||||
<g fill-rule='nonzero' transform='scale(0.0293) translate(30 0)'>
|
||||
<!-- draw actual outline using lines and Bezier curves-->
|
||||
<path fill='#666' stroke='black' stroke-width='0' transform='scale(1.35) translate(-46 517)' d='
|
||||
M 582,-127
|
||||
L 582,-229
|
||||
Q 582,-240 571,-240
|
||||
L 485,-240
|
||||
L 410,-323
|
||||
Q 364,-374 297,-374
|
||||
Q 237,-374 193,-332
|
||||
L 42,-193
|
||||
Q 39,-187 39,-186
|
||||
Q 39,-181 42,-178
|
||||
L 110,-110
|
||||
Q 112,-107 117,-107
|
||||
Q 123,-107 125,-110
|
||||
L 230,-214
|
||||
L 279,-156
|
||||
Q 315,-117 367,-117
|
||||
L 571,-117
|
||||
Q 582,-117 582,-127
|
||||
Z
|
||||
'/>
|
||||
|
||||
<defs
|
||||
id="defs3043">
|
||||
<linearGradient
|
||||
id="linearGradient3803">
|
||||
<stop
|
||||
style="stop-color:#d7def0;stop-opacity:1;"
|
||||
offset="0"
|
||||
id="stop3805" />
|
||||
<stop
|
||||
id="stop3811"
|
||||
offset="0.5"
|
||||
style="stop-color:#ffffff;stop-opacity:1" />
|
||||
<stop
|
||||
style="stop-color:#d5def0;stop-opacity:1"
|
||||
offset="1"
|
||||
id="stop3807" />
|
||||
</linearGradient>
|
||||
<linearGradient
|
||||
id="linearGradient3776"
|
||||
inkscape:collect="always">
|
||||
<stop
|
||||
id="stop3778"
|
||||
offset="0"
|
||||
style="stop-color:#b2cde9;stop-opacity:1" />
|
||||
<stop
|
||||
id="stop3780"
|
||||
offset="1"
|
||||
style="stop-color:#c4dbee;stop-opacity:1" />
|
||||
</linearGradient>
|
||||
<linearGradient
|
||||
id="linearGradient3750">
|
||||
<stop
|
||||
id="stop3752"
|
||||
offset="0"
|
||||
style="stop-color:#d0e2f1;stop-opacity:1" />
|
||||
<stop
|
||||
style="stop-color:#cadef0;stop-opacity:1"
|
||||
offset="0.85580856"
|
||||
id="stop3756" />
|
||||
<stop
|
||||
id="stop3754"
|
||||
offset="1"
|
||||
style="stop-color:#95bee3;stop-opacity:1" />
|
||||
</linearGradient>
|
||||
<linearGradient
|
||||
id="linearGradient3708">
|
||||
<stop
|
||||
style="stop-color:#658db6;stop-opacity:1"
|
||||
offset="0"
|
||||
id="stop3710" />
|
||||
<stop
|
||||
id="stop3716"
|
||||
offset="0.76777935"
|
||||
style="stop-color:#527fab;stop-opacity:1;" />
|
||||
<stop
|
||||
style="stop-color:#4071a0;stop-opacity:1"
|
||||
offset="1"
|
||||
id="stop3712" />
|
||||
</linearGradient>
|
||||
<linearGradient
|
||||
id="linearGradient3698">
|
||||
<stop
|
||||
style="stop-color:#96d0e1;stop-opacity:1"
|
||||
offset="0"
|
||||
id="stop3700" />
|
||||
<stop
|
||||
id="stop3706"
|
||||
offset="0.67819428"
|
||||
style="stop-color:#89b7e1;stop-opacity:1" />
|
||||
<stop
|
||||
style="stop-color:#699dd3;stop-opacity:1"
|
||||
offset="1"
|
||||
id="stop3702" />
|
||||
</linearGradient>
|
||||
<linearGradient
|
||||
inkscape:collect="always"
|
||||
id="linearGradient3647">
|
||||
<stop
|
||||
style="stop-color:#3b79bc;stop-opacity:1;"
|
||||
offset="0"
|
||||
id="stop3649" />
|
||||
<stop
|
||||
style="stop-color:#94b8e0;stop-opacity:1"
|
||||
offset="1"
|
||||
id="stop3651" />
|
||||
</linearGradient>
|
||||
<linearGradient
|
||||
inkscape:collect="always"
|
||||
id="linearGradient3588">
|
||||
<stop
|
||||
style="stop-color:#ffffff;stop-opacity:1"
|
||||
offset="0"
|
||||
id="stop3590" />
|
||||
<stop
|
||||
style="stop-color:#000000;stop-opacity:0;"
|
||||
offset="1"
|
||||
id="stop3592" />
|
||||
</linearGradient>
|
||||
<radialGradient
|
||||
inkscape:collect="always"
|
||||
xlink:href="#linearGradient3588"
|
||||
id="radialGradient3594"
|
||||
cx="-118.77966"
|
||||
cy="121.49152"
|
||||
fx="-118.77966"
|
||||
fy="121.49152"
|
||||
r="25.491526"
|
||||
gradientUnits="userSpaceOnUse"
|
||||
gradientTransform="matrix(0.02177942,-0.95743591,0.97872327,0.02221687,-235.0993,5.0684454)" />
|
||||
<linearGradient
|
||||
inkscape:collect="always"
|
||||
xlink:href="#linearGradient3647"
|
||||
id="linearGradient3653"
|
||||
x1="-397.81323"
|
||||
y1="149.18764"
|
||||
x2="-397.55933"
|
||||
y2="51.355946"
|
||||
gradientUnits="userSpaceOnUse" />
|
||||
<radialGradient
|
||||
inkscape:collect="always"
|
||||
xlink:href="#linearGradient3698"
|
||||
id="radialGradient3704"
|
||||
cx="-383.2746"
|
||||
cy="217.91029"
|
||||
fx="-383.2746"
|
||||
fy="217.91029"
|
||||
r="59.401995"
|
||||
gradientTransform="matrix(-1.2861568,-0.08596317,0.11453678,-1.7136762,-425.01982,469.50099)"
|
||||
gradientUnits="userSpaceOnUse" />
|
||||
<radialGradient
|
||||
inkscape:collect="always"
|
||||
xlink:href="#linearGradient3708"
|
||||
id="radialGradient3714"
|
||||
cx="-123.5"
|
||||
cy="-11.570732"
|
||||
fx="-123.5"
|
||||
fy="-11.570732"
|
||||
r="95.627118"
|
||||
gradientTransform="matrix(-0.00756512,0.55751399,-1.0314585,-0.01398286,113.23967,103.212)"
|
||||
gradientUnits="userSpaceOnUse" />
|
||||
<radialGradient
|
||||
inkscape:collect="always"
|
||||
xlink:href="#linearGradient3750"
|
||||
id="radialGradient3748"
|
||||
cx="-94.87291"
|
||||
cy="165.27281"
|
||||
fx="-94.87291"
|
||||
fy="165.27281"
|
||||
r="60.481357"
|
||||
gradientTransform="matrix(0.81293878,1.6998003,-2.1519091,1.0291615,564.39485,118.47915)"
|
||||
gradientUnits="userSpaceOnUse" />
|
||||
<linearGradient
|
||||
inkscape:collect="always"
|
||||
xlink:href="#linearGradient3776"
|
||||
id="linearGradient3774"
|
||||
x1="162.07127"
|
||||
y1="85.239708"
|
||||
x2="220.76114"
|
||||
y2="78.875748"
|
||||
gradientUnits="userSpaceOnUse"
|
||||
gradientTransform="translate(3.3917128,7.418629)" />
|
||||
<linearGradient
|
||||
inkscape:collect="always"
|
||||
xlink:href="#linearGradient3803"
|
||||
id="linearGradient3809"
|
||||
x1="-382.04123"
|
||||
y1="37.280548"
|
||||
x2="-381.39438"
|
||||
y2="165.56691"
|
||||
gradientUnits="userSpaceOnUse" />
|
||||
</defs>
|
||||
<path
|
||||
sodipodi:type="arc"
|
||||
style="fill:url(#radialGradient3594);fill-opacity:1;fill-rule:nonzero;stroke:none"
|
||||
id="path2814"
|
||||
sodipodi:cx="-118.23729"
|
||||
sodipodi:cy="122.57627"
|
||||
sodipodi:rx="25.491526"
|
||||
sodipodi:ry="25.491526"
|
||||
d="m -92.745764,122.57627 a 25.491526,25.491526 0 1 1 -50.983056,0 25.491526,25.491526 0 1 1 50.983056,0 z"
|
||||
transform="matrix(4.680851,0,0,4.7978723,685.10478,-449.69946)" />
|
||||
<path
|
||||
style="fill:url(#linearGradient3774);fill-opacity:1;fill-rule:nonzero;stroke:none"
|
||||
d="m 232.17258,88.120422 c 0,15.673918 -19.79135,34.931518 -45.84395,34.931518 -26.0526,0 -59.92241,-16.08123 -59.92241,-31.755152 0,-15.673924 21.11981,-28.38015 47.17241,-28.38015 19.90254,0 46.36122,18.293224 56.45971,20.3521 0.79179,1.710571 1.36862,2.925087 2.13424,4.851684 z"
|
||||
id="path3655"
|
||||
sodipodi:nodetypes="cssscc" />
|
||||
<path
|
||||
style="fill:#2e5c91;fill-opacity:1;fill-rule:nonzero;stroke:none"
|
||||
d="m 38.822019,65.971523 c 12.38148,-9.610993 35.314514,-1.245318 51.289554,19.334679 15.975027,20.579998 17.694937,51.065068 5.31349,60.676058 -12.38147,9.61099 -34.17571,-5.29155 -50.15074,-25.87156 -12.20392,-15.72181 -4.05062,-41.19089 -8.61646,-50.430553 0.61589,-1.122052 1.381696,-2.456607 2.164156,-3.708624 z"
|
||||
id="path3655-4-8"
|
||||
sodipodi:nodetypes="cssscc" />
|
||||
<path
|
||||
style="fill:url(#radialGradient3714);fill-opacity:1;fill-rule:nonzero;stroke:none"
|
||||
d="m 230.04347,83.261765 c -7.0081,-0.03265 -61.07025,0.289575 -107.66568,0.0654 -17.371,5.108098 -31.704627,13.258827 -39.181777,29.154945 -5.33639,-4.54237 -40.74576,-42.215609 -44.40678,-46.440684 31.38983,-41.648805 74.528017,-45.559321 82.915257,-45.559321 8.38724,0 70.64407,-8.631855 108.33898,62.77966 z"
|
||||
id="path3596"
|
||||
sodipodi:nodetypes="ccccsc" />
|
||||
<path
|
||||
style="fill:#699dd3;fill-opacity:1;fill-rule:nonzero;stroke:none"
|
||||
d="m 117.12454,243.96815 c -12.49835,-9.45851 -14.5752,-36.93927 1.14635,-57.71356 15.72155,-20.77428 41.03582,-34.94753 53.53417,-25.48904 12.49834,9.4585 7.44792,38.96701 -8.27364,59.74129 -12.01027,15.87024 -35.4911,16.88498 -43.22681,23.69505 -1.23894,-0.0455 -1.95523,-0.0605 -3.18007,-0.23374 z"
|
||||
id="path3655-4"
|
||||
sodipodi:nodetypes="cssscc" />
|
||||
<path
|
||||
style="fill:url(#radialGradient3748);fill-opacity:1;fill-rule:nonzero;stroke:none"
|
||||
d="m 120.3032,244.20103 c 3.58354,-6.02268 28.85859,-52.8991 52.69131,-92.9389 4.41104,-17.56095 5.34663,-33.64185 -4.5584,-48.14993 6.62173,-2.29412 58.23852,-13.976353 63.73684,-14.987686 19.9656,48.180076 1.44992,87.338276 -2.80522,94.565966 -4.25515,7.22768 -28.40179,65.25666 -109.06453,61.51055 z"
|
||||
id="path3596-1"
|
||||
sodipodi:nodetypes="ccccsc" />
|
||||
<path
|
||||
style="fill:url(#radialGradient3704);fill-opacity:1;fill-rule:nonzero;stroke:none"
|
||||
d="m 36.696853,69.642524 c 3.46858,6.089612 30.72312,52.780196 53.77852,93.272576 13.094367,12.50527 27.684997,19.48512 45.191737,18.03328 -1.2738,6.89113 -16.62898,57.75037 -18.4638,63.03126 -51.756237,-6.42158 -76.669777,-41.85476 -80.854757,-49.1233 -4.18497,-7.26855 -42.7297502,-56.91452 0.3483,-125.213816 z"
|
||||
id="path3596-1-7"
|
||||
sodipodi:nodetypes="ccccsc" />
|
||||
<path
|
||||
transform="matrix(0.77294737,0,0,0.77619098,435.90647,53.275706)"
|
||||
style="fill:url(#linearGradient3653);fill-opacity:1;stroke:url(#linearGradient3809);stroke-width:10.07013607;stroke-miterlimit:4;stroke-opacity:1"
|
||||
d="m -338.44068,101.42373 c 0,32.65032 -26.46832,59.11864 -59.11865,59.11864 -32.65032,0 -59.11864,-26.46832 -59.11864,-59.11864 0,-32.650327 26.46832,-59.118646 59.11864,-59.118646 32.65033,0 59.11865,26.468319 59.11865,59.118646 z"
|
||||
id="path3645" />
|
||||
<path
|
||||
style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
|
||||
d="m 163.54619,108.89582 c 18.52979,17.09836 16.03302,29.55794 10.0625,44 -3.10892,-22.25001 -2.34478,-32.42697 -10.0625,-44 z"
|
||||
id="rect3782"
|
||||
sodipodi:nodetypes="ccc" />
|
||||
<path
|
||||
style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
|
||||
d="m 101.42092,173.63924 c -22.645593,-14.47335 -29.809884,-45.71983 -8.813354,-62.99032 -10.847561,19.77514 -6.225429,32.39863 8.813354,62.99032 z"
|
||||
id="rect3782-4"
|
||||
sodipodi:nodetypes="ccc" />
|
||||
|
||||
<!-- <text x="400" y="100" fill="black" font-size="70">brozzler</text> -->
|
||||
</g>
|
||||
</svg>
|
After Width: | Height: | Size: 9.1 KiB |
@ -70,7 +70,7 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
|
||||
'filename': record['filename'],
|
||||
}
|
||||
if record['warc_type'] != 'revisit':
|
||||
blob['mime'] = record['content_type']
|
||||
blob['mime'] = record['content_type'] or '-'
|
||||
else:
|
||||
blob['mime'] = 'warc/revisit'
|
||||
# b'org,archive)/ 20160427215530 {"url": "https://archive.org/", "mime": "text/html", "status": "200", "digest": "VILUFXZD232SLUA6XROZQIMEVUPW6EIE", "length": "16001", "offset": "90144", "filename": "ARCHIVEIT-261-ONE_TIME-JOB209607-20160427215508135-00000.warc.gz"}'
|
||||
|
@ -78,7 +78,7 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
|
||||
|
||||
final_url = url
|
||||
while final_url in redirects:
|
||||
final_url = redirects[final_url].response.headers['location']
|
||||
final_url = redirects.pop(final_url).response.headers['location']
|
||||
|
||||
final_bounces = []
|
||||
for txn in self.transactions:
|
||||
|
5
setup.py
5
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1b8.dev129',
|
||||
version='1.1b8.dev142',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
@ -51,6 +51,7 @@ setuptools.setup(
|
||||
'brozzler-new-site=brozzler.cli:brozzler_new_site',
|
||||
'brozzler-worker=brozzler.cli:brozzler_worker',
|
||||
'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables',
|
||||
'brozzler-list-captures=brozzler.cli:brozzler_list_captures',
|
||||
'brozzler-dashboard=brozzler.dashboard:main',
|
||||
'brozzler-easy=brozzler.easy:main',
|
||||
'brozzler-wayback=brozzler.pywb:main',
|
||||
@ -61,7 +62,7 @@ setuptools.setup(
|
||||
'youtube-dl',
|
||||
'reppy==0.3.4',
|
||||
'requests',
|
||||
'websocket-client',
|
||||
'websocket-client!=0.39.0',
|
||||
'pillow==3.3.0',
|
||||
'surt>=0.3.0',
|
||||
'rethinkstuff>=0.1.5',
|
||||
|
@ -22,6 +22,10 @@ import http.server
|
||||
import threading
|
||||
import os
|
||||
import brozzler
|
||||
import brozzler.chrome
|
||||
import socket
|
||||
import logging
|
||||
import psutil
|
||||
|
||||
@pytest.fixture(scope='module')
|
||||
def httpd(request):
|
||||
@ -52,3 +56,20 @@ def test_robots(httpd):
|
||||
site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh')
|
||||
assert not brozzler.is_permitted_by_robots(site, url)
|
||||
|
||||
def test_find_available_port():
    '''
    Checks `Chrome._find_available_port()` against a port that is first free,
    then occupied by a listening socket, then free again.

    Returns early without asserting anything if
    `psutil.net_connections(kind='tcp')` raises `psutil.AccessDenied`.
    '''
    x = brozzler.chrome.Chrome(None, None)
    try:
        psutil.net_connections(kind='tcp')
    except psutil.AccessDenied:
        logging.warning(
                'skipping _find_available_port() test because '
                'psutil.net_connections(kind="tcp") raised AccessDenied')
        return
    # nothing is listening on 9800 yet, so the requested port comes back
    assert x._find_available_port(9800) == 9800
    # the context manager closes the socket even if the assertion inside
    # fails, so a failure here does not leave port 9800 occupied
    with socket.socket() as sock:
        sock.bind(('localhost', 9800))
        sock.listen(0)
        # 9800 is taken now, a different port is expected
        assert x._find_available_port(9800) == 9999
    # socket closed, 9800 is available again
    assert x._find_available_port(9800) == 9800
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user