Merge pull request #271 from internetarchive/avdempsey/use-black

Use black, enforce with GitHub Actions
commit 955cae6421
Author: Alex Dempsey (committed by GitHub)
Date: 2024-02-08 22:35:27 -08:00
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)
23 changed files with 4048 additions and 2797 deletions

.github/workflows/python-formatting.yml (new file, 31 lines)

@ -0,0 +1,31 @@
name: Python Formatting Check
on:
push:
branches:
- main
- master
pull_request:
branches:
- main
- master
jobs:
formatting:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Python 3.8
uses: actions/setup-python@v5
with:
python-version: '3.8'
- name: Create virtual environment
run: python -m venv venv
- name: Install black
run: |
./venv/bin/pip install --upgrade pip
./venv/bin/pip install black
- name: Run formatting check
run: make ck-format

.gitignore (2 changes)

@ -2,3 +2,5 @@
*.diff
.*.sw*
/brozzler.egg-info/
venv
.idea

Makefile (new file, 7 lines)

@ -0,0 +1,7 @@
.PHONY: format
format:
venv/bin/black -t py35 -t py36 -t py37 -t py38 -t py39 -t py310 -t py311 -t py312 .
.PHONY: ck-format
ck-format:
venv/bin/black --check .
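The GitHub Actions job above simply delegates to the Makefile's ck-format target, so the local check and the CI check stay identical. As a rough illustration (assuming black is installed and on PATH, for example inside the venv the Makefile expects), the same check can be driven from Python:

# Minimal sketch of the formatting check the CI job performs; the target
# directory "." and the bare "black" executable name are assumptions.
import subprocess
import sys

def check_formatting(path="."):
    # black --check exits nonzero when any file would be reformatted
    return subprocess.run(["black", "--check", path]).returncode == 0

if __name__ == "__main__":
    sys.exit(0 if check_formatting() else 1)

Running "make format" rewrites files in place, while "make ck-format" (and this sketch) only reports whether a rewrite is needed.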

brozzler/__init__.py

@ -19,33 +19,41 @@ limitations under the License.
import logging
from pkg_resources import get_distribution as _get_distribution
__version__ = _get_distribution('brozzler').version
__version__ = _get_distribution("brozzler").version
class ShutdownRequested(Exception):
pass
class NothingToClaim(Exception):
pass
class CrawlStopped(Exception):
pass
class PageInterstitialShown(Exception):
pass
class ProxyError(Exception):
pass
class ReachedTimeLimit(Exception):
pass
class ReachedLimit(Exception):
def __init__(self, http_error=None, warcprox_meta=None, http_payload=None):
import json
if http_error:
if "warcprox-meta" in http_error.headers:
self.warcprox_meta = json.loads(
http_error.headers["warcprox-meta"])
self.warcprox_meta = json.loads(http_error.headers["warcprox-meta"])
else:
self.warcprox_meta = None
self.http_payload = http_error.read()
@ -55,28 +63,39 @@ class ReachedLimit(Exception):
def __repr__(self):
return "ReachedLimit(warcprox_meta=%r,http_payload=%r)" % (
self.warcprox_meta if hasattr(self, 'warcprox_meta') else None,
self.http_payload if hasattr(self, 'http_payload') else None)
self.warcprox_meta if hasattr(self, "warcprox_meta") else None,
self.http_payload if hasattr(self, "http_payload") else None,
)
def __str__(self):
return self.__repr__()
# monkey-patch log levels TRACE and NOTICE
logging.TRACE = (logging.NOTSET + logging.DEBUG) // 2
def _logger_trace(self, msg, *args, **kwargs):
if self.isEnabledFor(logging.TRACE):
self._log(logging.TRACE, msg, args, **kwargs)
logging.Logger.trace = _logger_trace
logging.trace = logging.root.trace
logging.addLevelName(logging.TRACE, 'TRACE')
logging.addLevelName(logging.TRACE, "TRACE")
logging.NOTICE = (logging.INFO + logging.WARN) // 2
def _logger_notice(self, msg, *args, **kwargs):
if self.isEnabledFor(logging.NOTICE):
self._log(logging.NOTICE, msg, args, **kwargs)
logging.Logger.notice = _logger_notice
logging.notice = logging.root.notice
logging.addLevelName(logging.NOTICE, 'NOTICE')
logging.addLevelName(logging.NOTICE, "NOTICE")
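Once brozzler is imported, every logging.Logger instance gains trace() and notice() methods at the new levels; a minimal usage sketch (the logger name is arbitrary):

# Sketch: using the monkey-patched TRACE and NOTICE levels defined above.
import logging
import brozzler  # importing brozzler installs the extra levels

logging.basicConfig(level=logging.TRACE)  # TRACE == 5, between NOTSET and DEBUG
logger = logging.getLogger("example")
logger.trace("very chatty diagnostic detail")
logger.notice("noteworthy, but not a warning")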
# see https://github.com/internetarchive/brozzler/issues/91
def _logging_handler_handle(self, record):
@ -91,9 +110,13 @@ def _logging_handler_handle(self, record):
except:
pass
return rv
logging.Handler.handle = _logging_handler_handle
_behaviors = None
def behaviors(behaviors_dir=None):
"""Return list of JS behaviors loaded from YAML file.
@ -101,35 +124,43 @@ def behaviors(behaviors_dir=None):
`js-templates/`. Defaults to brozzler dir.
"""
import os, yaml, string
global _behaviors
if _behaviors is None:
d = behaviors_dir or os.path.dirname(__file__)
behaviors_yaml = os.path.join(d, 'behaviors.yaml')
behaviors_yaml = os.path.join(d, "behaviors.yaml")
with open(behaviors_yaml) as fin:
_behaviors = yaml.safe_load(fin)
return _behaviors
def behavior_script(url, template_parameters=None, behaviors_dir=None):
'''
"""
Returns the javascript behavior string populated with template_parameters.
'''
"""
import re, logging, json
for behavior in behaviors(behaviors_dir=behaviors_dir):
if re.match(behavior['url_regex'], url):
if re.match(behavior["url_regex"], url):
parameters = dict()
if 'default_parameters' in behavior:
parameters.update(behavior['default_parameters'])
if "default_parameters" in behavior:
parameters.update(behavior["default_parameters"])
if template_parameters:
parameters.update(template_parameters)
template = jinja2_environment(behaviors_dir).get_template(
behavior['behavior_js_template'])
behavior["behavior_js_template"]
)
script = template.render(parameters)
logging.info(
'using template=%r populated with parameters=%r for %r',
behavior['behavior_js_template'], json.dumps(parameters), url)
"using template=%r populated with parameters=%r for %r",
behavior["behavior_js_template"],
json.dumps(parameters),
url,
)
return script
return None
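For illustration, a hedged sketch of how a caller renders a behavior for a page (the URL is a placeholder; which behavior matches, if any, depends on the url_regex entries in behaviors.yaml):

# Sketch: look up and render the JS behavior for a URL.
import brozzler

script = brozzler.behavior_script("https://example.com/some/page")
if script is None:
    print("no behavior url_regex matched this url")
else:
    print("rendered %d characters of behavior javascript" % len(script))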
class ThreadExceptionGate:
logger = logging.getLogger(__module__ + "." + __qualname__)
@ -142,8 +173,7 @@ class ThreadExceptionGate:
def __enter__(self):
assert self.thread == threading.current_thread()
if self.pending_exception:
self.logger.info(
'raising pending exception %s', self.pending_exception)
self.logger.info("raising pending exception %s", self.pending_exception)
tmp = self.pending_exception
self.pending_exception = None
raise tmp
@ -154,25 +184,32 @@ class ThreadExceptionGate:
def __exit__(self, exc_type, exc_value, traceback):
assert self.thread == threading.current_thread()
self.ok_to_raise.clear()
return False # don't swallow exception
return False # don't swallow exception
def queue_exception(self, e):
with self.lock:
if self.pending_exception:
self.logger.warning(
'%r already pending for thread %r, discarding %r',
self.pending_exception, self.thread, e)
"%r already pending for thread %r, discarding %r",
self.pending_exception,
self.thread,
e,
)
else:
self.pending_exception = e
def __repr__(self):
return '<ThreadExceptionGate(%s)>' % self.thread
return "<ThreadExceptionGate(%s)>" % self.thread
import threading
_thread_exception_gates = {}
_thread_exception_gates_lock = threading.Lock()
def thread_exception_gate(thread=None):
'''
"""
Returns a `ThreadExceptionGate` for `thread` (current thread by default).
`ThreadExceptionGate` is a context manager which allows exceptions to be
@ -191,7 +228,7 @@ def thread_exception_gate(thread=None):
is queued, and raised immediately if and when the thread enters the
context. Only one exception will be queued this way at a time, others are
discarded.
'''
"""
if not thread:
thread = threading.current_thread()
@ -201,10 +238,12 @@ def thread_exception_gate(thread=None):
return _thread_exception_gates[thread]
thread_accept_exceptions = thread_exception_gate
def thread_raise(thread, exctype):
'''
"""
Raises or queues the exception `exctype` for the thread `thread`.
See the documentation on the function `thread_exception_gate()` for more
@ -218,40 +257,43 @@ def thread_raise(thread, exctype):
Raises:
TypeError if `exctype` is not a class
ValueError, SystemError in case of unexpected problems
'''
"""
import ctypes, inspect, threading, logging
if not inspect.isclass(exctype):
raise TypeError(
'cannot raise %s, only exception types can be raised (not '
'instances)' % exctype)
"cannot raise %s, only exception types can be raised (not "
"instances)" % exctype
)
gate = thread_exception_gate(thread)
with gate.lock:
if gate.ok_to_raise.is_set() and thread.is_alive():
gate.ok_to_raise.clear()
logging.info('raising %s in thread %s', exctype, thread)
logging.info("raising %s in thread %s", exctype, thread)
res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
ctypes.c_long(thread.ident), ctypes.py_object(exctype))
ctypes.c_long(thread.ident), ctypes.py_object(exctype)
)
if res == 0:
raise ValueError(
'invalid thread id? thread.ident=%s' % thread.ident)
raise ValueError("invalid thread id? thread.ident=%s" % thread.ident)
elif res != 1:
# if it returns a number greater than one, you're in trouble,
# and you should call it again with exc=NULL to revert the effect
ctypes.pythonapi.PyThreadState_SetAsyncExc(thread.ident, 0)
raise SystemError('PyThreadState_SetAsyncExc failed')
raise SystemError("PyThreadState_SetAsyncExc failed")
else:
logging.info('queueing %s for thread %s', exctype, thread)
logging.info("queueing %s for thread %s", exctype, thread)
gate.queue_exception(exctype)
def sleep(duration):
'''
"""
Sleeps for duration seconds in increments of 0.5 seconds.
Use this so that the sleep can be interrupted by thread_raise().
'''
"""
import time
start = time.time()
while True:
elapsed = time.time() - start
@ -259,32 +301,41 @@ def sleep(duration):
break
time.sleep(min(duration - elapsed, 0.5))
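A hedged sketch tying thread_accept_exceptions(), thread_raise() and sleep() together: the worker opts in to asynchronous exceptions by entering the gate, and sleep()'s 0.5-second increments let the raise land promptly.

# Sketch: interrupt a worker thread with thread_raise().
import threading
import brozzler

def worker():
    try:
        with brozzler.thread_accept_exceptions():
            brozzler.sleep(300)  # stand-in for long-running work
    except brozzler.ShutdownRequested:
        print("worker: shutdown requested, cleaning up")

th = threading.Thread(target=worker)
th.start()
brozzler.thread_raise(th, brozzler.ShutdownRequested)  # raised now, or queued
th.join()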
_jinja2_env = None
def jinja2_environment(behaviors_dir=None):
global _jinja2_env
if not _jinja2_env:
import os, jinja2, json
if behaviors_dir:
_loader = jinja2.FileSystemLoader(os.path.join(behaviors_dir,
'js-templates'))
_loader = jinja2.FileSystemLoader(
os.path.join(behaviors_dir, "js-templates")
)
else:
_loader=jinja2.PackageLoader('brozzler', 'js-templates')
_loader = jinja2.PackageLoader("brozzler", "js-templates")
_jinja2_env = jinja2.Environment(loader=_loader, auto_reload=False)
_jinja2_env.filters['json'] = json.dumps
_jinja2_env.filters["json"] = json.dumps
return _jinja2_env
import urlcanon
def _remove_query(url):
url.question_mark = b''
url.query = b''
url.question_mark = b""
url.query = b""
# XXX chop off path after last slash??
site_surt_canon = urlcanon.Canonicalizer(
urlcanon.semantic.steps + [_remove_query])
site_surt_canon = urlcanon.Canonicalizer(urlcanon.semantic.steps + [_remove_query])
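In other words, the site canonicalizer treats URLs that differ only in their query string as the same thing for scoping purposes. A small sketch (the example URL is arbitrary):

# Sketch: query strings are stripped during site-scope canonicalization.
import urlcanon

url = urlcanon.parse_url("https://example.com/path/page?utm_source=feed")
site_surt_canon(url)  # applies urlcanon's semantic steps plus _remove_query
print(url.query)      # b"" -- the query was removed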
import doublethink
import datetime
EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(
tzinfo=doublethink.UTC)
EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=doublethink.UTC)
# we could make this configurable if there's a good reason
MAX_PAGE_FAILURES = 3
@ -294,10 +345,31 @@ from brozzler.robots import is_permitted_by_robots
from brozzler.frontier import RethinkDbFrontier
from brozzler.browser import Browser, BrowserPool, BrowsingException
from brozzler.model import (
new_job, new_job_file, new_site, Job, Page, Site, InvalidJobConf)
new_job,
new_job_file,
new_site,
Job,
Page,
Site,
InvalidJobConf,
)
from brozzler.cli import suggest_default_chrome_exe
__all__ = ['Page', 'Site', 'BrozzlerWorker', 'is_permitted_by_robots',
'RethinkDbFrontier', 'Browser', 'BrowserPool', 'BrowsingException',
'new_job', 'new_site', 'Job', 'new_job_file', 'InvalidJobConf',
'sleep', 'thread_accept_exceptions', 'thread_raise']
__all__ = [
"Page",
"Site",
"BrozzlerWorker",
"is_permitted_by_robots",
"RethinkDbFrontier",
"Browser",
"BrowserPool",
"BrowsingException",
"new_job",
"new_site",
"Job",
"new_job_file",
"InvalidJobConf",
"sleep",
"thread_accept_exceptions",
"thread_raise",
]

File diff suppressed because it is too large

brozzler/chrome.py

@ -1,4 +1,4 @@
'''
"""
brozzler/chrome.py - manages the chrome/chromium browser for brozzler
Copyright (C) 2014-2023 Internet Archive
@ -14,7 +14,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import logging
import urllib.request
@ -31,39 +31,43 @@ import json
import tempfile
import sys
def check_version(chrome_exe):
'''
"""
Raises SystemExit if `chrome_exe` is not a supported browser version.
Must run in the main thread to have the desired effect.
'''
"""
# mac$ /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --version
# Google Chrome 64.0.3282.140
# Google Chrome 64.0.3282.140
# mac$ /Applications/Google\ Chrome\ Canary.app/Contents/MacOS/Google\ Chrome\ Canary --version
# Google Chrome 66.0.3341.0 canary
# linux$ chromium-browser --version
# Using PPAPI flash.
# --ppapi-flash-path=/usr/lib/adobe-flashplugin/libpepflashplayer.so --ppapi-flash-version=
# Chromium 61.0.3163.100 Built on Ubuntu , running on Ubuntu 16.04
cmd = [chrome_exe, '--version']
cmd = [chrome_exe, "--version"]
out = subprocess.check_output(cmd, timeout=60)
m = re.search(br'(Chromium|Google Chrome) ([\d.]+)', out)
m = re.search(rb"(Chromium|Google Chrome) ([\d.]+)", out)
if not m:
sys.exit(
'unable to parse browser version from output of '
'%r: %r' % (subprocess.list2cmdline(cmd), out))
"unable to parse browser version from output of "
"%r: %r" % (subprocess.list2cmdline(cmd), out)
)
version_str = m.group(2).decode()
major_version = int(version_str.split('.')[0])
major_version = int(version_str.split(".")[0])
if major_version < 64:
sys.exit('brozzler requires chrome/chromium version 64 or '
'later but %s reports version %s' % (
chrome_exe, version_str))
sys.exit(
"brozzler requires chrome/chromium version 64 or "
"later but %s reports version %s" % (chrome_exe, version_str)
)
class Chrome:
logger = logging.getLogger(__module__ + '.' + __qualname__)
logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self, chrome_exe, port=9222, ignore_cert_errors=False):
'''
"""
Initializes instance of this class.
Doesn't start the browser, start() does that.
@ -73,7 +77,7 @@ class Chrome:
port: chrome debugging protocol port (default 9222)
ignore_cert_errors: configure chrome to accept all certs (default
False)
'''
"""
self.port = port
self.chrome_exe = chrome_exe
self.ignore_cert_errors = ignore_cert_errors
@ -81,63 +85,72 @@ class Chrome:
self.chrome_process = None
def __enter__(self):
'''
"""
Returns websocket url to chrome window with about:blank loaded.
'''
"""
return self.start()
def __exit__(self, *args):
self.stop()
def _init_cookie_db(self, cookie_db):
cookie_dir = os.path.join(self._chrome_user_data_dir, 'Default')
cookie_location = os.path.join(cookie_dir, 'Cookies')
self.logger.debug('cookie DB provided, writing to %s', cookie_location)
cookie_dir = os.path.join(self._chrome_user_data_dir, "Default")
cookie_location = os.path.join(cookie_dir, "Cookies")
self.logger.debug("cookie DB provided, writing to %s", cookie_location)
os.makedirs(cookie_dir, exist_ok=True)
try:
with open(cookie_location, 'wb') as cookie_file:
with open(cookie_location, "wb") as cookie_file:
cookie_file.write(cookie_db)
except OSError:
self.logger.error(
'exception writing cookie file at %s',
cookie_location, exc_info=True)
"exception writing cookie file at %s", cookie_location, exc_info=True
)
def persist_and_read_cookie_db(self):
cookie_location = os.path.join(
self._chrome_user_data_dir, 'Default', 'Cookies')
cookie_location = os.path.join(self._chrome_user_data_dir, "Default", "Cookies")
self.logger.debug(
'marking cookies persistent then reading file into memory: %s',
cookie_location)
"marking cookies persistent then reading file into memory: %s",
cookie_location,
)
try:
with sqlite3.connect(cookie_location) as conn:
cur = conn.cursor()
cur.execute('UPDATE cookies SET is_persistent = 1')
cur.execute("UPDATE cookies SET is_persistent = 1")
except sqlite3.Error:
try:
# db schema changed around version 66, this is the old schema
with sqlite3.connect(cookie_location) as conn:
cur = conn.cursor()
cur.execute('UPDATE cookies SET persistent = 1')
cur.execute("UPDATE cookies SET persistent = 1")
except sqlite3.Error:
self.logger.error(
'exception updating cookie DB %s', cookie_location,
exc_info=True)
"exception updating cookie DB %s", cookie_location, exc_info=True
)
cookie_db = None
try:
with open(cookie_location, 'rb') as cookie_file:
with open(cookie_location, "rb") as cookie_file:
cookie_db = cookie_file.read()
except OSError:
self.logger.error(
'exception reading from cookie DB file %s',
cookie_location, exc_info=True)
"exception reading from cookie DB file %s",
cookie_location,
exc_info=True,
)
return cookie_db
def start(self, proxy=None, cookie_db=None, disk_cache_dir=None,
disk_cache_size=None, websocket_timeout=60,
window_height=900, window_width=1400):
'''
def start(
self,
proxy=None,
cookie_db=None,
disk_cache_dir=None,
disk_cache_size=None,
websocket_timeout=60,
window_height=900,
window_width=1400,
):
"""
Starts chrome/chromium process.
Args:
@ -154,103 +167,126 @@ class Chrome:
window_height, window_width: window height and width, in pixels
Returns:
websocket url to chrome window with about:blank loaded
'''
"""
# these can raise exceptions
self._home_tmpdir = tempfile.TemporaryDirectory()
self._chrome_user_data_dir = os.path.join(
self._home_tmpdir.name, 'chrome-user-data')
self._home_tmpdir.name, "chrome-user-data"
)
if cookie_db:
self._init_cookie_db(cookie_db)
self._shutdown.clear()
new_env = os.environ.copy()
new_env['HOME'] = self._home_tmpdir.name
new_env["HOME"] = self._home_tmpdir.name
chrome_args = [
self.chrome_exe,
'-v',
'--headless',
'--remote-debugging-port=%s' % self.port,
'--use-mock-keychain', # mac thing
'--user-data-dir=%s' % self._chrome_user_data_dir,
'--disable-background-networking', '--disable-breakpad',
'--disable-renderer-backgrounding', '--disable-hang-monitor',
'--disable-background-timer-throttling', '--mute-audio',
'--disable-web-sockets',
f'--window-size={window_width},{window_height}',
'--no-default-browser-check',
'--disable-first-run-ui', '--no-first-run',
'--homepage=about:blank', '--disable-direct-npapi-requests',
'--disable-web-security', '--disable-notifications',
'--disable-extensions', '--disable-save-password-bubble',
'--disable-sync']
self.chrome_exe,
"-v",
"--headless",
"--remote-debugging-port=%s" % self.port,
"--use-mock-keychain", # mac thing
"--user-data-dir=%s" % self._chrome_user_data_dir,
"--disable-background-networking",
"--disable-breakpad",
"--disable-renderer-backgrounding",
"--disable-hang-monitor",
"--disable-background-timer-throttling",
"--mute-audio",
"--disable-web-sockets",
f"--window-size={window_width},{window_height}",
"--no-default-browser-check",
"--disable-first-run-ui",
"--no-first-run",
"--homepage=about:blank",
"--disable-direct-npapi-requests",
"--disable-web-security",
"--disable-notifications",
"--disable-extensions",
"--disable-save-password-bubble",
"--disable-sync",
]
extra_chrome_args = os.environ.get('BROZZLER_EXTRA_CHROME_ARGS')
extra_chrome_args = os.environ.get("BROZZLER_EXTRA_CHROME_ARGS")
if extra_chrome_args:
chrome_args.extend(extra_chrome_args.split())
if disk_cache_dir:
chrome_args.append('--disk-cache-dir=%s' % disk_cache_dir)
chrome_args.append("--disk-cache-dir=%s" % disk_cache_dir)
if disk_cache_size:
chrome_args.append('--disk-cache-size=%s' % disk_cache_size)
chrome_args.append("--disk-cache-size=%s" % disk_cache_size)
if self.ignore_cert_errors:
chrome_args.append('--ignore-certificate-errors')
chrome_args.append("--ignore-certificate-errors")
if proxy:
chrome_args.append('--proxy-server=%s' % proxy)
chrome_args.append('about:blank')
self.logger.info('running: %r', subprocess.list2cmdline(chrome_args))
chrome_args.append("--proxy-server=%s" % proxy)
chrome_args.append("about:blank")
self.logger.info("running: %r", subprocess.list2cmdline(chrome_args))
# start_new_session - new process group so we can kill the whole group
self.chrome_process = subprocess.Popen(
chrome_args, env=new_env, start_new_session=True,
stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0)
chrome_args,
env=new_env,
start_new_session=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
bufsize=0,
)
self._out_reader_thread = threading.Thread(
target=self._read_stderr_stdout,
name='ChromeOutReaderThread:%s' % self.port, daemon=True)
target=self._read_stderr_stdout,
name="ChromeOutReaderThread:%s" % self.port,
daemon=True,
)
self._out_reader_thread.start()
self.logger.info('chrome running, pid %s' % self.chrome_process.pid)
self.logger.info("chrome running, pid %s" % self.chrome_process.pid)
return self._websocket_url(timeout_sec=websocket_timeout)
def _websocket_url(self, timeout_sec = 60):
json_url = 'http://localhost:%s/json' % self.port
def _websocket_url(self, timeout_sec=60):
json_url = "http://localhost:%s/json" % self.port
# make this a member variable so that kill -QUIT reports it
self._start = time.time()
self._last_warning = self._start
while True:
try:
raw_json = urllib.request.urlopen(json_url, timeout=30).read()
all_debug_info = json.loads(raw_json.decode('utf-8'))
debug_info = [x for x in all_debug_info
if x['url'] == 'about:blank']
all_debug_info = json.loads(raw_json.decode("utf-8"))
debug_info = [x for x in all_debug_info if x["url"] == "about:blank"]
if debug_info and 'webSocketDebuggerUrl' in debug_info[0]:
self.logger.debug('%s returned %s', json_url, raw_json)
url = debug_info[0]['webSocketDebuggerUrl']
if debug_info and "webSocketDebuggerUrl" in debug_info[0]:
self.logger.debug("%s returned %s", json_url, raw_json)
url = debug_info[0]["webSocketDebuggerUrl"]
self.logger.info(
'got chrome window websocket debug url %s from %s',
url, json_url)
"got chrome window websocket debug url %s from %s",
url,
json_url,
)
return url
except brozzler.ShutdownRequested:
raise
except Exception as e:
if time.time() - self._last_warning > 30:
self.logger.warning(
'problem with %s (will keep trying until timeout '
'of %d seconds): %s', json_url, timeout_sec, e)
"problem with %s (will keep trying until timeout "
"of %d seconds): %s",
json_url,
timeout_sec,
e,
)
self._last_warning = time.time()
finally:
e = None
if self.chrome_process:
if time.time() - self._start > timeout_sec:
e = Exception(
'killing chrome, failed to retrieve %s after '
'%s seconds' % (
json_url, time.time() - self._start))
"killing chrome, failed to retrieve %s after "
"%s seconds" % (json_url, time.time() - self._start)
)
elif self.chrome_process.poll() is not None:
e = Exception(
'chrome process died with status %s' % self.chrome_process.poll())
"chrome process died with status %s"
% self.chrome_process.poll()
)
else:
time.sleep(0.5)
else:
e = Exception('??? self.chrome_process is not set ???')
e = Exception("??? self.chrome_process is not set ???")
if e:
self.stop()
raise e
@ -258,11 +294,13 @@ class Chrome:
def _read_stderr_stdout(self):
# XXX select doesn't work on windows
def readline_nonblock(f):
buf = b''
buf = b""
try:
while not self._shutdown.is_set() and (
len(buf) == 0 or buf[-1] != 0xa) and select.select(
[f],[],[],0.5)[0]:
while (
not self._shutdown.is_set()
and (len(buf) == 0 or buf[-1] != 0xA)
and select.select([f], [], [], 0.5)[0]
):
buf += f.read(1)
except (ValueError, OSError):
# When the chrome process crashes, stdout & stderr are closed
@ -276,16 +314,16 @@ class Chrome:
buf = readline_nonblock(self.chrome_process.stdout)
if buf:
self.logger.trace(
'chrome pid %s STDOUT %s',
self.chrome_process.pid, buf)
"chrome pid %s STDOUT %s", self.chrome_process.pid, buf
)
buf = readline_nonblock(self.chrome_process.stderr)
if buf:
self.logger.trace(
'chrome pid %s STDERR %s',
self.chrome_process.pid, buf)
"chrome pid %s STDERR %s", self.chrome_process.pid, buf
)
except:
self.logger.error('unexpected exception', exc_info=True)
self.logger.error("unexpected exception", exc_info=True)
def stop(self):
if not self.chrome_process or self._shutdown.is_set():
@ -294,8 +332,7 @@ class Chrome:
timeout_sec = 300
if self.chrome_process.poll() is None:
self.logger.info(
'terminating chrome pgid %s', self.chrome_process.pid)
self.logger.info("terminating chrome pgid %s", self.chrome_process.pid)
os.killpg(self.chrome_process.pid, signal.SIGTERM)
t0 = time.time()
@ -306,12 +343,14 @@ class Chrome:
if status is not None:
if status == 0:
self.logger.info(
'chrome pid %s exited normally',
self.chrome_process.pid)
"chrome pid %s exited normally", self.chrome_process.pid
)
else:
self.logger.warning(
'chrome pid %s exited with nonzero status %s',
self.chrome_process.pid, status)
"chrome pid %s exited with nonzero status %s",
self.chrome_process.pid,
status,
)
# XXX I would like to forcefully kill the process group
# here to guarantee no orphaned chromium subprocesses hang
@ -321,14 +360,18 @@ class Chrome:
time.sleep(0.5)
self.logger.warning(
'chrome pid %s still alive %.1f seconds after sending '
'SIGTERM, sending SIGKILL', self.chrome_process.pid,
time.time() - t0)
"chrome pid %s still alive %.1f seconds after sending "
"SIGTERM, sending SIGKILL",
self.chrome_process.pid,
time.time() - t0,
)
os.killpg(self.chrome_process.pid, signal.SIGKILL)
status = self.chrome_process.wait()
self.logger.warning(
'chrome pid %s reaped (status=%s) after killing with '
'SIGKILL', self.chrome_process.pid, status)
"chrome pid %s reaped (status=%s) after killing with " "SIGKILL",
self.chrome_process.pid,
status,
)
finally:
self.chrome_process.stdout.close()
@ -337,8 +380,7 @@ class Chrome:
self._home_tmpdir.cleanup()
except:
self.logger.error(
'exception deleting %s', self._home_tmpdir,
exc_info=True)
"exception deleting %s", self._home_tmpdir, exc_info=True
)
self._out_reader_thread.join()
self.chrome_process = None
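Taken together, check_version(), start() and stop() support a simple standalone usage pattern; a hedged sketch (the "chromium-browser" executable name is an assumption, brozzler normally picks one via suggest_default_chrome_exe()):

# Sketch: run a headless chrome and grab its devtools websocket url.
import brozzler.chrome

brozzler.chrome.check_version("chromium-browser")  # SystemExit if too old
with brozzler.chrome.Chrome("chromium-browser", port=9222) as websocket_url:
    print("devtools websocket:", websocket_url)
# leaving the block calls stop(), which SIGTERMs (then SIGKILLs) the process group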

File diff suppressed because it is too large

brozzler/dashboard/__init__.py

@ -1,4 +1,4 @@
'''
"""
brozzler/dashboard/__init__.py - flask app for brozzler dashboard, defines api
endpoints etc
@ -15,17 +15,20 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import logging
import sys
try:
import flask
except ImportError as e:
logging.critical(
'%s: %s\n\nYou might need to run "pip install '
'brozzler[dashboard]".\nSee README.rst for more information.',
type(e).__name__, e)
'%s: %s\n\nYou might need to run "pip install '
'brozzler[dashboard]".\nSee README.rst for more information.',
type(e).__name__,
e,
)
sys.exit(1)
import doublethink
import json
@ -41,33 +44,44 @@ app = flask.Flask(__name__)
# configure with environment variables
SETTINGS = {
'RETHINKDB_SERVERS': os.environ.get(
'BROZZLER_RETHINKDB_SERVERS', 'localhost').split(','),
'RETHINKDB_DB': os.environ.get('BROZZLER_RETHINKDB_DB', 'brozzler'),
'WAYBACK_BASEURL': os.environ.get(
'WAYBACK_BASEURL', 'http://localhost:8880/brozzler'),
'DASHBOARD_PORT': os.environ.get('DASHBOARD_PORT', '8000'),
'DASHBOARD_INTERFACE': os.environ.get('DASHBOARD_INTERFACE', 'localhost')
"RETHINKDB_SERVERS": os.environ.get(
"BROZZLER_RETHINKDB_SERVERS", "localhost"
).split(","),
"RETHINKDB_DB": os.environ.get("BROZZLER_RETHINKDB_DB", "brozzler"),
"WAYBACK_BASEURL": os.environ.get(
"WAYBACK_BASEURL", "http://localhost:8880/brozzler"
),
"DASHBOARD_PORT": os.environ.get("DASHBOARD_PORT", "8000"),
"DASHBOARD_INTERFACE": os.environ.get("DASHBOARD_INTERFACE", "localhost"),
}
rr = doublethink.Rethinker(
SETTINGS['RETHINKDB_SERVERS'], db=SETTINGS['RETHINKDB_DB'])
rr = doublethink.Rethinker(SETTINGS["RETHINKDB_SERVERS"], db=SETTINGS["RETHINKDB_DB"])
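Because every setting is read from the environment when the module loads, configuration happens before brozzler.dashboard is imported; a hedged sketch (hostnames and ports are placeholders):

# Sketch: configure and launch the dashboard from Python.
import os

os.environ["BROZZLER_RETHINKDB_SERVERS"] = "db0.example.org,db1.example.org:38015"
os.environ["BROZZLER_RETHINKDB_DB"] = "brozzler"
os.environ["DASHBOARD_PORT"] = "8000"

import brozzler.dashboard
brozzler.dashboard.main(["brozzler-dashboard"])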
_svc_reg = None
def service_registry():
global _svc_reg
if not _svc_reg:
_svc_reg = doublethink.ServiceRegistry(rr)
return _svc_reg
@app.route("/api/sites/<site_id>/queued_count")
@app.route("/api/site/<site_id>/queued_count")
def queued_count(site_id):
reql = rr.table("pages").between(
[site_id, 0, False, r.minval], [site_id, 0, False, r.maxval],
index="priority_by_site").count()
reql = (
rr.table("pages")
.between(
[site_id, 0, False, r.minval],
[site_id, 0, False, r.maxval],
index="priority_by_site",
)
.count()
)
logging.debug("querying rethinkdb: %s", reql)
count = reql.run()
return flask.jsonify(count=count)
@app.route("/api/sites/<site_id>/queue")
@app.route("/api/site/<site_id>/queue")
def queue(site_id):
@ -75,38 +89,52 @@ def queue(site_id):
start = flask.request.args.get("start", 0)
end = flask.request.args.get("end", start + 90)
reql = rr.table("pages").between(
[site_id, 0, False, r.minval], [site_id, 0, False, r.maxval],
index="priority_by_site")[start:end]
[site_id, 0, False, r.minval],
[site_id, 0, False, r.maxval],
index="priority_by_site",
)[start:end]
logging.debug("querying rethinkdb: %s", reql)
queue_ = reql.run()
return flask.jsonify(queue_=list(queue_))
@app.route("/api/sites/<site_id>/pages_count")
@app.route("/api/site/<site_id>/pages_count")
@app.route("/api/sites/<site_id>/page_count")
@app.route("/api/site/<site_id>/page_count")
def page_count(site_id):
reql = rr.table("pages").between(
reql = (
rr.table("pages")
.between(
[site_id, 1, False, r.minval],
[site_id, r.maxval, False, r.maxval],
index="priority_by_site").count()
index="priority_by_site",
)
.count()
)
logging.debug("querying rethinkdb: %s", reql)
count = reql.run()
return flask.jsonify(count=count)
@app.route("/api/sites/<site_id>/pages")
@app.route("/api/site/<site_id>/pages")
def pages(site_id):
"""Pages already crawled."""
start = int(flask.request.args.get("start", 0))
end = int(flask.request.args.get("end", start + 90))
reql = rr.table("pages").between(
[site_id, 1, r.minval], [site_id, r.maxval, r.maxval],
index="least_hops").order_by(index="least_hops")[start:end]
reql = (
rr.table("pages")
.between(
[site_id, 1, r.minval], [site_id, r.maxval, r.maxval], index="least_hops"
)
.order_by(index="least_hops")[start:end]
)
logging.debug("querying rethinkdb: %s", reql)
pages_ = reql.run()
return flask.jsonify(pages=list(pages_))
@app.route("/api/pages/<page_id>")
@app.route("/api/page/<page_id>")
def page(page_id):
@ -115,6 +143,7 @@ def page(page_id):
page_ = reql.run()
return flask.jsonify(page_)
@app.route("/api/pages/<page_id>/yaml")
@app.route("/api/page/<page_id>/yaml")
def page_yaml(page_id):
@ -122,8 +151,9 @@ def page_yaml(page_id):
logging.debug("querying rethinkdb: %s", reql)
page_ = reql.run()
return app.response_class(
yaml.dump(page_, default_flow_style=False),
mimetype="application/yaml")
yaml.dump(page_, default_flow_style=False), mimetype="application/yaml"
)
@app.route("/api/sites/<site_id>")
@app.route("/api/site/<site_id>")
@ -135,6 +165,7 @@ def site(site_id):
s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
return flask.jsonify(s)
@app.route("/api/sites/<site_id>/yaml")
@app.route("/api/site/<site_id>/yaml")
def site_yaml(site_id):
@ -142,8 +173,9 @@ def site_yaml(site_id):
logging.debug("querying rethinkdb: %s", reql)
site_ = reql.run()
return app.response_class(
yaml.dump(site_, default_flow_style=False),
mimetype="application/yaml")
yaml.dump(site_, default_flow_style=False), mimetype="application/yaml"
)
@app.route("/api/stats/<bucket>")
def stats(bucket):
@ -152,6 +184,7 @@ def stats(bucket):
stats_ = reql.run()
return flask.jsonify(stats_)
@app.route("/api/jobs/<job_id>/sites")
@app.route("/api/job/<job_id>/sites")
def sites(job_id):
@ -168,6 +201,7 @@ def sites(job_id):
s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
return flask.jsonify(sites=sites_)
@app.route("/api/jobless-sites")
def jobless_sites():
# XXX inefficient (unindexed) query
@ -180,6 +214,7 @@ def jobless_sites():
s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
return flask.jsonify(sites=sites_)
@app.route("/api/jobs/<job_id>")
@app.route("/api/job/<job_id>")
def job(job_id):
@ -192,6 +227,7 @@ def job(job_id):
job_ = reql.run()
return flask.jsonify(job_)
@app.route("/api/jobs/<job_id>/yaml")
@app.route("/api/job/<job_id>/yaml")
def job_yaml(job_id):
@ -203,19 +239,22 @@ def job_yaml(job_id):
logging.debug("querying rethinkdb: %s", reql)
job_ = reql.run()
return app.response_class(
yaml.dump(job_, default_flow_style=False),
mimetype="application/yaml")
yaml.dump(job_, default_flow_style=False), mimetype="application/yaml"
)
@app.route("/api/workers")
def workers():
workers_ = service_registry().available_services("brozzler-worker")
return flask.jsonify(workers=list(workers_))
@app.route("/api/services")
def services():
services_ = service_registry().available_services()
return flask.jsonify(services=list(services_))
@app.route("/api/jobs")
def jobs():
reql = rr.table("jobs").order_by(r.desc("id"))
@ -223,20 +262,24 @@ def jobs():
jobs_ = list(reql.run())
return flask.jsonify(jobs=jobs_)
@app.route("/api/config")
def config():
return flask.jsonify(config=SETTINGS)
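Each of these endpoints returns JSON, so they can be scripted against directly; a hedged sketch (assumes a dashboard listening on its default localhost:8000):

# Sketch: list jobs via the dashboard's JSON API.
import json
import urllib.request

with urllib.request.urlopen("http://localhost:8000/api/jobs") as response:
    jobs = json.loads(response.read().decode("utf-8"))["jobs"]
for job in jobs:
    print(job["id"], job.get("status"))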
@app.route("/api/<path:path>")
@app.route("/api", defaults={"path":""})
@app.route("/api", defaults={"path": ""})
def api404(path):
flask.abort(404)
@app.route("/", defaults={"path": ""})
@app.route("/<path:path>")
def root(path):
return flask.render_template("index.html")
try:
import gunicorn.app.base
from gunicorn.six import iteritems
@ -255,8 +298,12 @@ try:
def load_config(self):
config = dict(
[(key, value) for key, value in iteritems(self.options)
if key in self.cfg.settings and value is not None])
[
(key, value)
for key, value in iteritems(self.options)
if key in self.cfg.settings and value is not None
]
)
for key, value in iteritems(config):
self.cfg.set(key.lower(), value)
self.cfg.set("logger_class", BypassGunicornLogging)
@ -270,37 +317,42 @@ try:
GunicornBrozzlerDashboard(app, options).run()
except ImportError:
def run():
logging.info("running brozzler-dashboard using simple flask app.run")
app.run(host=SETTINGS['DASHBOARD_INTERFACE'], port=SETTINGS['DASHBOARD_PORT'])
app.run(host=SETTINGS["DASHBOARD_INTERFACE"], port=SETTINGS["DASHBOARD_PORT"])
def main(argv=None):
import argparse
import brozzler.cli
argv = argv or sys.argv
arg_parser = argparse.ArgumentParser(
prog=os.path.basename(argv[0]),
formatter_class=argparse.RawDescriptionHelpFormatter,
description=(
'brozzler-dashboard - web application for viewing brozzler '
'crawl status'),
epilog=(
'brozzler-dashboard has no command line options, but can be '
'configured using the following environment variables:\n\n'
' BROZZLER_RETHINKDB_SERVERS rethinkdb servers, e.g. '
'db0.foo.org,db0.foo.org:38015,db1.foo.org (default: '
'localhost)\n'
' BROZZLER_RETHINKDB_DB rethinkdb database name '
'(default: brozzler)\n'
' WAYBACK_BASEURL base url for constructing wayback '
'links (default http://localhost:8880/brozzler)'
' DASHBOARD_PORT brozzler-dashboard listening port (default: 8000)\n'
' DASHBOARD_INTERFACE brozzler-dashboard network interface binding (default: localhost)'))
prog=os.path.basename(argv[0]),
formatter_class=argparse.RawDescriptionHelpFormatter,
description=(
"brozzler-dashboard - web application for viewing brozzler " "crawl status"
),
epilog=(
"brozzler-dashboard has no command line options, but can be "
"configured using the following environment variables:\n\n"
" BROZZLER_RETHINKDB_SERVERS rethinkdb servers, e.g. "
"db0.foo.org,db0.foo.org:38015,db1.foo.org (default: "
"localhost)\n"
" BROZZLER_RETHINKDB_DB rethinkdb database name "
"(default: brozzler)\n"
" WAYBACK_BASEURL base url for constructing wayback "
"links (default http://localhost:8880/brozzler)"
" DASHBOARD_PORT brozzler-dashboard listening port (default: 8000)\n"
" DASHBOARD_INTERFACE brozzler-dashboard network interface binding (default: localhost)"
),
)
brozzler.cli.add_common_options(arg_parser, argv)
args = arg_parser.parse_args(args=argv[1:])
brozzler.cli.configure_logging(args)
run()
if __name__ == "__main__":
main()

brozzler/easy.py

@ -1,5 +1,5 @@
#!/usr/bin/env python
'''
"""
brozzler-easy - brozzler-worker, warcprox, pywb, and brozzler-dashboard all
working together in a single process
@ -16,10 +16,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import sys
import logging
try:
import warcprox
import warcprox.main
@ -30,9 +31,11 @@ try:
import brozzler.dashboard
except ImportError as e:
logging.critical(
'%s: %s\n\nYou might need to run "pip install '
'brozzler[easy]".\nSee README.rst for more information.',
type(e).__name__, e)
'%s: %s\n\nYou might need to run "pip install '
'brozzler[easy]".\nSee README.rst for more information.',
type(e).__name__,
e,
)
sys.exit(1)
import argparse
import brozzler
@ -46,76 +49,112 @@ import doublethink
import traceback
import socketserver
def _build_arg_parser(argv=None):
argv = argv or sys.argv
arg_parser = argparse.ArgumentParser(
formatter_class=brozzler.cli.BetterArgumentDefaultsHelpFormatter,
prog=os.path.basename(argv[0]), description=(
'brozzler-easy - easy deployment of brozzler, with '
'brozzler-worker, warcprox, pywb, and brozzler-dashboard all '
'running in a single process'))
formatter_class=brozzler.cli.BetterArgumentDefaultsHelpFormatter,
prog=os.path.basename(argv[0]),
description=(
"brozzler-easy - easy deployment of brozzler, with "
"brozzler-worker, warcprox, pywb, and brozzler-dashboard all "
"running in a single process"
),
)
# common args
brozzler.cli.add_rethinkdb_options(arg_parser)
arg_parser.add_argument(
'-d', '--warcs-dir', dest='warcs_dir', default='./warcs',
help='where to write warcs')
"-d",
"--warcs-dir",
dest="warcs_dir",
default="./warcs",
help="where to write warcs",
)
# warcprox args
arg_parser.add_argument(
'-c', '--cacert', dest='cacert',
default='./%s-warcprox-ca.pem' % socket.gethostname(),
help=(
'warcprox CA certificate file; if file does not exist, it '
'will be created'))
"-c",
"--cacert",
dest="cacert",
default="./%s-warcprox-ca.pem" % socket.gethostname(),
help=(
"warcprox CA certificate file; if file does not exist, it "
"will be created"
),
)
arg_parser.add_argument(
'--certs-dir', dest='certs_dir',
default='./%s-warcprox-ca' % socket.gethostname(),
help='where warcprox will store and load generated certificates')
"--certs-dir",
dest="certs_dir",
default="./%s-warcprox-ca" % socket.gethostname(),
help="where warcprox will store and load generated certificates",
)
arg_parser.add_argument(
'--onion-tor-socks-proxy', dest='onion_tor_socks_proxy',
default=None, help=(
'host:port of tor socks proxy, used only to connect to '
'.onion sites'))
"--onion-tor-socks-proxy",
dest="onion_tor_socks_proxy",
default=None,
help=("host:port of tor socks proxy, used only to connect to " ".onion sites"),
)
# brozzler-worker args
arg_parser.add_argument(
'-e', '--chrome-exe', dest='chrome_exe',
default=brozzler.cli.suggest_default_chrome_exe(),
help='executable to use to invoke chrome')
"-e",
"--chrome-exe",
dest="chrome_exe",
default=brozzler.cli.suggest_default_chrome_exe(),
help="executable to use to invoke chrome",
)
arg_parser.add_argument(
'-n', '--max-browsers', dest='max_browsers',
type=int, default=1, help=(
'max number of chrome instances simultaneously '
'browsing pages'))
"-n",
"--max-browsers",
dest="max_browsers",
type=int,
default=1,
help=("max number of chrome instances simultaneously " "browsing pages"),
)
# pywb args
arg_parser.add_argument(
'--pywb-address', dest='pywb_address',
default='0.0.0.0',
help='pywb wayback address to listen on')
"--pywb-address",
dest="pywb_address",
default="0.0.0.0",
help="pywb wayback address to listen on",
)
arg_parser.add_argument(
'--pywb-port', dest='pywb_port', type=int,
default=8880, help='pywb wayback port')
"--pywb-port",
dest="pywb_port",
type=int,
default=8880,
help="pywb wayback port",
)
# dashboard args
arg_parser.add_argument(
'--dashboard-address', dest='dashboard_address',
default='localhost',
help='brozzler dashboard address to listen on')
"--dashboard-address",
dest="dashboard_address",
default="localhost",
help="brozzler dashboard address to listen on",
)
arg_parser.add_argument(
'--dashboard-port', dest='dashboard_port',
type=int, default=8881, help='brozzler dashboard port')
"--dashboard-port",
dest="dashboard_port",
type=int,
default=8881,
help="brozzler dashboard port",
)
# common at the bottom args
brozzler.cli.add_common_options(arg_parser, argv)
return arg_parser
class ThreadingWSGIServer(
socketserver.ThreadingMixIn, wsgiref.simple_server.WSGIServer):
socketserver.ThreadingMixIn, wsgiref.simple_server.WSGIServer
):
pass
class BrozzlerEasyController:
logger = logging.getLogger(__module__ + "." + __qualname__)
@ -123,25 +162,31 @@ class BrozzlerEasyController:
self.stop = threading.Event()
self.args = args
self.warcprox_controller = warcprox.controller.WarcproxController(
self._warcprox_opts(args))
self._warcprox_opts(args)
)
self.brozzler_worker = self._init_brozzler_worker(args)
self.pywb_httpd = self._init_pywb(args)
self.dashboard_httpd = self._init_brozzler_dashboard(args)
def _init_brozzler_dashboard(self, args):
return wsgiref.simple_server.make_server(
args.dashboard_address, args.dashboard_port,
brozzler.dashboard.app, ThreadingWSGIServer)
args.dashboard_address,
args.dashboard_port,
brozzler.dashboard.app,
ThreadingWSGIServer,
)
def _init_brozzler_worker(self, args):
rr = doublethink.Rethinker(
args.rethinkdb_servers.split(","), args.rethinkdb_db)
rr = doublethink.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db)
frontier = brozzler.RethinkDbFrontier(rr)
service_registry = doublethink.ServiceRegistry(rr)
worker = brozzler.worker.BrozzlerWorker(
frontier, service_registry, chrome_exe=args.chrome_exe,
proxy='%s:%s' % self.warcprox_controller.proxy.server_address,
max_browsers=args.max_browsers)
frontier,
service_registry,
chrome_exe=args.chrome_exe,
proxy="%s:%s" % self.warcprox_controller.proxy.server_address,
max_browsers=args.max_browsers,
)
return worker
def _init_pywb(self, args):
@ -152,66 +197,67 @@ class BrozzlerEasyController:
brozzler.pywb.monkey_patch_fuzzy_query()
brozzler.pywb.monkey_patch_calc_search_range()
if args.warcs_dir.endswith('/'):
if args.warcs_dir.endswith("/"):
warcs_dir = args.warcs_dir
else:
warcs_dir = args.warcs_dir + '/'
warcs_dir = args.warcs_dir + "/"
conf = {
'collections': {
'brozzler': {
'index_paths': brozzler.pywb.RethinkCDXSource(
"collections": {
"brozzler": {
"index_paths": brozzler.pywb.RethinkCDXSource(
servers=args.rethinkdb_servers.split(","),
db=args.rethinkdb_db, table='captures')
db=args.rethinkdb_db,
table="captures",
)
},
},
# 'enable_http_proxy': True,
# 'enable_memento': True,
'archive_paths': warcs_dir,
'enable_cdx_api': True,
'framed_replay': True,
'port': args.pywb_port,
'enable_auto_colls': False,
"archive_paths": warcs_dir,
"enable_cdx_api": True,
"framed_replay": True,
"port": args.pywb_port,
"enable_auto_colls": False,
}
wsgi_app = pywb.framework.wsgi_wrappers.init_app(
pywb.webapp.pywb_init.create_wb_router, config=conf,
load_yaml=False)
pywb.webapp.pywb_init.create_wb_router, config=conf, load_yaml=False
)
# disable is_hop_by_hop restrictions
wsgiref.handlers.is_hop_by_hop = lambda x: False
return wsgiref.simple_server.make_server(
args.pywb_address, args.pywb_port, wsgi_app,
ThreadingWSGIServer)
args.pywb_address, args.pywb_port, wsgi_app, ThreadingWSGIServer
)
def start(self):
self.logger.info('starting warcprox')
self.logger.info("starting warcprox")
self.warcprox_controller.start()
# XXX wait til fully started?
self.logger.info('starting brozzler-worker')
self.logger.info("starting brozzler-worker")
self.brozzler_worker.start()
self.logger.info(
'starting pywb at %s:%s', *self.pywb_httpd.server_address)
self.logger.info("starting pywb at %s:%s", *self.pywb_httpd.server_address)
threading.Thread(target=self.pywb_httpd.serve_forever).start()
self.logger.info(
'starting brozzler-dashboard at %s:%s',
*self.dashboard_httpd.server_address)
"starting brozzler-dashboard at %s:%s", *self.dashboard_httpd.server_address
)
threading.Thread(target=self.dashboard_httpd.serve_forever).start()
def shutdown(self):
self.logger.info('shutting down brozzler-dashboard')
self.logger.info("shutting down brozzler-dashboard")
self.dashboard_httpd.shutdown()
self.logger.info('shutting down brozzler-worker')
self.logger.info("shutting down brozzler-worker")
self.brozzler_worker.shutdown_now()
# brozzler-worker is fully shut down at this point
self.logger.info('shutting down pywb')
self.logger.info("shutting down pywb")
self.pywb_httpd.shutdown()
self.logger.info('shutting down warcprox')
self.logger.info("shutting down warcprox")
self.warcprox_controller.shutdown()
def wait_for_shutdown_request(self):
@ -222,14 +268,14 @@ class BrozzlerEasyController:
self.shutdown()
def _warcprox_opts(self, args):
'''
"""
Takes args as produced by the argument parser built by
_build_arg_parser and builds warcprox arguments object suitable to pass
to warcprox.main.init_controller. Copies some arguments, renames some,
populates some with defaults appropriate for brozzler-easy, etc.
'''
"""
warcprox_opts = warcprox.Options()
warcprox_opts.address = 'localhost'
warcprox_opts.address = "localhost"
# let the OS choose an available port; discover it later using
# sock.getsockname()[1]
warcprox_opts.port = 0
@ -237,17 +283,18 @@ class BrozzlerEasyController:
warcprox_opts.certs_dir = args.certs_dir
warcprox_opts.directory = args.warcs_dir
warcprox_opts.gzip = True
warcprox_opts.prefix = 'brozzler'
warcprox_opts.size = 1000 * 1000* 1000
warcprox_opts.prefix = "brozzler"
warcprox_opts.size = 1000 * 1000 * 1000
warcprox_opts.rollover_idle_time = 3 * 60
warcprox_opts.digest_algorithm = 'sha1'
warcprox_opts.digest_algorithm = "sha1"
warcprox_opts.base32 = True
warcprox_opts.stats_db_file = None
warcprox_opts.playback_port = None
warcprox_opts.playback_index_db_file = None
warcprox_opts.rethinkdb_big_table_url = (
'rethinkdb://%s/%s/captures' % (
args.rethinkdb_servers, args.rethinkdb_db))
warcprox_opts.rethinkdb_big_table_url = "rethinkdb://%s/%s/captures" % (
args.rethinkdb_servers,
args.rethinkdb_db,
)
warcprox_opts.queue_size = 500
warcprox_opts.max_threads = None
warcprox_opts.profile = False
@ -259,9 +306,11 @@ class BrozzlerEasyController:
for th in threading.enumerate():
state_strs.append(str(th))
stack = traceback.format_stack(sys._current_frames()[th.ident])
state_strs.append(''.join(stack))
logging.warning('dumping state (caught signal {})\n{}'.format(
signum, '\n'.join(state_strs)))
state_strs.append("".join(stack))
logging.warning(
"dumping state (caught signal {})\n{}".format(signum, "\n".join(state_strs))
)
def main(argv=None):
argv = argv or sys.argv
@ -271,8 +320,8 @@ def main(argv=None):
brozzler.chrome.check_version(args.chrome_exe)
controller = BrozzlerEasyController(args)
signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())
signal.signal(signal.SIGINT, lambda a,b: controller.stop.set())
signal.signal(signal.SIGTERM, lambda a, b: controller.stop.set())
signal.signal(signal.SIGINT, lambda a, b: controller.stop.set())
signal.signal(signal.SIGQUIT, controller.dump_state)
controller.start()
controller.wait_for_shutdown_request()

brozzler/frontier.py

@ -1,4 +1,4 @@
'''
"""
brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages
Copyright (C) 2014-2018 Internet Archive
@ -14,7 +14,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import logging
import brozzler
@ -27,9 +27,11 @@ import urlcanon
r = rdb.RethinkDB()
class UnexpectedDbResult(Exception):
pass
class RethinkDbFrontier:
logger = logging.getLogger(__module__ + "." + __qualname__)
@ -47,40 +49,49 @@ class RethinkDbFrontier:
tables = self.rr.table_list().run()
if not "sites" in tables:
self.logger.info(
"creating rethinkdb table 'sites' in database %r",
self.rr.dbname)
"creating rethinkdb table 'sites' in database %r", self.rr.dbname
)
self.rr.table_create(
"sites", shards=self.shards, replicas=self.replicas).run()
self.rr.table("sites").index_create("sites_last_disclaimed", [
r.row["status"], r.row["last_disclaimed"]]).run()
"sites", shards=self.shards, replicas=self.replicas
).run()
self.rr.table("sites").index_create(
"sites_last_disclaimed", [r.row["status"], r.row["last_disclaimed"]]
).run()
self.rr.table("sites").index_create("job_id").run()
if not "pages" in tables:
self.logger.info(
"creating rethinkdb table 'pages' in database %r",
self.rr.dbname)
"creating rethinkdb table 'pages' in database %r", self.rr.dbname
)
self.rr.table_create(
"pages", shards=self.shards, replicas=self.replicas).run()
self.rr.table("pages").index_create("priority_by_site", [
r.row["site_id"], r.row["brozzle_count"],
r.row["claimed"], r.row["priority"]]).run()
"pages", shards=self.shards, replicas=self.replicas
).run()
self.rr.table("pages").index_create(
"priority_by_site",
[
r.row["site_id"],
r.row["brozzle_count"],
r.row["claimed"],
r.row["priority"],
],
).run()
# this index is for displaying pages in a sensible order in the web
# console
self.rr.table("pages").index_create("least_hops", [
r.row["site_id"], r.row["brozzle_count"],
r.row["hops_from_seed"]]).run()
self.rr.table("pages").index_create(
"least_hops",
[r.row["site_id"], r.row["brozzle_count"], r.row["hops_from_seed"]],
).run()
if not "jobs" in tables:
self.logger.info(
"creating rethinkdb table 'jobs' in database %r",
self.rr.dbname)
"creating rethinkdb table 'jobs' in database %r", self.rr.dbname
)
self.rr.table_create(
"jobs", shards=self.shards, replicas=self.replicas).run()
"jobs", shards=self.shards, replicas=self.replicas
).run()
def _vet_result(self, result, **kwargs):
# self.logger.debug("vetting expected=%s result=%s", kwargs, result)
# {'replaced': 0, 'errors': 0, 'skipped': 0, 'inserted': 1, 'deleted': 0, 'generated_keys': ['292859c1-4926-4b27-9d87-b2c367667058'], 'unchanged': 0}
for k in [
"replaced", "errors", "skipped", "inserted", "deleted",
"unchanged"]:
for k in ["replaced", "errors", "skipped", "inserted", "deleted", "unchanged"]:
if k in kwargs:
expected = kwargs[k]
else:
@ -88,81 +99,110 @@ class RethinkDbFrontier:
if isinstance(expected, list):
if result.get(k) not in kwargs[k]:
raise UnexpectedDbResult(
"expected %r to be one of %r in %r" % (
k, expected, result))
"expected %r to be one of %r in %r" % (k, expected, result)
)
else:
if result.get(k) != expected:
raise UnexpectedDbResult("expected %r to be %r in %r" % (
k, expected, result))
raise UnexpectedDbResult(
"expected %r to be %r in %r" % (k, expected, result)
)
def claim_sites(self, n=1):
self.logger.trace('claiming up to %s sites to brozzle', n)
self.logger.trace("claiming up to %s sites to brozzle", n)
result = (
self.rr.table('sites').get_all(r.args(
r.db(self.rr.dbname).table('sites', read_mode='majority')
.between(
['ACTIVE', r.minval], ['ACTIVE', r.maxval],
index='sites_last_disclaimed')
.order_by(r.desc('claimed'), 'last_disclaimed')
.fold(
{}, lambda acc, site: acc.merge(
r.branch(
site.has_fields('job_id'),
r.object(
site['job_id'].coerce_to('string'),
acc[site['job_id'].coerce_to('string')].default(0).add(1)),
{})),
emit=lambda acc, site, new_acc: r.branch(
r.and_(
r.or_(
site['claimed'].not_(),
site['last_claimed'].lt(r.now().sub(60*60))),
r.or_(
site.has_fields('max_claimed_sites').not_(),
new_acc[site['job_id'].coerce_to('string')].le(site['max_claimed_sites']))),
[site['id']], []))
.limit(n)))
self.rr.table("sites")
.get_all(
r.args(
r.db(self.rr.dbname)
.table("sites", read_mode="majority")
.between(
["ACTIVE", r.minval],
["ACTIVE", r.maxval],
index="sites_last_disclaimed",
)
.order_by(r.desc("claimed"), "last_disclaimed")
.fold(
{},
lambda acc, site: acc.merge(
r.branch(
site.has_fields("job_id"),
r.object(
site["job_id"].coerce_to("string"),
acc[site["job_id"].coerce_to("string")]
.default(0)
.add(1),
),
{},
)
),
emit=lambda acc, site, new_acc: r.branch(
r.and_(
r.or_(
site["claimed"].not_(),
site["last_claimed"].lt(r.now().sub(60 * 60)),
),
r.or_(
site.has_fields("max_claimed_sites").not_(),
new_acc[site["job_id"].coerce_to("string")].le(
site["max_claimed_sites"]
),
),
),
[site["id"]],
[],
),
)
.limit(n)
)
)
.update(
# try to avoid a race condition resulting in multiple
# brozzler-workers claiming the same site
# see https://github.com/rethinkdb/rethinkdb/issues/3235#issuecomment-60283038
r.branch(
r.or_(
r.row['claimed'].not_(),
r.row['last_claimed'].lt(r.now().sub(60*60))),
{'claimed': True, 'last_claimed': r.now()},
{}),
return_changes=True)).run()
r.row["claimed"].not_(),
r.row["last_claimed"].lt(r.now().sub(60 * 60)),
),
{"claimed": True, "last_claimed": r.now()},
{},
),
return_changes=True,
)
).run()
self._vet_result(
result, replaced=list(range(n+1)),
unchanged=list(range(n+1)))
result, replaced=list(range(n + 1)), unchanged=list(range(n + 1))
)
sites = []
for i in range(result["replaced"]):
if result["changes"][i]["old_val"]["claimed"]:
self.logger.warning(
"re-claimed site that was still marked 'claimed' "
"because it was last claimed a long time ago "
"at %s, and presumably some error stopped it from "
"being disclaimed",
result["changes"][i]["old_val"]["last_claimed"])
"re-claimed site that was still marked 'claimed' "
"because it was last claimed a long time ago "
"at %s, and presumably some error stopped it from "
"being disclaimed",
result["changes"][i]["old_val"]["last_claimed"],
)
site = brozzler.Site(self.rr, result["changes"][i]["new_val"])
sites.append(site)
self.logger.debug('claimed %s sites', len(sites))
self.logger.debug("claimed %s sites", len(sites))
if sites:
return sites
else:
raise brozzler.NothingToClaim
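A hedged sketch of how a worker drives the frontier (the RethinkDB server list is a placeholder):

# Sketch: claim up to three ACTIVE, unclaimed sites, as a brozzler-worker would.
import doublethink
import brozzler

rr = doublethink.Rethinker(["localhost"], db="brozzler")
frontier = brozzler.RethinkDbFrontier(rr)
try:
    for site in frontier.claim_sites(n=3):
        print("claimed site", site.id)
except brozzler.NothingToClaim:
    print("no active, unclaimed sites at the moment")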
def enforce_time_limit(self, site):
'''
"""
Raises `brozzler.ReachedTimeLimit` if appropriate.
'''
if (site.time_limit and site.time_limit > 0
and site.elapsed() > site.time_limit):
"""
if site.time_limit and site.time_limit > 0 and site.elapsed() > site.time_limit:
self.logger.debug(
"site FINISHED_TIME_LIMIT! time_limit=%s "
"elapsed=%s %s", site.time_limit, site.elapsed(), site)
"site FINISHED_TIME_LIMIT! time_limit=%s " "elapsed=%s %s",
site.time_limit,
site.elapsed(),
site,
)
raise brozzler.ReachedTimeLimit
def claim_page(self, site, worker_id):
@ -170,26 +210,37 @@ class RethinkDbFrontier:
# brozzler-worker can be working on a site at a time, and that would
# have to be the worker calling this method, so if something is claimed
# already, it must have been left that way because of some error
result = self.rr.table("pages").between(
result = (
self.rr.table("pages")
.between(
[site.id, 0, r.minval, r.minval],
[site.id, 0, r.maxval, r.maxval],
index="priority_by_site").order_by(
index=r.desc("priority_by_site")).limit(
1).update({
"claimed":True,
"last_claimed_by":worker_id},
return_changes="always").run()
self._vet_result(result, unchanged=[0,1], replaced=[0,1])
index="priority_by_site",
)
.order_by(index=r.desc("priority_by_site"))
.limit(1)
.update(
{"claimed": True, "last_claimed_by": worker_id}, return_changes="always"
)
.run()
)
self._vet_result(result, unchanged=[0, 1], replaced=[0, 1])
if result["unchanged"] == 0 and result["replaced"] == 0:
raise brozzler.NothingToClaim
else:
return brozzler.Page(self.rr, result["changes"][0]["new_val"])
def has_outstanding_pages(self, site):
results_iter = self.rr.table("pages").between(
results_iter = (
self.rr.table("pages")
.between(
[site.id, 0, r.minval, r.minval],
[site.id, 0, r.maxval, r.maxval],
index="priority_by_site").limit(1).run()
index="priority_by_site",
)
.limit(1)
.run()
)
return len(list(results_iter)) > 0
def completed_page(self, site, page):
@ -202,22 +253,24 @@ class RethinkDbFrontier:
site.save()
def active_jobs(self):
results = self.rr.table("jobs").filter({"status":"ACTIVE"}).run()
results = self.rr.table("jobs").filter({"status": "ACTIVE"}).run()
for result in results:
yield brozzler.Job(self.rr, result)
def honor_stop_request(self, site):
"""Raises brozzler.CrawlStopped if stop has been requested."""
site.refresh()
if (site.stop_requested
and site.stop_requested <= doublethink.utcnow()):
if site.stop_requested and site.stop_requested <= doublethink.utcnow():
self.logger.info("stop requested for site %s", site.id)
raise brozzler.CrawlStopped
if site.job_id:
job = brozzler.Job.load(self.rr, site.job_id)
if (job and job.stop_requested
and job.stop_requested <= doublethink.utcnow()):
if (
job
and job.stop_requested
and job.stop_requested <= doublethink.utcnow()
):
self.logger.info("stop requested for job %s", site.job_id)
raise brozzler.CrawlStopped
@ -239,8 +292,7 @@ class RethinkDbFrontier:
return False
n += 1
self.logger.info(
"all %s sites finished, job %s is FINISHED!", n, job.id)
self.logger.info("all %s sites finished, job %s is FINISHED!", n, job.id)
job.finish()
job.save()
return True
@ -270,13 +322,11 @@ class RethinkDbFrontier:
def resume_job(self, job):
job.status = "ACTIVE"
job.stop_requested = None
job.starts_and_stops.append(
{"start":doublethink.utcnow(), "stop":None})
job.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None})
job.save()
for site in self.job_sites(job.id):
site.status = "ACTIVE"
site.starts_and_stops.append(
{"start":doublethink.utcnow(), "stop":None})
site.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None})
site.save()
def resume_site(self, site):
@ -285,51 +335,55 @@ class RethinkDbFrontier:
job = brozzler.Job.load(self.rr, site.job_id)
job.status = "ACTIVE"
site.stop_requested = None
job.starts_and_stops.append(
{"start":doublethink.utcnow(), "stop":None})
job.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None})
job.save()
site.status = "ACTIVE"
site.starts_and_stops.append(
{"start":doublethink.utcnow(), "stop":None})
site.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None})
site.save()
def _build_fresh_page(self, site, parent_page, url, hops_off=0):
url_for_scoping = urlcanon.semantic(url)
url_for_crawling = urlcanon.whatwg(url)
hashtag = (url_for_crawling.hash_sign
+ url_for_crawling.fragment).decode('utf-8')
hashtag = (url_for_crawling.hash_sign + url_for_crawling.fragment).decode(
"utf-8"
)
urlcanon.canon.remove_fragment(url_for_crawling)
page = brozzler.Page(
self.rr,
{
"url": str(url_for_crawling),
"site_id": site.id,
"job_id": site.job_id,
"hops_from_seed": parent_page.hops_from_seed + 1,
"hop_path": str(parent_page.hop_path if parent_page.hop_path else "")
+ "L",
"via_page_id": parent_page.id,
"via_page_url": parent_page.url,
"hops_off_surt": hops_off,
"hashtags": [hashtag] if hashtag else [],
},
)
return page
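# Worked example of the hop_path/hops_from_seed bookkeeping above, with
# hypothetical pages: the seed has hop_path None and hops_from_seed 0, a
# link found on the seed gets hop_path "L" and hops_from_seed 1, a link
# found on that page gets "LL" and 2, and so on.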
def _merge_page(self, existing_page, fresh_page):
'''
"""
Utility method for merging info from `brozzler.Page` instances
representing the same url but with possibly different metadata.
'''
"""
existing_page.priority += fresh_page.priority
existing_page.hashtags = list(
set((existing_page.hashtags or []) + (fresh_page.hashtags or []))
)
existing_page.hops_off = min(existing_page.hops_off, fresh_page.hops_off)
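# Worked example of _merge_page() with hypothetical values: merging
# existing (priority=10, hashtags=["#a"], hops_off=2) with fresh
# (priority=5, hashtags=["#b"], hops_off=1) leaves the existing page with
# priority 15, hashtags {"#a", "#b"} (order not guaranteed), and hops_off 1.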
def _scope_and_enforce_robots(self, site, parent_page, outlinks):
'''
"""
Returns tuple (
dict of {page_id: Page} of fresh `brozzler.Page` representing in
scope links accepted by robots policy,
set of in scope urls (canonicalized) blocked by robots policy,
set of out-of-scope urls (canonicalized)).
'''
"""
pages = {} # {page_id: Page, ...}
blocked = set()
out_of_scope = set()
@ -337,17 +391,18 @@ class RethinkDbFrontier:
url_for_scoping = urlcanon.semantic(url)
url_for_crawling = urlcanon.whatwg(url)
decision = site.accept_reject_or_neither(
url_for_scoping, parent_page=parent_page)
url_for_scoping, parent_page=parent_page
)
if decision is True:
hops_off = 0
elif decision is None:
decision = parent_page.hops_off < site.scope.get(
'max_hops_off', 0)
decision = parent_page.hops_off < site.scope.get("max_hops_off", 0)
hops_off = parent_page.hops_off + 1
if decision is True:
if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
fresh_page = self._build_fresh_page(
site, parent_page, url, hops_off)
site, parent_page, url, hops_off
)
if fresh_page.id in pages:
self._merge_page(pages[fresh_page.id], fresh_page)
else:
@ -359,31 +414,32 @@ class RethinkDbFrontier:
return pages, blocked, out_of_scope
def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
decisions = {"accepted": set(), "blocked": set(), "rejected": set()}
counts = {"added": 0, "updated": 0, "rejected": 0, "blocked": 0}
fresh_pages, blocked, out_of_scope = self._scope_and_enforce_robots(
site, parent_page, outlinks
)
decisions["blocked"] = blocked
decisions["rejected"] = out_of_scope
counts["blocked"] += len(blocked)
counts["rejected"] += len(out_of_scope)
# get existing pages from rethinkdb
results = self.rr.table('pages').get_all(*fresh_pages.keys()).run()
pages = {doc['id']: brozzler.Page(self.rr, doc) for doc in results}
results = self.rr.table("pages").get_all(*fresh_pages.keys()).run()
pages = {doc["id"]: brozzler.Page(self.rr, doc) for doc in results}
# build list of pages to save, consisting of new pages, and existing
# pages updated with higher priority and new hashtags
for fresh_page in fresh_pages.values():
decisions['accepted'].add(fresh_page.url)
decisions["accepted"].add(fresh_page.url)
if fresh_page.id in pages:
page = pages[fresh_page.id]
self._merge_page(page, fresh_page)
counts['updated'] += 1
counts["updated"] += 1
else:
pages[fresh_page.id] = fresh_page
counts['added'] += 1
counts["added"] += 1
# make sure we're not stepping on our own toes in case we have a link
# back to parent_page, which I think happens because of hashtags
@ -396,19 +452,22 @@ class RethinkDbFrontier:
# there can be many pages and each one can be very large (many videos,
# in and out of scope links, etc)
l = list(pages.values())
for batch in (l[i:i+50] for i in range(0, len(l), 50)):
for batch in (l[i : i + 50] for i in range(0, len(l), 50)):
try:
self.logger.debug(
'inserting/replacing batch of %s pages', len(batch))
reql = self.rr.table('pages').insert(batch, conflict='replace')
self.logger.debug("inserting/replacing batch of %s pages", len(batch))
reql = self.rr.table("pages").insert(batch, conflict="replace")
self.logger.trace(
'running query self.rr.table("pages").insert(%r, '
'conflict="replace")', batch)
'running query self.rr.table("pages").insert(%r, '
'conflict="replace")',
batch,
)
result = reql.run()
except Exception as e:
self.logger.error(
'problem inserting/replacing batch of %s pages',
len(batch), exc_info=True)
"problem inserting/replacing batch of %s pages",
len(batch),
exc_info=True,
)
parent_page.outlinks = {}
for k in decisions:
@ -416,43 +475,56 @@ class RethinkDbFrontier:
parent_page.save()
self.logger.info(
'%s new links added, %s existing links updated, %s links '
'rejected, %s links blocked by robots from %s',
counts['added'], counts['updated'], counts['rejected'],
counts['blocked'], parent_page)
"%s new links added, %s existing links updated, %s links "
"rejected, %s links blocked by robots from %s",
counts["added"],
counts["updated"],
counts["rejected"],
counts["blocked"],
parent_page,
)
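# The insert loop above batches pages with a slicing generator; the same
# idiom in isolation (toy example, any list works):
#
#   l = list(range(120))
#   for batch in (l[i : i + 50] for i in range(0, len(l), 50)):
#       ...  # yields batches of 50, 50, and 20 items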
def reached_limit(self, site, e):
self.logger.info("reached_limit site=%s e=%s", site, e)
assert isinstance(e, brozzler.ReachedLimit)
if (site.reached_limit
and site.reached_limit != e.warcprox_meta["reached-limit"]):
if (
site.reached_limit
and site.reached_limit != e.warcprox_meta["reached-limit"]
):
self.logger.warning(
"reached limit %s but site had already reached limit %s",
e.warcprox_meta["reached-limit"], self.reached_limit)
"reached limit %s but site had already reached limit %s",
e.warcprox_meta["reached-limit"],
self.reached_limit,
)
else:
site.reached_limit = e.warcprox_meta["reached-limit"]
self.finished(site, "FINISHED_REACHED_LIMIT")
def job_sites(self, job_id):
results = self.rr.table('sites').get_all(job_id, index="job_id").run()
results = self.rr.table("sites").get_all(job_id, index="job_id").run()
for result in results:
yield brozzler.Site(self.rr, result)
def seed_page(self, site_id):
results = self.rr.table("pages").between(
results = (
self.rr.table("pages")
.between(
[site_id, r.minval, r.minval, r.minval],
[site_id, r.maxval, r.maxval, r.maxval],
index="priority_by_site").filter({"hops_from_seed":0}).run()
index="priority_by_site",
)
.filter({"hops_from_seed": 0})
.run()
)
pages = list(results)
if len(pages) > 1:
self.logger.warning(
"more than one seed page for site_id %s ?", site_id)
self.logger.warning("more than one seed page for site_id %s ?", site_id)
if len(pages) < 1:
return None
return brozzler.Page(self.rr, pages[0])
def site_pages(self, site_id, brozzled=None):
'''
"""
Args:
site_id (str or int):
brozzled (bool): if true, results include only pages that have
@ -460,16 +532,14 @@ class RethinkDbFrontier:
not been brozzled; and if None (the default), all pages
Returns:
iterator of brozzler.Page
'''
"""
query = self.rr.table("pages").between(
[site_id, 1 if brozzled is True else 0, r.minval, r.minval],
[site_id, 0 if brozzled is False else r.maxval, r.maxval, r.maxval],
index="priority_by_site",
)
self.logger.trace("running query: %r", query)
results = query.run()
for result in results:
self.logger.trace("yielding result: %r", result)
yield brozzler.Page(self.rr, result)

@ -1,4 +1,4 @@
'''
"""
brozzler/models.py - model classes representing jobs, sites, and pages, with
related logic
@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import brozzler
import base64
@ -36,15 +36,18 @@ import yaml
import zlib
from typing import Optional
def load_schema():
schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yaml')
schema_file = os.path.join(os.path.dirname(__file__), "job_schema.yaml")
with open(schema_file) as f:
return yaml.safe_load(f)
class JobValidator(cerberus.Validator):
def _validate_type_url(self, value):
url = urllib.parse.urlparse(value)
return url.scheme in ('http', 'https', 'ftp')
return url.scheme in ("http", "https", "ftp")
class InvalidJobConf(Exception):
def __init__(self, validator):
@ -53,15 +56,17 @@ class InvalidJobConf(Exception):
# Cerberus does a nice job hiding the bad value. In the case I
# debugged, I found it here. Maybe there's a better way to see it.
value = validator._errors[0].info[0][0].info[0][0].value
self.errors['bad value'] = value
self.errors["bad value"] = value
except:
value = None
def validate_conf(job_conf, schema=load_schema()):
v = JobValidator(schema)
if not v.validate(job_conf, normalize=False):
raise InvalidJobConf(v)
def merge(a, b):
if isinstance(a, dict) and isinstance(b, dict):
merged = dict(a)
@ -75,19 +80,22 @@ def merge(a, b):
else:
return a
def new_job_file(frontier, job_conf_file):
'''Returns new Job.'''
"""Returns new Job."""
logging.info("loading %s", job_conf_file)
with open(job_conf_file) as f:
job_conf = yaml.safe_load(f)
return new_job(frontier, job_conf)
def new_job(frontier, job_conf):
'''Returns new Job.'''
"""Returns new Job."""
validate_conf(job_conf)
job = Job(
frontier.rr,
{"conf": job_conf, "status": "ACTIVE", "started": doublethink.utcnow()},
)
if "id" in job_conf:
job.id = job_conf["id"]
if "max_claimed_sites" in job_conf:
@ -108,32 +116,40 @@ def new_job(frontier, job_conf):
# insert in batches to avoid this error
# rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:
for batch in (pages[i : i + 500] for i in range(0, len(pages), 500)):
logging.info("inserting batch of %s pages", len(batch))
result = frontier.rr.table("pages").insert(batch).run()
for batch in (sites[i : i + 100] for i in range(0, len(sites), 100)):
logging.info("inserting batch of %s sites", len(batch))
result = frontier.rr.table("sites").insert(batch).run()
logging.info("job %s fully started", job.id)
return job
def new_seed_page(frontier, site):
url = urlcanon.parse_url(site.seed)
hashtag = (url.hash_sign + url.fragment).decode("utf-8")
urlcanon.canon.remove_fragment(url)
page = brozzler.Page(
frontier.rr,
{
"url": str(url),
"site_id": site.get("id"),
"job_id": site.get("job_id"),
"hops_from_seed": 0,
"priority": 1000,
"needs_robots_check": True,
"hop_path": None,
},
)
if hashtag:
page.hashtags = [hashtag,]
page.hashtags = [
hashtag,
]
return page
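# Example of the hashtag handling above (hypothetical seed): for
# "https://example.com/page#comments", url.hash_sign + url.fragment decodes
# to "#comments", the fragment is stripped from the stored page.url, and
# page.hashtags becomes ["#comments"]; a seed without a fragment leaves
# page.hashtags unset.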
def new_site(frontier, site):
logging.info("new site %s", site)
site.id = site.id or str(uuid.uuid4())
@ -148,9 +164,10 @@ def new_site(frontier, site):
# finally block because we want to insert the Site no matter what
site.save()
class ElapsedMixIn(object):
def elapsed(self):
'''
"""
Returns elapsed crawl time as a float in seconds.
This metric includes all the time that a site was in active rotation,
@ -158,21 +175,22 @@ class ElapsedMixIn(object):
In contrast `Site.active_brozzling_time` only counts time when a
brozzler worker claimed the site and was actively brozzling it.
'''
"""
dt = 0
for ss in self.starts_and_stops[:-1]:
if ss['stop']:
dt += (ss['stop'] - ss['start']).total_seconds()
if ss["stop"]:
dt += (ss["stop"] - ss["start"]).total_seconds()
else:
self.logger.warning("missing expected ss['stop']")
dt += (doublethink.utcnow() - ss['start']).total_seconds()
dt += (doublethink.utcnow() - ss["start"]).total_seconds()
ss = self.starts_and_stops[-1]
if ss['stop']:
dt += (ss['stop'] - ss['start']).total_seconds()
else: # crawl is active
dt += (doublethink.utcnow() - ss['start']).total_seconds()
if ss["stop"]:
dt += (ss["stop"] - ss["start"]).total_seconds()
else: # crawl is active
dt += (doublethink.utcnow() - ss["start"]).total_seconds()
return dt
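# Worked example of elapsed() with hypothetical timestamps: for
# starts_and_stops = [{start: 10:00, stop: 10:05}, {start: 11:00, stop: None}]
# and a current time of 11:02, elapsed() returns 300 + 120 = 420 seconds.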
class Job(doublethink.Document, ElapsedMixIn):
logger = logging.getLogger(__module__ + "." + __qualname__)
table = "jobs"
@ -181,29 +199,30 @@ class Job(doublethink.Document, ElapsedMixIn):
if not "status" in self:
self.status = "ACTIVE"
if not "starts_and_stops" in self:
if self.get("started"): # backward compatibility
self.starts_and_stops = [
{"start": self.get("started"), "stop": self.get("finished")}
]
del self["started"]
if "finished" in self:
del self["finished"]
else:
self.starts_and_stops = [
{"start":doublethink.utcnow(),"stop":None}]
self.starts_and_stops = [{"start": doublethink.utcnow(), "stop": None}]
def finish(self):
if self.status == "FINISHED" or self.starts_and_stops[-1]["stop"]:
self.logger.error(
"job is already finished status=%s "
"starts_and_stops[-1]['stop']=%s", self.status,
self.starts_and_stops[-1]["stop"])
"job is already finished status=%s " "starts_and_stops[-1]['stop']=%s",
self.status,
self.starts_and_stops[-1]["stop"],
)
self.status = "FINISHED"
self.starts_and_stops[-1]["stop"] = doublethink.utcnow()
class Site(doublethink.Document, ElapsedMixIn):
logger = logging.getLogger(__module__ + "." + __qualname__)
table = 'sites'
table = "sites"
def populate_defaults(self):
if not "status" in self:
@ -225,26 +244,26 @@ class Site(doublethink.Document, ElapsedMixIn):
del self.scope["surt"]
# backward compatibility
if ("max_hops_off_surt" in self.scope
and not "max_hops_off" in self.scope):
if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope:
self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
if "max_hops_off_surt" in self.scope:
del self.scope["max_hops_off_surt"]
if self.seed:
self._accept_ssurt_if_not_redundant(
brozzler.site_surt_canon(self.seed).ssurt().decode('ascii'))
brozzler.site_surt_canon(self.seed).ssurt().decode("ascii")
)
if not "starts_and_stops" in self:
if self.get("start_time"): # backward compatibility
self.starts_and_stops = [
{"start": self.get("start_time"), "stop": None}
]
if self.get("status") != "ACTIVE":
self.starts_and_stops[0]["stop"] = self.last_disclaimed
del self["start_time"]
else:
self.starts_and_stops = [
{"start":doublethink.utcnow(),"stop":None}]
self.starts_and_stops = [{"start": doublethink.utcnow(), "stop": None}]
def __str__(self):
return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)
@ -253,11 +272,12 @@ class Site(doublethink.Document, ElapsedMixIn):
if not "accepts" in self.scope:
self.scope["accepts"] = []
simple_rule_ssurts = (
rule["ssurt"] for rule in self.scope["accepts"]
if set(rule.keys()) == {'ssurt'})
rule["ssurt"]
for rule in self.scope["accepts"]
if set(rule.keys()) == {"ssurt"}
)
if not any(ssurt.startswith(ss) for ss in simple_rule_ssurts):
self.logger.info(
"adding ssurt %s to scope accept rules", ssurt)
self.logger.info("adding ssurt %s to scope accept rules", ssurt)
self.scope["accepts"].append({"ssurt": ssurt})
def note_seed_redirect(self, url):
@ -266,14 +286,14 @@ class Site(doublethink.Document, ElapsedMixIn):
# if http://foo.com/ redirects to https://foo.com/a/b/c let's also
# put all of https://foo.com/ in scope
if (canon_seed_redirect.authority == canon_seed.authority
and canon_seed_redirect.scheme != canon_seed.scheme):
if (
canon_seed_redirect.authority == canon_seed.authority
and canon_seed_redirect.scheme != canon_seed.scheme
):
canon_seed.scheme = canon_seed_redirect.scheme
self._accept_ssurt_if_not_redundant(
canon_seed.ssurt().decode('ascii'))
self._accept_ssurt_if_not_redundant(canon_seed.ssurt().decode("ascii"))
self._accept_ssurt_if_not_redundant(
canon_seed_redirect.ssurt().decode('ascii'))
self._accept_ssurt_if_not_redundant(canon_seed_redirect.ssurt().decode("ascii"))
def extra_headers(self, page: Optional["Page"] = None):
hdrs = {}
@ -281,28 +301,34 @@ class Site(doublethink.Document, ElapsedMixIn):
temp_warcprox_meta = copy.deepcopy(self.warcprox_meta)
if "blocks" in self.warcprox_meta:
# delete temp_warcprox_meta's 'blocks' (they may be big!)
del temp_warcprox_meta['blocks']
del temp_warcprox_meta["blocks"]
# str-ify blocks
blocks_str = json.dumps(self.warcprox_meta['blocks'], separators=(',', ':'))
blocks_str = json.dumps(
self.warcprox_meta["blocks"], separators=(",", ":")
)
# encode(), compress, b64encode, decode()
temp_warcprox_meta['compressed_blocks'] = base64.b64encode(zlib.compress(blocks_str.encode())).decode()
temp_warcprox_meta["compressed_blocks"] = base64.b64encode(
zlib.compress(blocks_str.encode())
).decode()
if page is not None:
temp_warcprox_meta["metadata"]["hop_path"] = page.hop_path
temp_warcprox_meta["metadata"]["brozzled_url"] = page.url
temp_warcprox_meta["metadata"]["hop_via_url"] = page.via_page_url
hdrs["Warcprox-Meta"] = json.dumps(temp_warcprox_meta, separators=(',', ':'))
hdrs["Warcprox-Meta"] = json.dumps(
temp_warcprox_meta, separators=(",", ":")
)
return hdrs
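# The "compressed_blocks" value built above is json -> zlib -> base64; a
# consumer can reverse it like this (sketch; `compressed` stands for the
# header field value as a str):
#
#   import base64, json, zlib
#   blocks = json.loads(zlib.decompress(base64.b64decode(compressed)))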
def accept_reject_or_neither(self, url, parent_page=None):
'''
"""
Returns `True` (accepted), `False` (rejected), or `None` (no decision).
`None` usually means rejected, unless `max_hops_off` comes into play.
'''
"""
if not isinstance(url, urlcanon.ParsedUrl):
url = urlcanon.semantic(url)
if not url.scheme in (b'http', b'https'):
if not url.scheme in (b"http", b"https"):
# XXX doesn't belong here maybe (where? worker ignores unknown
# schemes?)
return False
@ -311,12 +337,14 @@ class Site(doublethink.Document, ElapsedMixIn):
if parent_page:
try_parent_urls.append(urlcanon.semantic(parent_page.url))
if parent_page.redirect_url:
try_parent_urls.append(
urlcanon.semantic(parent_page.redirect_url))
try_parent_urls.append(urlcanon.semantic(parent_page.redirect_url))
# enforce max_hops
if (parent_page and "max_hops" in self.scope
and parent_page.hops_from_seed >= self.scope["max_hops"]):
if (
parent_page
and "max_hops" in self.scope
and parent_page.hops_from_seed >= self.scope["max_hops"]
):
return False
# enforce reject rules
@ -326,7 +354,7 @@ class Site(doublethink.Document, ElapsedMixIn):
if try_parent_urls:
for parent_url in try_parent_urls:
if rule.applies(url, parent_url):
return False
else:
if rule.applies(url):
return False
@ -337,7 +365,7 @@ class Site(doublethink.Document, ElapsedMixIn):
if try_parent_urls:
for parent_url in try_parent_urls:
if rule.applies(url, parent_url):
return True
else:
if rule.applies(url):
return True
@ -345,6 +373,7 @@ class Site(doublethink.Document, ElapsedMixIn):
# no decision if we reach here
return None
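# Sketch of how the three-valued result is consumed (compare
# _scope_and_enforce_robots in frontier.py): True means in scope, False
# means rejected, and None falls back to the max_hops_off allowance.
#
#   decision = site.accept_reject_or_neither(url, parent_page=parent_page)
#   if decision is None:
#       decision = parent_page.hops_off < site.scope.get("max_hops_off", 0)
#   if decision:
#       ...  # url is in scope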
class Page(doublethink.Document):
logger = logging.getLogger(__module__ + "." + __qualname__)
table = "pages"
@ -398,4 +427,3 @@ class Page(doublethink.Document):
if self._canon_hurl is None:
self._canon_hurl = urlcanon.semantic(self.url)
return str(self._canon_hurl)

@ -1,4 +1,4 @@
'''
"""
brozzler/pywb.py - pywb customizations for brozzler including rethinkdb index,
loading from warcs still being written to, canonicalization rules matching
brozzler conventions, support for screenshot: and thumbnail: urls
@ -16,10 +16,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import sys
import logging
try:
import pywb.apps.cli
import pywb.cdx.cdxdomainspecific
@ -30,9 +31,11 @@ try:
import pywb.rewrite.wburl
except ImportError as e:
logging.critical(
'%s: %s\n\nYou might need to run "pip install '
'brozzler[easy]".\nSee README.rst for more information.',
type(e).__name__,
e,
)
sys.exit(1)
import doublethink
import rethinkdb as rdb
@ -43,6 +46,7 @@ import argparse
r = rdb.RethinkDB()
class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
def __init__(self, servers, db, table):
self.servers = servers
@ -67,70 +71,78 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
# XXX inefficient, it gets parsed later, figure out how to
# short-circuit this step and create the CDXObject directly
blob = {
"url": record["url"],
"status": str(record["response_code"]),
"digest": record["sha1base32"],
"length": str(record.get("record_length", "-")),
"offset": str(record["offset"]),
"filename": record["filename"],
}
if record['warc_type'] != 'revisit':
blob['mime'] = record['content_type'] or '-'
if record["warc_type"] != "revisit":
blob["mime"] = record["content_type"] or "-"
else:
blob['mime'] = 'warc/revisit'
blob["mime"] = "warc/revisit"
# b'org,archive)/ 20160427215530 {"url": "https://archive.org/", "mime": "text/html", "status": "200", "digest": "VILUFXZD232SLUA6XROZQIMEVUPW6EIE", "length": "16001", "offset": "90144", "filename": "ARCHIVEIT-261-ONE_TIME-JOB209607-20160427215508135-00000.warc.gz"}'
cdx_line = "{} {:%Y%m%d%H%M%S} {}".format(
record["canon_surt"], record["timestamp"], json.dumps(blob)
)
yield cdx_line.encode("utf-8")
def _query_rethinkdb(self, cdx_query):
start_key = cdx_query.key.decode('utf-8')
end_key = cdx_query.end_key.decode('utf-8')
start_key = cdx_query.key.decode("utf-8")
end_key = cdx_query.end_key.decode("utf-8")
reql = self.rr.table(self.table).between(
[start_key[:150], r.minval],
[end_key[:150], r.maxval],
index="abbr_canon_surt_timestamp",
right_bound="closed",
)
reql = reql.order_by(index="abbr_canon_surt_timestamp")
# TODO support for POST, etc
# http_method='WARCPROX_WRITE_RECORD' for screenshots, thumbnails
reql = reql.filter(
lambda capture: r.expr(["WARCPROX_WRITE_RECORD", "GET"]).contains(
capture["http_method"]
)
)
reql = reql.filter(
lambda capture: (capture['canon_surt'] >= start_key)
& (capture['canon_surt'] < end_key))
lambda capture: (capture["canon_surt"] >= start_key)
& (capture["canon_surt"] < end_key)
)
if cdx_query.limit:
reql = reql.limit(cdx_query.limit)
logging.debug('rethinkdb query: %s', reql)
logging.debug("rethinkdb query: %s", reql)
results = reql.run()
return results
class TheGoodUrlCanonicalizer(object):
'''
"""
Replacement for pywb.utils.canonicalize.UrlCanonicalizer that produces
surts with scheme and with trailing comma, and does not "massage"
www.foo.org into foo.org.
'''
"""
def __init__(self, surt_ordered=True):
'''We are always surt ordered (surt_ordered param is ignored)'''
"""We are always surt ordered (surt_ordered param is ignored)"""
self.surt_ordered = True
def __call__(self, url):
try:
key = urlcanon.semantic(url).surt().decode('ascii')
key = urlcanon.semantic(url).surt().decode("ascii")
# logging.debug('%s -> %s', url, key)
return key
except Exception as e:
return url
def replace_default_canonicalizer():
'''Replace parent class of CustomUrlCanonicalizer with this class.'''
"""Replace parent class of CustomUrlCanonicalizer with this class."""
pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer.__bases__ = (
TheGoodUrlCanonicalizer,)
TheGoodUrlCanonicalizer,
)
def good_surts_from_default(default_surt):
'''
"""
Takes a standard surt without scheme and without trailing comma, and
returns a list of "good" surts that together match the same set of
urls. For example:
@ -144,59 +156,64 @@ class TheGoodUrlCanonicalizer(object):
'http://(com,example,www,)/path',
'https://(com,example,www,)/path']
'''
if default_surt == '':
return ['']
"""
if default_surt == "":
return [""]
parts = default_surt.split(')', 1)
parts = default_surt.split(")", 1)
if len(parts) == 2:
orig_host_part, path_part = parts
good_surts = [
"http://(%s,)%s" % (orig_host_part, path_part),
"https://(%s,)%s" % (orig_host_part, path_part),
"http://(%s,www,)%s" % (orig_host_part, path_part),
"https://(%s,www,)%s" % (orig_host_part, path_part),
]
else: # no path part
else: # no path part
host_part = parts[0]
good_surts = [
'http://(%s' % host_part,
'https://(%s' % host_part,
"http://(%s" % host_part,
"https://(%s" % host_part,
]
return good_surts
def monkey_patch_dsrules_init():
orig_init = pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__
def cdx_dsrule_init(self, url_prefix, rules):
good_surts = []
url_prefixes = [url_prefix] if isinstance(
url_prefix, str) else url_prefix
url_prefixes = [url_prefix] if isinstance(url_prefix, str) else url_prefix
for bad_surt in url_prefixes:
good_surts.extend(
TheGoodUrlCanonicalizer.good_surts_from_default(bad_surt)
)
if "match" in rules and "regex" in rules["match"]:
rules["match"]["regex"] = r"https?://\(" + rules["match"]["regex"]
orig_init(self, good_surts, rules)
pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__ = cdx_dsrule_init
def support_in_progress_warcs():
'''
"""
Monkey-patch pywb.warc.pathresolvers.PrefixResolver to include warcs still
being written to (warcs having ".open" suffix). This way if a cdx entry
references foo.warc.gz, pywb will try both foo.warc.gz and
foo.warc.gz.open.
'''
"""
_orig_prefix_resolver_call = pywb.warc.pathresolvers.PrefixResolver.__call__
def _prefix_resolver_call(self, filename, cdx=None):
raw_results = _orig_prefix_resolver_call(self, filename, cdx)
results = []
for warc_path in raw_results:
results.append(warc_path)
results.append('%s.open' % warc_path)
results.append("%s.open" % warc_path)
return results
pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call
class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
def __init__(self, orig_url):
import re
@ -211,14 +228,14 @@ class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
pywb.rewrite.wburl.BaseWbUrl.__init__(self)
if six.PY2 and isinstance(orig_url, six.text_type):
orig_url = orig_url.encode('utf-8')
orig_url = orig_url.encode("utf-8")
orig_url = quote(orig_url)
self._original_url = orig_url
if not self._init_query(orig_url):
if not self._init_replay(orig_url):
raise Exception('Invalid WbUrl: ', orig_url)
raise Exception("Invalid WbUrl: ", orig_url)
new_uri = WbUrl.to_uri(self.url)
@ -227,21 +244,24 @@ class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
self.url = new_uri
# begin brozzler changes
if (self.url.startswith('urn:') or self.url.startswith('screenshot:')
or self.url.startswith('thumbnail:')):
if (
self.url.startswith("urn:")
or self.url.startswith("screenshot:")
or self.url.startswith("thumbnail:")
):
return
# end brozzler changes
# protocol agnostic url -> http://
# no protocol -> http://
#inx = self.url.find('://')
# inx = self.url.find('://')
inx = -1
m = self.SCHEME_RX.match(self.url)
if m:
inx = m.span(1)[0]
#if inx < 0:
# check for other partially encoded variants
# if inx < 0:
# check for other partially encoded variants
# m = self.PARTIAL_ENC_RX.match(self.url)
# if m:
# len_ = len(m.group(0))
@ -253,27 +273,31 @@ class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
self.url = self.DEFAULT_SCHEME + self.url
else:
inx += 2
if inx < len(self.url) and self.url[inx] != '/':
self.url = self.url[:inx] + '/' + self.url[inx:]
if inx < len(self.url) and self.url[inx] != "/":
self.url = self.url[:inx] + "/" + self.url[inx:]
def _get_wburl_type(self):
return SomeWbUrl
def monkey_patch_wburl():
pywb.framework.basehandlers.WbUrlHandler.get_wburl_type = _get_wburl_type
class BrozzlerWaybackCli(pywb.apps.cli.WaybackCli):
def _extend_parser(self, arg_parser):
super()._extend_parser(arg_parser)
arg_parser._actions[4].help = argparse.SUPPRESS # --autoindex
arg_parser._actions[4].help = argparse.SUPPRESS # --autoindex
arg_parser.formatter_class = argparse.RawDescriptionHelpFormatter
arg_parser.epilog = '''
arg_parser.epilog = """
Run pywb like so:
$ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback
See README.rst for more information.
'''
"""
# copied and pasted from cdxdomainspecific.py, only changes are commented as
# such below
@ -284,7 +308,7 @@ def _fuzzy_query_call(self, query):
matched_rule = None
urlkey = to_native_str(query.key, 'utf-8')
urlkey = to_native_str(query.key, "utf-8")
url = query.url
filter_ = query.filters
output = query.output
@ -306,42 +330,42 @@ def _fuzzy_query_call(self, query):
if not matched_rule:
return None
repl = '?'
repl = "?"
if matched_rule.replace:
repl = matched_rule.replace
inx = url.find(repl)
if inx > 0:
url = url[:inx + len(repl)]
url = url[: inx + len(repl)]
# begin brozzler changes
if matched_rule.match_type == 'domain':
if matched_rule.match_type == "domain":
orig_split_url = urlsplit(url)
# remove the subdomain, path, query and fragment
host = orig_split_url.netloc.split('.', 1)[1]
new_split_url = (orig_split_url.scheme, host, '', '', '')
host = orig_split_url.netloc.split(".", 1)[1]
new_split_url = (orig_split_url.scheme, host, "", "", "")
url = urlunsplit(new_split_url)
# end brozzler changes
params = query.params
params.update({"url": url, "matchType": matched_rule.match_type, "filter": filter_})
if 'reverse' in params:
del params['reverse']
if "reverse" in params:
del params["reverse"]
if 'closest' in params:
del params['closest']
if "closest" in params:
del params["closest"]
if 'end_key' in params:
del params['end_key']
if "end_key" in params:
del params["end_key"]
return params
def monkey_patch_fuzzy_query():
pywb.cdx.cdxdomainspecific.FuzzyQuery.__call__ = _fuzzy_query_call
# copied and pasted from pywb/utils/canonicalize.py, only changes are commented
# as such
def _calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
@ -361,54 +385,56 @@ def _calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
start_key = url_canon(url)
if match_type == 'exact':
end_key = start_key + '!'
if match_type == "exact":
end_key = start_key + "!"
elif match_type == 'prefix':
elif match_type == "prefix":
# add trailing slash if url has it
if url.endswith('/') and not start_key.endswith('/'):
start_key += '/'
if url.endswith("/") and not start_key.endswith("/"):
start_key += "/"
end_key = inc_last_char(start_key)
elif match_type == 'host':
elif match_type == "host":
if surt_ordered:
host = start_key.split(')/')[0]
host = start_key.split(")/")[0]
start_key = host + ')/'
end_key = host + '*'
start_key = host + ")/"
end_key = host + "*"
else:
host = urlparse.urlsplit(url).netloc
start_key = host + '/'
end_key = host + '0'
start_key = host + "/"
end_key = host + "0"
elif match_type == 'domain':
elif match_type == "domain":
if not surt_ordered:
msg = 'matchType=domain unsupported for non-surt'
msg = "matchType=domain unsupported for non-surt"
raise UrlCanonicalizeException(msg)
host = start_key.split(')/')[0]
host = start_key.split(")/")[0]
# if tld, use com, as start_key
# otherwise, stick with com,example)/
if ',' not in host:
start_key = host + ','
if "," not in host:
start_key = host + ","
else:
start_key = host + ')/'
start_key = host + ")/"
# begin brozzler changes
end_key = host + '~'
end_key = host + "~"
# end brozzler changes
else:
raise UrlCanonicalizeException('Invalid match_type: ' + match_type)
raise UrlCanonicalizeException("Invalid match_type: " + match_type)
return (start_key, end_key)
def monkey_patch_calc_search_range():
pywb.utils.canonicalize.calc_search_range = _calc_search_range
pywb.cdx.query.calc_search_range = _calc_search_range
def main(argv=sys.argv):
brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init()
@ -417,7 +443,10 @@ def main(argv=sys.argv):
brozzler.pywb.monkey_patch_fuzzy_query()
brozzler.pywb.monkey_patch_calc_search_range()
wayback_cli = BrozzlerWaybackCli(
args=argv[1:],
default_port=8880,
desc=(
"brozzler-wayback - pywb wayback (monkey-patched for use " "with brozzler)"
),
)
wayback_cli.run()

@ -1,4 +1,4 @@
'''
"""
brozzler/robots.py - robots.txt support
Uses the reppy library version 0.3.4. Monkey-patches reppy to support substring
@ -20,7 +20,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import json
import logging
@ -34,48 +34,60 @@ __all__ = ["is_permitted_by_robots"]
# monkey-patch reppy to do substring user-agent matching, see top of file
reppy.Utility.short_user_agent = lambda strng: strng
def _reppy_rules_getitem(self, agent):
'''
"""
Find the user-agent token matching the supplied full user-agent, using
a case-insensitive substring search.
'''
"""
lc_agent = agent.lower()
for s in self.agents:
if s in lc_agent:
return self.agents[s]
return self.agents.get('*')
return self.agents.get("*")
reppy.parser.Rules.__getitem__ = _reppy_rules_getitem
class _SessionRaiseOn420(requests.Session):
timeout = 60
def get(self, url, *args, **kwargs):
res = super().get(url, timeout=self.timeout, *args, **kwargs)
if res.status_code == 420 and 'warcprox-meta' in res.headers:
if res.status_code == 420 and "warcprox-meta" in res.headers:
raise brozzler.ReachedLimit(
warcprox_meta=json.loads(res.headers['warcprox-meta']),
http_payload=res.text)
warcprox_meta=json.loads(res.headers["warcprox-meta"]),
http_payload=res.text,
)
else:
return res
_robots_caches = {} # {site_id:reppy.cache.RobotsCache}
def _robots_cache(site, proxy=None):
if not site.id in _robots_caches:
req_sesh = _SessionRaiseOn420()
req_sesh.verify = False # ignore cert errors
req_sesh.verify = False # ignore cert errors
if proxy:
proxie = "http://%s" % proxy
req_sesh.proxies = {"http":proxie,"https":proxie}
req_sesh.proxies = {"http": proxie, "https": proxie}
if site.extra_headers():
req_sesh.headers.update(site.extra_headers())
if site.user_agent:
req_sesh.headers['User-Agent'] = site.user_agent
req_sesh.headers["User-Agent"] = site.user_agent
_robots_caches[site.id] = reppy.cache.RobotsCache(
session=req_sesh, disallow_forbidden=False)
session=req_sesh, disallow_forbidden=False
)
return _robots_caches[site.id]
def is_permitted_by_robots(site, url, proxy=None):
'''
"""
Checks if `url` is permitted by robots.txt.
Treats any kind of error fetching robots.txt as "allow all". See
@ -89,25 +101,28 @@ def is_permitted_by_robots(site, url, proxy=None):
Raises:
brozzler.ReachedLimit: if warcprox responded with 420 Reached Limit
requests.exceptions.ProxyError: if the proxy is down
'''
"""
if site.ignore_robots:
return True
try:
result = _robots_cache(site, proxy).allowed(
url, site.user_agent or "brozzler")
result = _robots_cache(site, proxy).allowed(url, site.user_agent or "brozzler")
return result
except Exception as e:
if isinstance(e, reppy.exceptions.ServerError) and isinstance(
e.args[0], brozzler.ReachedLimit):
e.args[0], brozzler.ReachedLimit
):
raise e.args[0]
elif hasattr(e, 'args') and isinstance(
e.args[0], requests.exceptions.ProxyError):
elif hasattr(e, "args") and isinstance(
e.args[0], requests.exceptions.ProxyError
):
# reppy has wrapped an exception that we want to bubble up
raise brozzler.ProxyError(e)
else:
logging.warning(
"returning true (permitted) after problem fetching "
"robots.txt for %r: %r", url, e)
"returning true (permitted) after problem fetching "
"robots.txt for %r: %r",
url,
e,
)
return True
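# Usage sketch mirroring brozzle_site() in worker.py; `site`, `page`, and
# `proxy` are assumed to already exist:
#
#   if page.needs_robots_check and not brozzler.is_permitted_by_robots(
#       site, page.url, proxy
#   ):
#       page.blocked_by_robots = True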

@ -1,4 +1,4 @@
'''
"""
brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning
it runs yt-dlp on them, browses them and runs behaviors if appropriate,
scopes and adds outlinks to the frontier
@ -16,7 +16,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import logging
import brozzler
@ -39,6 +39,7 @@ from . import ydl
r = rdb.RethinkDB()
class BrozzlerWorker:
logger = logging.getLogger(__module__ + "." + __qualname__)
@ -50,13 +51,26 @@ class BrozzlerWorker:
SITE_SESSION_MINUTES = 15
def __init__(
self,
frontier,
service_registry=None,
max_browsers=1,
chrome_exe="chromium-browser",
warcprox_auto=False,
proxy=None,
skip_extract_outlinks=False,
skip_visit_hashtags=False,
skip_youtube_dl=False,
simpler404=False,
screenshot_full_page=False,
page_timeout=300,
behavior_timeout=900,
extract_outlinks_timeout=60,
download_throughput=-1,
stealth=False,
window_height=900,
window_width=1400,
):
self._frontier = frontier
self._service_registry = service_registry
self._max_browsers = max_browsers
@ -79,7 +93,8 @@ class BrozzlerWorker:
self._stealth = stealth
self._browser_pool = brozzler.browser.BrowserPool(
max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True)
max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True
)
self._browsing_threads = set()
self._browsing_threads_lock = threading.Lock()
@ -88,24 +103,32 @@ class BrozzlerWorker:
self._shutdown = threading.Event()
def _choose_warcprox(self):
warcproxes = self._service_registry.available_services('warcprox')
warcproxes = self._service_registry.available_services("warcprox")
if not warcproxes:
return None
# .group('proxy').count() makes this query about 99% more efficient
reql = (
self._frontier.rr.table("sites")
.between(
["ACTIVE", r.minval],
["ACTIVE", r.maxval],
index="sites_last_disclaimed",
)
.group("proxy")
.count()
)
# returns results like
# {
# "wbgrp-svc030.us.archive.org:8000": 148,
# "wbgrp-svc030.us.archive.org:8001": 145
# }
proxy_scoreboard = dict(reql.run())
for warcprox in warcproxes:
address = "%s:%s" % (warcprox["host"], warcprox["port"])
warcprox["assigned_sites"] = proxy_scoreboard.get(address, 0)
warcproxes.sort(
key=lambda warcprox: (warcprox["assigned_sites"], warcprox["load"])
)
# XXX make this heuristic more advanced?
return warcproxes[0]
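# Worked example of the selection above with hypothetical registry entries:
# given (assigned_sites, load) values of (3, 0.5), (2, 0.9), and (2, 0.4),
# the tuple sort key orders them (2, 0.4), (2, 0.9), (3, 0.5), so the
# warcprox with the fewest assigned sites (ties broken by load) is returned.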
@ -118,13 +141,15 @@ class BrozzlerWorker:
svc = self._choose_warcprox()
if svc is None:
raise brozzler.ProxyError(
"no available instances of warcprox in the service " "registry"
)
site.proxy = "%s:%s" % (svc["host"], svc["port"])
site.save()
self.logger.info(
'chose warcprox instance %r from service registry for %r',
site.proxy, site)
"chose warcprox instance %r from service registry for %r",
site.proxy,
site,
)
return site.proxy
return None
@ -132,14 +157,16 @@ class BrozzlerWorker:
if self._proxy:
if self._proxy_is_warcprox is None:
try:
response = requests.get('http://%s/status' % self._proxy)
response = requests.get("http://%s/status" % self._proxy)
status = json.loads(response.text)
self._proxy_is_warcprox = (status['role'] == 'warcprox')
self._proxy_is_warcprox = status["role"] == "warcprox"
except Exception as e:
self._proxy_is_warcprox = False
logging.info(
'%s %s warcprox', self._proxy,
'IS' if self._proxy_is_warcprox else 'IS NOT')
"%s %s warcprox",
self._proxy,
"IS" if self._proxy_is_warcprox else "IS NOT",
)
return self._proxy_is_warcprox
else:
# I should have commented when I originally wrote this code, but I
@ -148,13 +175,20 @@ class BrozzlerWorker:
return bool(site.proxy or self._warcprox_auto)
def _warcprox_write_record(
self,
warcprox_address,
url,
warc_type,
content_type,
payload,
extra_headers=None,
):
headers = {"Content-Type": content_type, "WARC-Type": warc_type, "Host": "N/A"}
if extra_headers:
headers.update(extra_headers)
request = urllib.request.Request(url, method="WARCPROX_WRITE_RECORD",
headers=headers, data=payload)
request = urllib.request.Request(
url, method="WARCPROX_WRITE_RECORD", headers=headers, data=payload
)
# XXX setting request.type="http" is a hack to stop urllib from trying
# to tunnel if url is https
@ -165,26 +199,31 @@ class BrozzlerWorker:
with urllib.request.urlopen(request, timeout=600) as response:
if response.getcode() != 204:
self.logger.warning(
'got "%s %s" response on warcprox '
'WARCPROX_WRITE_RECORD request (expected 204)',
response.getcode(), response.reason)
'got "%s %s" response on warcprox '
"WARCPROX_WRITE_RECORD request (expected 204)",
response.getcode(),
response.reason,
)
return request, response
except urllib.error.HTTPError as e:
self.logger.warning(
'got "%s %s" response on warcprox '
'WARCPROX_WRITE_RECORD request (expected 204)',
e.getcode(), e.info())
'got "%s %s" response on warcprox '
"WARCPROX_WRITE_RECORD request (expected 204)",
e.getcode(),
e.info(),
)
return request, None
except urllib.error.URLError as e:
raise brozzler.ProxyError(
'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
"proxy error on WARCPROX_WRITE_RECORD %s" % url
) from e
except ConnectionError as e:
raise brozzler.ProxyError(
'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
"proxy error on WARCPROX_WRITE_RECORD %s" % url
) from e
def thumb_jpeg(self, full_jpeg):
"""Create JPEG thumbnail.
"""
"""Create JPEG thumbnail."""
img = PIL.Image.open(io.BytesIO(full_jpeg))
thumb_width = 300
thumb_height = (thumb_width / img.size[0]) * img.size[1]
@ -193,8 +232,15 @@ class BrozzlerWorker:
img.save(out, "jpeg", quality=95)
return out.getbuffer()
def brozzle_page(
self,
browser,
site,
page,
on_screenshot=None,
on_request=None,
enable_youtube_dl=True,
):
self.logger.info("brozzling {}".format(page))
ydl_fetches = None
outlinks = set()
@ -208,31 +254,38 @@ class BrozzlerWorker:
except brozzler.ProxyError:
raise
except Exception as e:
if (
hasattr(e, "exc_info")
and len(e.exc_info) >= 2
and hasattr(e.exc_info[1], "code")
and e.exc_info[1].code == 430
):
self.logger.info(
'youtube-dl got %s %s processing %s',
e.exc_info[1].code, e.exc_info[1].msg, page.url)
"youtube-dl got %s %s processing %s",
e.exc_info[1].code,
e.exc_info[1].msg,
page.url,
)
else:
self.logger.error(
'youtube_dl raised exception on %s', page,
exc_info=True)
"youtube_dl raised exception on %s", page, exc_info=True
)
if self._needs_browsing(page, ydl_fetches):
self.logger.info('needs browsing: %s', page)
self.logger.info("needs browsing: %s", page)
try:
browser_outlinks = self._browse_page(
browser, site, page, on_screenshot, on_request)
browser, site, page, on_screenshot, on_request
)
outlinks.update(browser_outlinks)
except brozzler.PageInterstitialShown:
self.logger.info('page interstitial shown (http auth): %s', page)
self.logger.info("page interstitial shown (http auth): %s", page)
else:
if not self._already_fetched(page, ydl_fetches):
self.logger.info('needs fetch: %s', page)
self.logger.info("needs fetch: %s", page)
self._fetch_url(site, page=page)
else:
self.logger.info('already fetched: %s', page)
self.logger.info("already fetched: %s", page)
return outlinks
@ -242,85 +295,103 @@ class BrozzlerWorker:
on_screenshot(screenshot_jpeg)
if self._using_warcprox(site):
self.logger.info(
"sending WARCPROX_WRITE_RECORD request to %s with "
"screenshot for %s", self._proxy_for(site), page)
"sending WARCPROX_WRITE_RECORD request to %s with "
"screenshot for %s",
self._proxy_for(site),
page,
)
thumbnail_jpeg = self.thumb_jpeg(screenshot_jpeg)
self._warcprox_write_record(
warcprox_address=self._proxy_for(site),
url="screenshot:%s" % str(urlcanon.semantic(page.url)),
warc_type="resource",
content_type="image/jpeg",
payload=screenshot_jpeg,
extra_headers=site.extra_headers(page),
)
self._warcprox_write_record(
warcprox_address=self._proxy_for(site),
url="thumbnail:%s" % str(urlcanon.semantic(page.url)),
warc_type="resource",
content_type="image/jpeg",
payload=thumbnail_jpeg,
extra_headers=site.extra_headers(page),
)
def _on_response(chrome_msg):
if (
"params" in chrome_msg
and "response" in chrome_msg["params"]
and "mimeType" in chrome_msg["params"]["response"]
and chrome_msg["params"]["response"]
.get("mimeType", "")
.startswith("video/")
# skip manifests of DASH segmented video -
# see https://github.com/internetarchive/brozzler/pull/70
and chrome_msg["params"]["response"]["mimeType"]
!= "video/vnd.mpeg.dash.mpd"
and chrome_msg["params"]["response"].get("status") in (200, 206)
):
video = {
"blame": "browser",
"url": chrome_msg["params"]["response"].get("url"),
"response_code": chrome_msg["params"]["response"]["status"],
"content-type": chrome_msg["params"]["response"]["mimeType"],
}
response_headers = CaseInsensitiveDict(
chrome_msg["params"]["response"]["headers"]
)
if "content-length" in response_headers:
video["content-length"] = int(response_headers["content-length"])
if "content-range" in response_headers:
video["content-range"] = response_headers["content-range"]
logging.debug("embedded video %s", video)
if not "videos" in page:
page.videos = []
page.videos.append(video)
sw_fetched = set()
def _on_service_worker_version_updated(chrome_msg):
# https://github.com/internetarchive/brozzler/issues/140
self.logger.trace("%r", chrome_msg)
if chrome_msg.get("params", {}).get("versions"):
url = chrome_msg.get("params", {}).get("versions")[0].get("scriptURL")
if url and url not in sw_fetched:
self.logger.info('fetching service worker script %s', url)
self.logger.info("fetching service worker script %s", url)
self._fetch_url(site, url=url)
sw_fetched.add(url)
if not browser.is_running():
browser.start(
proxy=self._proxy_for(site),
cookie_db=site.get("cookie_db"),
window_height=self._window_height,
window_width=self._window_width,
)
final_page_url, outlinks = browser.browse_page(
page.url,
extra_headers=site.extra_headers(page),
behavior_parameters=site.get("behavior_parameters"),
username=site.get("username"),
password=site.get("password"),
user_agent=site.get("user_agent"),
on_screenshot=_on_screenshot,
on_response=_on_response,
on_request=on_request,
on_service_worker_version_updated=_on_service_worker_version_updated,
hashtags=page.hashtags,
skip_extract_outlinks=self._skip_extract_outlinks,
skip_visit_hashtags=self._skip_visit_hashtags,
skip_youtube_dl=self._skip_youtube_dl,
simpler404=self._simpler404,
screenshot_full_page=self._screenshot_full_page,
page_timeout=self._page_timeout,
behavior_timeout=self._behavior_timeout,
extract_outlinks_timeout=self._extract_outlinks_timeout,
download_throughput=self._download_throughput,
stealth=self._stealth,
)
if final_page_url != page.url:
page.note_redirect(final_page_url)
return outlinks
@ -328,22 +399,21 @@ class BrozzlerWorker:
def _fetch_url(self, site, url=None, page=None):
proxies = None
if page:
url=page.url
url = page.url
if self._proxy_for(site):
proxies = {
'http': 'http://%s' % self._proxy_for(site),
'https': 'http://%s' % self._proxy_for(site),
"http": "http://%s" % self._proxy_for(site),
"https": "http://%s" % self._proxy_for(site),
}
self.logger.info('fetching %s', url)
self.logger.info("fetching %s", url)
try:
# response is ignored
requests.get(
url, proxies=proxies, headers=site.extra_headers(page),
verify=False)
url, proxies=proxies, headers=site.extra_headers(page), verify=False
)
except requests.exceptions.ProxyError as e:
raise brozzler.ProxyError(
'proxy error fetching %s' % url) from e
raise brozzler.ProxyError("proxy error fetching %s" % url) from e
def _needs_browsing(self, page, ydl_fetches):
if ydl_fetches:
@ -351,8 +421,10 @@ class BrozzlerWorker:
if not final_bounces:
return True
for txn in final_bounces:
if txn['response_headers'].get_content_type() in [
'text/html', 'application/xhtml+xml']:
if txn["response_headers"].get_content_type() in [
"text/html",
"application/xhtml+xml",
]:
return True
return False
else:
@ -361,14 +433,13 @@ class BrozzlerWorker:
def _already_fetched(self, page, ydl_fetches):
if ydl_fetches:
for fetch in ydl.final_bounces(ydl_fetches, page.url):
if (fetch['method'] == 'GET' and fetch['response_code'] == 200):
if fetch["method"] == "GET" and fetch["response_code"] == 200:
return True
return False
def brozzle_site(self, browser, site):
try:
site.last_claimed_by = '%s:%s' % (
socket.gethostname(), browser.chrome.port)
site.last_claimed_by = "%s:%s" % (socket.gethostname(), browser.chrome.port)
site.save()
start = time.time()
page = None
@ -377,28 +448,28 @@ class BrozzlerWorker:
# _proxy_for() call in log statement can raise brozzler.ProxyError
# which is why we honor time limit and stop request first☝🏻
self.logger.info(
"brozzling site (proxy=%r) %s",
self._proxy_for(site), site)
"brozzling site (proxy=%r) %s", self._proxy_for(site), site
)
while time.time() - start < self.SITE_SESSION_MINUTES * 60:
site.refresh()
self._frontier.enforce_time_limit(site)
self._frontier.honor_stop_request(site)
page = self._frontier.claim_page(
site, "%s:%s" % (socket.gethostname(), browser.chrome.port)
)
if page.needs_robots_check and not brozzler.is_permitted_by_robots(
site, page.url, self._proxy_for(site)
):
logging.warning("page %s is blocked by robots.txt", page.url)
page.blocked_by_robots = True
self._frontier.completed_page(site, page)
else:
outlinks = self.brozzle_page(
browser, site, page,
enable_youtube_dl=not self._skip_youtube_dl)
browser, site, page, enable_youtube_dl=not self._skip_youtube_dl
)
self._frontier.completed_page(site, page)
self._frontier.scope_and_schedule_outlinks(
site, page, outlinks)
self._frontier.scope_and_schedule_outlinks(site, page, outlinks)
if browser.is_running():
site.cookie_db = browser.chrome.persist_and_read_cookie_db()
@ -418,31 +489,36 @@ class BrozzlerWorker:
except brozzler.ProxyError as e:
if self._warcprox_auto:
logging.error(
'proxy error (site.proxy=%s), will try to choose a '
'healthy instance next time site is brozzled: %s',
site.proxy, e)
"proxy error (site.proxy=%s), will try to choose a "
"healthy instance next time site is brozzled: %s",
site.proxy,
e,
)
site.proxy = None
else:
# using brozzler-worker --proxy, nothing to do but try the
# same proxy again next time
logging.error(
'proxy error (self._proxy=%r)', self._proxy, exc_info=1)
logging.error("proxy error (self._proxy=%r)", self._proxy, exc_info=1)
except:
self.logger.error(
'unexpected exception site=%r page=%r', site, page,
exc_info=True)
"unexpected exception site=%r page=%r", site, page, exc_info=True
)
if page:
page.failed_attempts = (page.failed_attempts or 0) + 1
if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES:
self.logger.info(
'marking page "completed" after %s unexpected '
'exceptions attempting to brozzle %s',
page.failed_attempts, page)
'marking page "completed" after %s unexpected '
"exceptions attempting to brozzle %s",
page.failed_attempts,
page,
)
self._frontier.completed_page(site, page)
page = None
finally:
if start:
site.active_brozzling_time = (site.active_brozzling_time or 0) + time.time() - start
site.active_brozzling_time = (
(site.active_brozzling_time or 0) + time.time() - start
)
self._frontier.disclaim_site(site, page)
def _brozzle_site_thread_target(self, browser, site):
@ -462,21 +538,25 @@ class BrozzlerWorker:
"role": "brozzler-worker",
"ttl": self.HEARTBEAT_INTERVAL * 3,
}
status_info["load"] = 1.0 * self._browser_pool.num_in_use() / self._browser_pool.size
status_info["load"] = (
1.0 * self._browser_pool.num_in_use() / self._browser_pool.size
)
status_info["browser_pool_size"] = self._browser_pool.size
status_info["browsers_in_use"] = self._browser_pool.num_in_use()
try:
self.status_info = self._service_registry.heartbeat(status_info)
self.logger.trace(
"status in service registry: %s", self.status_info)
self.logger.trace("status in service registry: %s", self.status_info)
except r.ReqlError as e:
self.logger.error(
"failed to send heartbeat and update service registry "
"with info %s: %s", status_info, e)
"failed to send heartbeat and update service registry "
"with info %s: %s",
status_info,
e,
)
def _service_heartbeat_if_due(self):
'''Sends service registry heartbeat if due'''
"""Sends service registry heartbeat if due"""
due = False
if self._service_registry:
if not hasattr(self, "status_info"):
@ -489,15 +569,16 @@ class BrozzlerWorker:
self._service_heartbeat()
def _start_browsing_some_sites(self):
'''
"""
Starts browsing some sites.
Raises:
NoBrowsersAvailable if none available
'''
"""
# acquire_multi() raises NoBrowsersAvailable if none available
browsers = self._browser_pool.acquire_multi(
(self._browser_pool.num_available() + 1) // 2)
(self._browser_pool.num_available() + 1) // 2
)
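# i.e. roughly half of the currently idle browsers (rounding up) are claimed
# per call, e.g. 7 idle -> (7 + 1) // 2 == 4 claimed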
try:
sites = self._frontier.claim_sites(len(browsers))
except:
@ -507,10 +588,11 @@ class BrozzlerWorker:
for i in range(len(browsers)):
if i < len(sites):
th = threading.Thread(
target=self._brozzle_site_thread_target,
args=(browsers[i], sites[i]),
name="BrozzlingThread:%s" % browsers[i].chrome.port,
daemon=True)
target=self._brozzle_site_thread_target,
args=(browsers[i], sites[i]),
name="BrozzlingThread:%s" % browsers[i].chrome.port,
daemon=True,
)
with self._browsing_threads_lock:
self._browsing_threads.add(th)
th.start()
@ -519,7 +601,8 @@ class BrozzlerWorker:
def run(self):
self.logger.notice(
'brozzler %s - brozzler-worker starting', brozzler.__version__)
"brozzler %s - brozzler-worker starting", brozzler.__version__
)
last_nothing_to_claim = 0
try:
while not self._shutdown.is_set():
@ -528,39 +611,38 @@ class BrozzlerWorker:
try:
self._start_browsing_some_sites()
except brozzler.browser.NoBrowsersAvailable:
logging.trace(
"all %s browsers are in use",
self._max_browsers)
logging.trace("all %s browsers are in use", self._max_browsers)
except brozzler.NothingToClaim:
last_nothing_to_claim = time.time()
logging.trace(
"nothing to claim, all available active sites "
"are already claimed by a brozzler worker")
"nothing to claim, all available active sites "
"are already claimed by a brozzler worker"
)
time.sleep(0.5)
self.logger.notice("shutdown requested")
except r.ReqlError as e:
self.logger.error(
"caught rethinkdb exception, will try to proceed",
exc_info=True)
"caught rethinkdb exception, will try to proceed", exc_info=True
)
except brozzler.ShutdownRequested:
self.logger.info("shutdown requested")
except:
self.logger.critical(
"thread exiting due to unexpected exception",
exc_info=True)
"thread exiting due to unexpected exception", exc_info=True
)
finally:
if self._service_registry and hasattr(self, "status_info"):
try:
self._service_registry.unregister(self.status_info["id"])
except:
self.logger.error(
"failed to unregister from service registry",
exc_info=True)
"failed to unregister from service registry", exc_info=True
)
self.logger.info(
'shutting down %s brozzling threads',
len(self._browsing_threads))
"shutting down %s brozzling threads", len(self._browsing_threads)
)
with self._browsing_threads_lock:
for th in self._browsing_threads:
if th.is_alive():
@ -575,11 +657,10 @@ class BrozzlerWorker:
with self._start_stop_lock:
if self._thread:
self.logger.warning(
'ignoring start request because self._thread is '
'not None')
"ignoring start request because self._thread is " "not None"
)
return
self._thread = threading.Thread(
target=self.run, name="BrozzlerWorker")
self._thread = threading.Thread(target=self.run, name="BrozzlerWorker")
self._thread.start()
def shutdown_now(self):
@ -590,4 +671,3 @@ class BrozzlerWorker:
def is_alive(self):
return self._thread and self._thread.is_alive()

brozzler/ydl.py

@ -1,4 +1,4 @@
'''
"""
brozzler/ydl.py - youtube-dl / yt-dlp support for brozzler
Copyright (C) 2023 Internet Archive
@ -14,7 +14,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import logging
import yt_dlp
@ -31,6 +31,7 @@ import threading
thread_local = threading.local()
class ExtraHeaderAdder(urllib.request.BaseHandler):
def __init__(self, extra_headers):
self.extra_headers = extra_headers
@ -43,6 +44,7 @@ class ExtraHeaderAdder(urllib.request.BaseHandler):
req.add_header(h, v)
return req
class YoutubeDLSpy(urllib.request.BaseHandler):
logger = logging.getLogger(__module__ + "." + __qualname__)
@ -51,10 +53,10 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
def _http_response(self, request, response):
fetch = {
'url': request.full_url,
'method': request.get_method(),
'response_code': response.code,
'response_headers': response.headers,
"url": request.full_url,
"method": request.get_method(),
"response_code": response.code,
"response_headers": response.headers,
}
self.fetches.append(fetch)
return response
@ -64,6 +66,7 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
def reset(self):
self.fetches = []
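# Illustrative sketch: YoutubeDLSpy is an ordinary urllib handler, so it can be
# exercised outside yt-dlp too. This assumes the parts of the class elided by
# the hunk above (notably __init__ calling reset(), and http_response /
# https_response being aliases for _http_response) match the full
# brozzler/ydl.py.
def _example_spy_fetches(url="http://example.com/"):
    spy = YoutubeDLSpy()
    opener = urllib.request.build_opener(spy)
    opener.open(url)
    # each recorded fetch is a dict like:
    # {"url": "...", "method": "GET", "response_code": 200,
    #  "response_headers": <http.client.HTTPMessage>}
    return spy.fetches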
def final_bounces(fetches, url):
"""
Resolves redirect chains in `fetches` and returns a list of fetches
@ -73,26 +76,28 @@ def final_bounces(fetches, url):
"""
redirects = {}
for fetch in fetches:
# XXX check http status 301,302,303,307? check for "uri" header
# as well as "location"? see urllib.request.HTTPRedirectHandler
if 'location' in fetch['response_headers']:
redirects[fetch['url']] = fetch
# XXX check http status 301,302,303,307? check for "uri" header
# as well as "location"? see urllib.request.HTTPRedirectHandler
if "location" in fetch["response_headers"]:
redirects[fetch["url"]] = fetch
final_url = url
while final_url in redirects:
fetch = redirects.pop(final_url)
final_url = urllib.parse.urljoin(
fetch['url'], fetch['response_headers']['location'])
fetch["url"], fetch["response_headers"]["location"]
)
final_bounces = []
for fetch in fetches:
if fetch['url'] == final_url:
if fetch["url"] == final_url:
final_bounces.append(fetch)
return final_bounces
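# Illustrative sketch: how final_bounces() collapses a redirect chain. The
# fetch dicts below are hypothetical; real response_headers are
# http.client.HTTPMessage objects, but any mapping with a "location" key
# exercises the same lookup.
def _example_final_bounces():
    fetches = [
        {
            "url": "http://example.com/a",
            "method": "GET",
            "response_code": 302,
            "response_headers": {"location": "/b"},
        },
        {
            "url": "http://example.com/b",
            "method": "GET",
            "response_code": 200,
            "response_headers": {},
        },
    ]
    # only the fetch at the end of the redirect chain is returned
    assert final_bounces(fetches, "http://example.com/a") == [fetches[1]]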
def _build_youtube_dl(worker, destdir, site, page):
'''
"""
Builds a yt-dlp `yt_dlp.YoutubeDL` for brozzling `site` with `worker`.
The `YoutubeDL` instance does a few special brozzler-specific things:
@ -109,7 +114,7 @@ def _build_youtube_dl(worker, destdir, site, page):
Returns:
a yt-dlp `yt_dlp.YoutubeDL` instance
'''
"""
class _YoutubeDL(yt_dlp.YoutubeDL):
logger = logging.getLogger(__module__ + "." + __qualname__)
@ -117,31 +122,38 @@ def _build_youtube_dl(worker, destdir, site, page):
def add_default_extra_info(self, ie_result, ie, url):
# hook in some logging
super().add_default_extra_info(ie_result, ie, url)
if ie_result.get('_type') == 'playlist':
self.logger.info(
'extractor %r found playlist in %s', ie.IE_NAME, url)
if ie.IE_NAME in {'youtube:playlist', 'youtube:tab', 'soundcloud:user', 'instagram:user'}:
if ie_result.get("_type") == "playlist":
self.logger.info("extractor %r found playlist in %s", ie.IE_NAME, url)
if ie.IE_NAME in {
"youtube:playlist",
"youtube:tab",
"soundcloud:user",
"instagram:user",
}:
# At this point ie_result['entries'] is an iterator that
# will fetch more metadata from youtube to list all the
# videos. We unroll that iterator here partly because
# otherwise `process_ie_result()` will clobber it, and we
# use it later to extract the watch pages as outlinks.
try:
ie_result['entries_no_dl'] = list(ie_result['entries'])
ie_result["entries_no_dl"] = list(ie_result["entries"])
except Exception as e:
self.logger.warning(
"failed to unroll ie_result['entries']? for %s, %s; exception %s",
ie.IE_NAME, url, e)
ie_result['entries_no_dl'] =[]
ie_result['entries'] = []
"failed to unroll ie_result['entries']? for %s, %s; exception %s",
ie.IE_NAME,
url,
e,
)
ie_result["entries_no_dl"] = []
ie_result["entries"] = []
self.logger.info(
'not downloading %s media files from this '
'playlist because we expect to capture them from '
'individual watch/track/detail pages',
len(ie_result['entries_no_dl']))
"not downloading %s media files from this "
"playlist because we expect to capture them from "
"individual watch/track/detail pages",
len(ie_result["entries_no_dl"]),
)
else:
self.logger.info(
'extractor %r found a download in %s', ie.IE_NAME, url)
self.logger.info("extractor %r found a download in %s", ie.IE_NAME, url)
def _push_video_to_warcprox(self, site, info_dict, postprocessor):
# 220211 update: does yt-dlp supply content-type? no, not as such
@ -150,73 +162,96 @@ def _build_youtube_dl(worker, destdir, site, page):
# youtube-dl produces a stitched-up video that /usr/bin/file fails
# to identify (says "application/octet-stream"). `ffprobe` doesn't
# give us a mimetype.
if info_dict.get('ext') == 'mp4':
mimetype = 'video/mp4'
if info_dict.get("ext") == "mp4":
mimetype = "video/mp4"
else:
try:
import magic
mimetype = magic.from_file(info_dict['filepath'], mime=True)
mimetype = magic.from_file(info_dict["filepath"], mime=True)
except ImportError as e:
mimetype = 'video/%s' % info_dict['ext']
self.logger.warning(
'guessing mimetype %s because %r', mimetype, e)
mimetype = "video/%s" % info_dict["ext"]
self.logger.warning("guessing mimetype %s because %r", mimetype, e)
# youtube watch page postprocessor is MoveFiles
if postprocessor == 'FixupM3u8' or postprocessor == 'Merger':
url = 'youtube-dl:%05d:%s' % (
info_dict.get('playlist_index') or 1,
info_dict['webpage_url'])
if postprocessor == "FixupM3u8" or postprocessor == "Merger":
url = "youtube-dl:%05d:%s" % (
info_dict.get("playlist_index") or 1,
info_dict["webpage_url"],
)
else:
url = info_dict.get('url', '')
url = info_dict.get("url", "")
# skip urls ending .m3u8, to avoid duplicates handled by FixupM3u8
if url.endswith('.m3u8') or url == '':
if url.endswith(".m3u8") or url == "":
return
size = os.path.getsize(info_dict['filepath'])
size = os.path.getsize(info_dict["filepath"])
self.logger.info(
'pushing %r video as %s (%s bytes) to '
'warcprox at %s with url %s', info_dict['format'],
mimetype, size, worker._proxy_for(site), url)
with open(info_dict['filepath'], 'rb') as f:
"pushing %r video as %s (%s bytes) to " "warcprox at %s with url %s",
info_dict["format"],
mimetype,
size,
worker._proxy_for(site),
url,
)
with open(info_dict["filepath"], "rb") as f:
# include content-length header to avoid chunked
# transfer, which warcprox currently rejects
extra_headers = dict(site.extra_headers())
extra_headers['content-length'] = size
extra_headers["content-length"] = size
request, response = worker._warcprox_write_record(
warcprox_address=worker._proxy_for(site), url=url,
warc_type='resource', content_type=mimetype, payload=f,
extra_headers=extra_headers)
warcprox_address=worker._proxy_for(site),
url=url,
warc_type="resource",
content_type=mimetype,
payload=f,
extra_headers=extra_headers,
)
# consulted by _remember_videos()
ydl.pushed_videos.append({
'url': url,
'response_code': response.code,
'content-type': mimetype,
'content-length': size,
})
ydl.pushed_videos.append(
{
"url": url,
"response_code": response.code,
"content-type": mimetype,
"content-length": size,
}
)
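# Note on the record pushed above: it is written as a WARC "resource" record
# whose synthetic url is either the direct media url or, for stitched-up
# downloads handled by the FixupM3u8/Merger postprocessors,
# "youtube-dl:00001:<watch page url>" (playlist index zero-padded to five
# digits), so each video captured from a playlist/watch page gets a distinct
# record url.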
def maybe_heartbeat_site_last_claimed(*args, **kwargs):
# in case yt-dlp takes a long time, heartbeat site.last_claimed
# to prevent another brozzler-worker from claiming the site
try:
if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=worker.SITE_SESSION_MINUTES):
if (
site.rr
and doublethink.utcnow() - site.last_claimed
> datetime.timedelta(minutes=worker.SITE_SESSION_MINUTES)
):
worker.logger.debug(
'heartbeating site.last_claimed to prevent another '
'brozzler-worker claiming this site id=%r', site.id)
"heartbeating site.last_claimed to prevent another "
"brozzler-worker claiming this site id=%r",
site.id,
)
site.last_claimed = doublethink.utcnow()
site.save()
except:
worker.logger.debug(
'problem heartbeating site.last_claimed site id=%r',
site.id, exc_info=True)
"problem heartbeating site.last_claimed site id=%r",
site.id,
exc_info=True,
)
def ydl_postprocess_hook(d):
if d['status'] == 'finished':
worker.logger.info('[ydl_postprocess_hook] Finished postprocessing')
worker.logger.info('[ydl_postprocess_hook] postprocessor: {}'.format(d['postprocessor']))
if d["status"] == "finished":
worker.logger.info("[ydl_postprocess_hook] Finished postprocessing")
worker.logger.info(
"[ydl_postprocess_hook] postprocessor: {}".format(d["postprocessor"])
)
if worker._using_warcprox(site):
_YoutubeDL._push_video_to_warcprox(_YoutubeDL, site, d['info_dict'], d['postprocessor'])
_YoutubeDL._push_video_to_warcprox(
_YoutubeDL, site, d["info_dict"], d["postprocessor"]
)
# default socket_timeout is 20 -- we hit it often when cluster is busy
ydl_opts = {
@ -230,7 +265,6 @@ def _build_youtube_dl(worker, destdir, site, page):
"socket_timeout": 40,
"progress_hooks": [maybe_heartbeat_site_last_claimed],
"postprocessor_hooks": [ydl_postprocess_hook],
# https://github.com/yt-dlp/yt-dlp#format-selection
# "By default, yt-dlp tries to download the best available quality..."
# pre-v.2023.07.06: "format_sort": ["ext"],
@ -238,16 +272,13 @@ def _build_youtube_dl(worker, destdir, site, page):
# recommended: convert working cli to api call with
# https://github.com/yt-dlp/yt-dlp/blob/master/devscripts/cli_to_api.py
"format": "b/bv+ba",
"format_sort": ["res:720","vcodec:h264","acodec:aac"],
"format_sort": ["res:720", "vcodec:h264", "acodec:aac"],
# skip live streams
"match_filter": match_filter_func("!is_live"),
"extractor_args": {'youtube': {'skip': ['dash', 'hls']}},
"extractor_args": {"youtube": {"skip": ["dash", "hls"]}},
# --cache-dir local or..
# this looked like a problem with nfs-mounted homedir, shouldn't be a problem for brozzler on focal?
"cache_dir": "/home/archiveit",
"logger": logging.getLogger("yt_dlp"),
"verbose": False,
"quiet": False,
@ -265,49 +296,53 @@ def _build_youtube_dl(worker, destdir, site, page):
ydl._opener.add_handler(ydl.fetch_spy)
return ydl
def _remember_videos(page, fetches, pushed_videos=None):
'''
"""
Saves info about videos captured by yt-dlp in `page.videos`.
'''
if not 'videos' in page:
"""
if not "videos" in page:
page.videos = []
for fetch in fetches or []:
content_type = fetch['response_headers'].get_content_type()
if (content_type.startswith('video/')
# skip manifests of DASH segmented video -
# see https://github.com/internetarchive/brozzler/pull/70
and content_type != 'video/vnd.mpeg.dash.mpd'
and fetch['method'] == 'GET'
and fetch['response_code'] in (200, 206)):
content_type = fetch["response_headers"].get_content_type()
if (
content_type.startswith("video/")
# skip manifests of DASH segmented video -
# see https://github.com/internetarchive/brozzler/pull/70
and content_type != "video/vnd.mpeg.dash.mpd"
and fetch["method"] == "GET"
and fetch["response_code"] in (200, 206)
):
video = {
'blame': 'youtube-dl',
'url': fetch['url'],
'response_code': fetch['response_code'],
'content-type': content_type,
"blame": "youtube-dl",
"url": fetch["url"],
"response_code": fetch["response_code"],
"content-type": content_type,
}
if 'content-length' in fetch['response_headers']:
video['content-length'] = int(
fetch['response_headers']['content-length'])
if 'content-range' in fetch['response_headers']:
if "content-length" in fetch["response_headers"]:
video["content-length"] = int(
fetch["response_headers"]["content-length"]
)
if "content-range" in fetch["response_headers"]:
# skip chunked youtube video
if 'googlevideo.com/videoplayback' in fetch['url']:
if "googlevideo.com/videoplayback" in fetch["url"]:
continue
video['content-range'] = fetch[
'response_headers']['content-range']
logging.debug('embedded video %s', video)
video["content-range"] = fetch["response_headers"]["content-range"]
logging.debug("embedded video %s", video)
page.videos.append(video)
for pushed_video in pushed_videos or []:
if pushed_video['content-type'].startswith('video/'):
if pushed_video["content-type"].startswith("video/"):
video = {
'blame': 'youtube-dl',
'url': pushed_video['url'],
'response_code': pushed_video['response_code'],
'content-type': pushed_video['content-type'],
'content-length': pushed_video['content-length'],
"blame": "youtube-dl",
"url": pushed_video["url"],
"response_code": pushed_video["response_code"],
"content-type": pushed_video["content-type"],
"content-length": pushed_video["content-length"],
}
logging.debug('embedded video %s', video)
logging.debug("embedded video %s", video)
page.videos.append(video)
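# Illustrative sketch: after _remember_videos() runs, each page.videos entry is
# a plain dict. A hypothetical captured video fetch ends up looking like:
#
#     {
#         "blame": "youtube-dl",
#         "url": "http://example.com/media/small.mp4",
#         "response_code": 200,
#         "content-type": "video/mp4",
#         "content-length": 383631,
#     }
#
# with "content-range" added for 206 partial responses, and videos pushed to
# warcprox recorded with the same keys.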
def _try_youtube_dl(worker, ydl, site, page):
try:
logging.info("trying yt-dlp on %s", page)
@ -317,43 +352,53 @@ def _try_youtube_dl(worker, ydl, site, page):
# no host given>" resulting in ProxyError
# needs automated test
# and yt-dlp needs sanitize_info for extract_info
ie_result = ydl.sanitize_info(ydl.extract_info(str(urlcanon.whatwg(page.url))))
ie_result = ydl.sanitize_info(
ydl.extract_info(str(urlcanon.whatwg(page.url)))
)
_remember_videos(page, ydl.fetch_spy.fetches, ydl.pushed_videos)
if worker._using_warcprox(site):
info_json = json.dumps(ie_result, sort_keys=True, indent=4)
logging.info(
"sending WARCPROX_WRITE_RECORD request to warcprox "
"with yt-dlp json for %s", page)
"sending WARCPROX_WRITE_RECORD request to warcprox "
"with yt-dlp json for %s",
page,
)
worker._warcprox_write_record(
warcprox_address=worker._proxy_for(site),
url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
warc_type="metadata",
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
payload=info_json.encode("utf-8"),
extra_headers=site.extra_headers(page))
warcprox_address=worker._proxy_for(site),
url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
warc_type="metadata",
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
payload=info_json.encode("utf-8"),
extra_headers=site.extra_headers(page),
)
return ie_result
except brozzler.ShutdownRequested as e:
raise
except Exception as e:
if hasattr(e, "exc_info") and e.exc_info[0] == yt_dlp.utils.UnsupportedError:
return None
elif (hasattr(e, "exc_info")
and e.exc_info[0] == urllib.error.HTTPError
and hasattr(e.exc_info[1], "code")
and e.exc_info[1].code == 420):
elif (
hasattr(e, "exc_info")
and e.exc_info[0] == urllib.error.HTTPError
and hasattr(e.exc_info[1], "code")
and e.exc_info[1].code == 420
):
raise brozzler.ReachedLimit(e.exc_info[1])
elif (hasattr(e, 'exc_info')
and e.exc_info[0] == urllib.error.URLError
and worker._proxy_for(site)):
elif (
hasattr(e, "exc_info")
and e.exc_info[0] == urllib.error.URLError
and worker._proxy_for(site)
):
# connection problem when using a proxy == proxy error (XXX?)
raise brozzler.ProxyError(
'yt-dlp hit apparent proxy error from '
'%s' % page.url) from e
"yt-dlp hit apparent proxy error from " "%s" % page.url
) from e
else:
raise
def do_youtube_dl(worker, site, page):
'''
"""
Runs yt-dlp configured for `worker` and `site` to download videos from
`page`.
@ -372,15 +417,19 @@ def do_youtube_dl(worker, site, page):
'response_headers': ...,
}, ...]
`list` of `str`: outlink urls
'''
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
"""
with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
ydl = _build_youtube_dl(worker, tempdir, site, page)
ie_result = _try_youtube_dl(worker, ydl, site, page)
outlinks = set()
if ie_result and (ie_result.get('extractor') == 'youtube:playlist' or
ie_result.get('extractor') == 'youtube:tab'):
if ie_result and (
ie_result.get("extractor") == "youtube:playlist"
or ie_result.get("extractor") == "youtube:tab"
):
# youtube watch pages as outlinks
outlinks = {'https://www.youtube.com/watch?v=%s' % e['id']
for e in ie_result.get('entries_no_dl', [])}
outlinks = {
"https://www.youtube.com/watch?v=%s" % e["id"]
for e in ie_result.get("entries_no_dl", [])
}
# any outlinks for other cases?
return ydl.fetch_spy.fetches, outlinks
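# Illustrative sketch of a hypothetical caller, showing the shape of the two
# return values documented above (brozzler's worker is the intended caller,
# invoking this while brozzling a page):
def _example_do_youtube_dl(worker, site, page):
    fetches, outlinks = do_youtube_dl(worker, site, page)
    media_urls = [f["url"] for f in fetches if f["response_code"] in (200, 206)]
    return media_urls, sorted(outlinks)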

setup.py

@ -1,5 +1,5 @@
#!/usr/bin/env python
'''
"""
setup.py - brozzler setup script
Copyright (C) 2014-2024 Internet Archive
@ -15,89 +15,88 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import setuptools
import os
def find_package_data(package):
pkg_data = []
depth = len(package.split('.'))
path = os.path.join(*package.split('.'))
depth = len(package.split("."))
path = os.path.join(*package.split("."))
for dirpath, dirnames, filenames in os.walk(path):
if not os.path.exists(os.path.join(dirpath, '__init__.py')):
if not os.path.exists(os.path.join(dirpath, "__init__.py")):
relpath = os.path.join(*dirpath.split(os.sep)[depth:])
pkg_data.extend(os.path.join(relpath, f) for f in filenames)
return pkg_data
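# Illustrative sketch: find_package_data("brozzler.dashboard") walks
# brozzler/dashboard/ and returns paths, relative to the package directory, for
# files under subdirectories that are not themselves python packages (no
# __init__.py), i.e. the static dashboard assets. Example output (file names
# hypothetical):
#
#     find_package_data("brozzler.dashboard")
#     # -> ["static/js/app.js", "static/css/app.css", ...]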
setuptools.setup(
name='brozzler',
version='1.5.44',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',
author_email='nlevitt@archive.org',
long_description=open('README.rst', mode='rb').read().decode('UTF-8'),
license='Apache License 2.0',
packages=['brozzler', 'brozzler.dashboard'],
package_data={
'brozzler': [
'js-templates/*.js*', 'behaviors.yaml', 'job_schema.yaml'],
'brozzler.dashboard': find_package_data('brozzler.dashboard'),
},
entry_points={
'console_scripts': [
'brozzle-page=brozzler.cli:brozzle_page',
'brozzler-new-job=brozzler.cli:brozzler_new_job',
'brozzler-new-site=brozzler.cli:brozzler_new_site',
'brozzler-worker=brozzler.cli:brozzler_worker',
'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables',
'brozzler-list-captures=brozzler.cli:brozzler_list_captures',
'brozzler-list-jobs=brozzler.cli:brozzler_list_jobs',
'brozzler-list-sites=brozzler.cli:brozzler_list_sites',
'brozzler-list-pages=brozzler.cli:brozzler_list_pages',
'brozzler-stop-crawl=brozzler.cli:brozzler_stop_crawl',
'brozzler-purge=brozzler.cli:brozzler_purge',
'brozzler-dashboard=brozzler.dashboard:main',
'brozzler-easy=brozzler.easy:main',
'brozzler-wayback=brozzler.pywb:main',
],
},
install_requires=[
'PyYAML>=5.1',
'yt_dlp<2023.11.16',
'reppy==0.3.4',
'requests>=2.21',
'websocket-client>=0.39.0,<=0.48.0',
'pillow>=5.2.0',
'urlcanon>=0.1.dev23',
'doublethink @ git+https://github.com/internetarchive/doublethink.git@Py311',
'rethinkdb<2.4.10',
'cerberus>=1.0.1',
'jinja2>=2.10',
'cryptography>=2.3',
'python-magic>=0.4.15',
name="brozzler",
version="1.5.44",
description="Distributed web crawling with browsers",
url="https://github.com/internetarchive/brozzler",
author="Noah Levitt",
author_email="nlevitt@archive.org",
long_description=open("README.rst", mode="rb").read().decode("UTF-8"),
license="Apache License 2.0",
packages=["brozzler", "brozzler.dashboard"],
package_data={
"brozzler": ["js-templates/*.js*", "behaviors.yaml", "job_schema.yaml"],
"brozzler.dashboard": find_package_data("brozzler.dashboard"),
},
entry_points={
"console_scripts": [
"brozzle-page=brozzler.cli:brozzle_page",
"brozzler-new-job=brozzler.cli:brozzler_new_job",
"brozzler-new-site=brozzler.cli:brozzler_new_site",
"brozzler-worker=brozzler.cli:brozzler_worker",
"brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables",
"brozzler-list-captures=brozzler.cli:brozzler_list_captures",
"brozzler-list-jobs=brozzler.cli:brozzler_list_jobs",
"brozzler-list-sites=brozzler.cli:brozzler_list_sites",
"brozzler-list-pages=brozzler.cli:brozzler_list_pages",
"brozzler-stop-crawl=brozzler.cli:brozzler_stop_crawl",
"brozzler-purge=brozzler.cli:brozzler_purge",
"brozzler-dashboard=brozzler.dashboard:main",
"brozzler-easy=brozzler.easy:main",
"brozzler-wayback=brozzler.pywb:main",
],
extras_require={
'dashboard': [
'flask>=1.0',
'gunicorn>=19.8.1'
],
'easy': [
'warcprox>=2.4.31',
'pywb>=0.33.2,<2',
'flask>=1.0',
'gunicorn>=19.8.1'
],
},
zip_safe=False,
classifiers=[
'Development Status :: 5 - Production/Stable',
'Environment :: Console',
'License :: OSI Approved :: Apache Software License',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Topic :: Internet :: WWW/HTTP',
'Topic :: System :: Archiving',
])
},
install_requires=[
"PyYAML>=5.1",
"yt_dlp<2023.11.16",
"reppy==0.3.4",
"requests>=2.21",
"websocket-client>=0.39.0,<=0.48.0",
"pillow>=5.2.0",
"urlcanon>=0.1.dev23",
"doublethink @ git+https://github.com/internetarchive/doublethink.git@Py311",
"rethinkdb<2.4.10",
"cerberus>=1.0.1",
"jinja2>=2.10",
"cryptography>=2.3",
"python-magic>=0.4.15",
],
extras_require={
"dashboard": ["flask>=1.0", "gunicorn>=19.8.1"],
"easy": [
"warcprox>=2.4.31",
"pywb>=0.33.2,<2",
"flask>=1.0",
"gunicorn>=19.8.1",
],
},
zip_safe=False,
classifiers=[
"Development Status :: 5 - Production/Stable",
"Environment :: Console",
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Topic :: Internet :: WWW/HTTP",
"Topic :: System :: Archiving",
],
)
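# A hedged usage note: with the extras_require above, the optional components
# install via standard pip extras, e.g.
#
#     pip install .              # core brozzler
#     pip install .[dashboard]   # adds flask/gunicorn for brozzler-dashboard
#     pip install .[easy]        # adds warcprox/pywb/flask/gunicorn for brozzler-easy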

test_brozzling.py

@ -1,5 +1,5 @@
#!/usr/bin/env python
'''
"""
test_brozzling.py - XXX explain
Copyright (C) 2016-2018 Internet Archive
@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import pytest
import brozzler
@ -34,79 +34,81 @@ args.log_level = logging.INFO
brozzler.cli.configure_logging(args)
WARCPROX_META_420 = {
'stats': {
'test_limits_bucket': {
'total': {'urls': 0, 'wire_bytes': 0},
'new': {'urls': 0, 'wire_bytes': 0},
'revisit': {'urls': 0, 'wire_bytes': 0},
'bucket': 'test_limits_bucket'
"stats": {
"test_limits_bucket": {
"total": {"urls": 0, "wire_bytes": 0},
"new": {"urls": 0, "wire_bytes": 0},
"revisit": {"urls": 0, "wire_bytes": 0},
"bucket": "test_limits_bucket",
}
},
'reached-limit': {'test_limits_bucket/total/urls': 0}
"reached-limit": {"test_limits_bucket/total/urls": 0},
}
@pytest.fixture(scope='module')
@pytest.fixture(scope="module")
def httpd(request):
class RequestHandler(http.server.SimpleHTTPRequestHandler):
def __init__(self, *args, **kwargs):
self.extensions_map['.mpd'] = 'video/vnd.mpeg.dash.mpd'
self.extensions_map[".mpd"] = "video/vnd.mpeg.dash.mpd"
http.server.SimpleHTTPRequestHandler.__init__(self, *args, **kwargs)
def do_GET(self):
if self.path == '/420':
self.send_response(420, 'Reached limit')
self.send_header('Connection', 'close')
self.send_header('Warcprox-Meta', json.dumps(WARCPROX_META_420))
payload = b'request rejected by warcprox: reached limit test_limits_bucket/total/urls=0\n'
self.send_header('Content-Type', 'text/plain;charset=utf-8')
self.send_header('Content-Length', len(payload))
if self.path == "/420":
self.send_response(420, "Reached limit")
self.send_header("Connection", "close")
self.send_header("Warcprox-Meta", json.dumps(WARCPROX_META_420))
payload = b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=0\n"
self.send_header("Content-Type", "text/plain;charset=utf-8")
self.send_header("Content-Length", len(payload))
self.end_headers()
self.wfile.write(payload)
elif self.path == '/401':
elif self.path == "/401":
self.send_response(401)
self.send_header('WWW-Authenticate', 'Basic realm=\"Test\"')
self.send_header('Content-type', 'text/html')
self.send_header("WWW-Authenticate", 'Basic realm="Test"')
self.send_header("Content-type", "text/html")
self.end_headers()
self.wfile.write(self.headers.get('Authorization', b''))
self.wfile.write(b'not authenticated')
self.wfile.write(self.headers.get("Authorization", b""))
self.wfile.write(b"not authenticated")
else:
super().do_GET()
def do_POST(self):
if self.path == '/login-action':
if self.path == "/login-action":
self.send_response(200)
payload = b'login successful\n'
self.send_header('Content-Type', 'text/plain;charset=utf-8')
self.send_header('Content-Length', len(payload))
payload = b"login successful\n"
self.send_header("Content-Type", "text/plain;charset=utf-8")
self.send_header("Content-Length", len(payload))
self.end_headers()
self.wfile.write(payload)
else:
super().do_POST()
# SimpleHTTPRequestHandler always uses CWD so we have to chdir
os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
os.chdir(os.path.join(os.path.dirname(__file__), "htdocs"))
httpd = http.server.HTTPServer(('localhost', 0), RequestHandler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
httpd = http.server.HTTPServer(("localhost", 0), RequestHandler)
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
httpd_thread.start()
def fin():
httpd.shutdown()
httpd.server_close()
httpd_thread.join()
request.addfinalizer(fin)
return httpd
def test_httpd(httpd):
'''
"""
Tests that our http server is working as expected, and that two fetches
of the same url return the same payload, proving it can be used to test
deduplication.
'''
"""
payload1 = content2 = None
url = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
url = "http://localhost:%s/site1/file1.txt" % httpd.server_port
with urllib.request.urlopen(url) as response:
assert response.status == 200
payload1 = response.read()
@ -119,123 +121,136 @@ def test_httpd(httpd):
assert payload1 == payload2
url = 'http://localhost:%s/420' % httpd.server_port
url = "http://localhost:%s/420" % httpd.server_port
with pytest.raises(urllib.error.HTTPError) as excinfo:
urllib.request.urlopen(url)
assert excinfo.value.getcode() == 420
def test_aw_snap_hes_dead_jim():
chrome_exe = brozzler.suggest_default_chrome_exe()
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
with pytest.raises(brozzler.BrowsingException):
browser.browse_page('chrome://crash')
browser.browse_page("chrome://crash")
# chromium's 401 handling changed???
@pytest.mark.xfail
def test_page_interstitial_exception(httpd):
chrome_exe = brozzler.suggest_default_chrome_exe()
url = 'http://localhost:%s/401' % httpd.server_port
url = "http://localhost:%s/401" % httpd.server_port
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
with pytest.raises(brozzler.PageInterstitialShown):
browser.browse_page(url)
def test_on_response(httpd):
response_urls = []
def on_response(msg):
response_urls.append(msg['params']['response']['url'])
response_urls.append(msg["params"]["response"]["url"])
chrome_exe = brozzler.suggest_default_chrome_exe()
url = 'http://localhost:%s/site3/page.html' % httpd.server_port
url = "http://localhost:%s/site3/page.html" % httpd.server_port
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
browser.browse_page(url, on_response=on_response)
assert response_urls[0] == 'http://localhost:%s/site3/page.html' % httpd.server_port
assert response_urls[1] == 'http://localhost:%s/site3/brozzler.svg' % httpd.server_port
assert response_urls[2] == 'http://localhost:%s/favicon.ico' % httpd.server_port
assert response_urls[0] == "http://localhost:%s/site3/page.html" % httpd.server_port
assert (
response_urls[1] == "http://localhost:%s/site3/brozzler.svg" % httpd.server_port
)
assert response_urls[2] == "http://localhost:%s/favicon.ico" % httpd.server_port
def test_420(httpd):
chrome_exe = brozzler.suggest_default_chrome_exe()
url = 'http://localhost:%s/420' % httpd.server_port
url = "http://localhost:%s/420" % httpd.server_port
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
with pytest.raises(brozzler.ReachedLimit) as excinfo:
browser.browse_page(url)
assert excinfo.value.warcprox_meta == WARCPROX_META_420
def test_js_dialogs(httpd):
chrome_exe = brozzler.suggest_default_chrome_exe()
url = 'http://localhost:%s/site4/alert.html' % httpd.server_port
url = "http://localhost:%s/site4/alert.html" % httpd.server_port
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
# before commit d2ed6b97a24 these would hang and eventually raise
# brozzler.browser.BrowsingTimeout, which would cause this test to fail
browser.browse_page("http://localhost:%s/site4/alert.html" % httpd.server_port)
browser.browse_page(
'http://localhost:%s/site4/alert.html' % httpd.server_port)
browser.browse_page(
'http://localhost:%s/site4/confirm.html' % httpd.server_port)
browser.browse_page(
'http://localhost:%s/site4/prompt.html' % httpd.server_port)
"http://localhost:%s/site4/confirm.html" % httpd.server_port
)
browser.browse_page("http://localhost:%s/site4/prompt.html" % httpd.server_port)
# XXX print dialog unresolved
# browser.browse_page(
# 'http://localhost:%s/site4/print.html' % httpd.server_port)
def test_page_videos(httpd):
# test depends on behavior of youtube-dl and chromium, could fail and need
# to be adjusted on youtube-dl or chromium updates
chrome_exe = brozzler.suggest_default_chrome_exe()
worker = brozzler.BrozzlerWorker(None)
site = brozzler.Site(None, {})
page = brozzler.Page(None, {
'url':'http://localhost:%s/site6/' % httpd.server_port})
page = brozzler.Page(
None, {"url": "http://localhost:%s/site6/" % httpd.server_port}
)
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
worker.brozzle_page(browser, site, page)
assert page.videos
assert len(page.videos) == 4
assert page.videos[0] == {
'blame': 'youtube-dl',
'response_code': 200,
'content-length': 383631,
'content-type': 'video/mp4',
'url': 'http://localhost:%s/site6/small.mp4' % httpd.server_port,
"blame": "youtube-dl",
"response_code": 200,
"content-length": 383631,
"content-type": "video/mp4",
"url": "http://localhost:%s/site6/small.mp4" % httpd.server_port,
}
assert page.videos[1] == {
'blame': 'youtube-dl',
'content-length': 92728,
'content-type': 'video/webm',
'response_code': 200,
'url': 'http://localhost:%s/site6/small-video_280x160_100k.webm' % httpd.server_port
"blame": "youtube-dl",
"content-length": 92728,
"content-type": "video/webm",
"response_code": 200,
"url": "http://localhost:%s/site6/small-video_280x160_100k.webm"
% httpd.server_port,
}
assert page.videos[2] == {
'blame': 'youtube-dl',
'content-length': 101114,
'content-type': 'video/webm',
'response_code': 200,
'url': 'http://localhost:%s/site6/small-audio.webm' % httpd.server_port
"blame": "youtube-dl",
"content-length": 101114,
"content-type": "video/webm",
"response_code": 200,
"url": "http://localhost:%s/site6/small-audio.webm" % httpd.server_port,
}
assert page.videos[3] == {
'blame': 'browser',
"blame": "browser",
# 'response_code': 206,
# 'content-range': 'bytes 0-229454/229455',
'response_code': 200,
'content-length': 229455,
'content-type': 'video/webm',
'url': 'http://localhost:%s/site6/small.webm' % httpd.server_port,
"response_code": 200,
"content-length": 229455,
"content-type": "video/webm",
"url": "http://localhost:%s/site6/small.webm" % httpd.server_port,
}
def test_extract_outlinks(httpd):
chrome_exe = brozzler.suggest_default_chrome_exe()
worker = brozzler.BrozzlerWorker(None)
site = brozzler.Site(None, {})
page = brozzler.Page(None, {
'url':'http://localhost:%s/site8/' % httpd.server_port})
page = brozzler.Page(
None, {"url": "http://localhost:%s/site8/" % httpd.server_port}
)
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
outlinks = worker.brozzle_page(browser, site, page)
assert outlinks == {
'http://example.com/offsite',
'http://localhost:%s/site8/baz/zuh' % httpd.server_port,
'http://localhost:%s/site8/fdjisapofdjisap#1' % httpd.server_port,
'http://localhost:%s/site8/fdjisapofdjisap#2' % httpd.server_port
"http://example.com/offsite",
"http://localhost:%s/site8/baz/zuh" % httpd.server_port,
"http://localhost:%s/site8/fdjisapofdjisap#1" % httpd.server_port,
"http://localhost:%s/site8/fdjisapofdjisap#2" % httpd.server_port,
}
def test_proxy_down():
'''
"""
Test that browsing raises `brozzler.ProxyError` when proxy is down.
See also `test_proxy_down` in test_units.py.
@ -243,40 +258,41 @@ def test_proxy_down():
Tests two different kinds of connection error:
- nothing listening on the port (nobody listens on port 4 :))
- port bound but not accepting connections
'''
"""
sock = socket.socket()
sock.bind(('127.0.0.1', 0))
for not_listening_proxy in (
'127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]):
site = brozzler.Site(None, {'seed':'http://example.com/'})
page = brozzler.Page(None, {'url': 'http://example.com/'})
sock.bind(("127.0.0.1", 0))
for not_listening_proxy in ("127.0.0.1:4", "127.0.0.1:%s" % sock.getsockname()[1]):
site = brozzler.Site(None, {"seed": "http://example.com/"})
page = brozzler.Page(None, {"url": "http://example.com/"})
worker = brozzler.BrozzlerWorker(
frontier=None, proxy=not_listening_proxy)
worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
chrome_exe = brozzler.suggest_default_chrome_exe()
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
with pytest.raises(brozzler.ProxyError):
worker.brozzle_page(browser, site, page)
def test_try_login(httpd):
"""Test try_login behavior.
"""
"""Test try_login behavior."""
response_urls = []
def on_response(msg):
response_urls.append(msg['params']['response']['url'])
response_urls.append(msg["params"]["response"]["url"])
chrome_exe = brozzler.suggest_default_chrome_exe()
form_url = 'http://localhost:%s/site11/form1.html' % httpd.server_port
form_url_other = 'http://localhost:%s/site11/form2.html' % httpd.server_port
favicon_url = 'http://localhost:%s/favicon.ico' % httpd.server_port
login_url = 'http://localhost:%s/login-action' % httpd.server_port
form_url = "http://localhost:%s/site11/form1.html" % httpd.server_port
form_url_other = "http://localhost:%s/site11/form2.html" % httpd.server_port
favicon_url = "http://localhost:%s/favicon.ico" % httpd.server_port
login_url = "http://localhost:%s/login-action" % httpd.server_port
# When username and password are defined and initial page has login form,
# detect login form, submit login, and then return to the initial page.
username = 'user1'
password = 'pass1'
username = "user1"
password = "pass1"
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
browser.browse_page(form_url, username=username, password=password,
on_response=on_response)
browser.browse_page(
form_url, username=username, password=password, on_response=on_response
)
assert len(response_urls) == 4
assert response_urls[0] == form_url
assert response_urls[1] == favicon_url
@ -285,11 +301,15 @@ def test_try_login(httpd):
# We are now supporting a different type of form, we'll test that here.
response_urls = []
username = 'user1'
password = 'pass1'
username = "user1"
password = "pass1"
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
browser.browse_page(form_url_other, username=username, password=password,
on_response=on_response)
browser.browse_page(
form_url_other,
username=username,
password=password,
on_response=on_response,
)
assert len(response_urls) == 4
assert response_urls[0] == form_url_other
assert response_urls[1] == favicon_url
@ -306,10 +326,16 @@ def test_try_login(httpd):
# when the page doesn't have a form with username/password, don't submit it
response_urls = []
form_without_login_url = 'http://localhost:%s/site11/form-no-login.html' % httpd.server_port
form_without_login_url = (
"http://localhost:%s/site11/form-no-login.html" % httpd.server_port
)
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
browser.browse_page(form_without_login_url, username=username,
password=password, on_response=on_response)
browser.browse_page(
form_without_login_url,
username=username,
password=password,
on_response=on_response,
)
assert len(response_urls) == 2
assert response_urls[0] == form_without_login_url
assert response_urls[1] == favicon_url

test_cli.py

@ -1,5 +1,5 @@
#!/usr/bin/env python
'''
"""
test_cli.py - test brozzler commands
Copyright (C) 2017 Internet Archive
@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import brozzler.cli
import pkg_resources
@ -23,59 +23,62 @@ import pytest
import subprocess
import doublethink
def cli_commands():
commands = set(pkg_resources.get_entry_map(
'brozzler')['console_scripts'].keys())
commands.remove('brozzler-wayback')
commands = set(pkg_resources.get_entry_map("brozzler")["console_scripts"].keys())
commands.remove("brozzler-wayback")
try:
import gunicorn
except ImportError:
commands.remove('brozzler-dashboard')
commands.remove("brozzler-dashboard")
try:
import pywb
except ImportError:
commands.remove('brozzler-easy')
commands.remove("brozzler-easy")
return commands
@pytest.mark.parametrize('cmd', cli_commands())
@pytest.mark.parametrize("cmd", cli_commands())
def test_call_entrypoint(capsys, cmd):
entrypoint = pkg_resources.get_entry_map(
'brozzler')['console_scripts'][cmd]
entrypoint = pkg_resources.get_entry_map("brozzler")["console_scripts"][cmd]
callable = entrypoint.resolve()
with pytest.raises(SystemExit):
callable(['/whatever/bin/%s' % cmd, '--version'])
callable(["/whatever/bin/%s" % cmd, "--version"])
out, err = capsys.readouterr()
assert out == 'brozzler %s - %s\n' % (brozzler.__version__, cmd)
assert err == ''
assert out == "brozzler %s - %s\n" % (brozzler.__version__, cmd)
assert err == ""
@pytest.mark.parametrize('cmd', cli_commands())
@pytest.mark.parametrize("cmd", cli_commands())
def test_run_command(capsys, cmd):
proc = subprocess.Popen(
[cmd, '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
[cmd, "--version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
out, err = proc.communicate()
assert err == b''
assert out == ('brozzler %s - %s\n' % (
brozzler.__version__, cmd)).encode('ascii')
assert err == b""
assert out == ("brozzler %s - %s\n" % (brozzler.__version__, cmd)).encode("ascii")
def test_rethinkdb_up():
'''Check that rethinkdb is up and running.'''
"""Check that rethinkdb is up and running."""
# check that rethinkdb is listening and looks sane
rr = doublethink.Rethinker(db='rethinkdb') # built-in db
rr = doublethink.Rethinker(db="rethinkdb") # built-in db
tbls = rr.table_list().run()
assert len(tbls) > 10
# XXX don't know why this test is failing in travis-ci and vagrant while
# test_call_entrypoint tests pass :( (also fails with capfd)
@pytest.mark.xfail
def test_stop_nonexistent_crawl(capsys):
with pytest.raises(SystemExit):
brozzler.cli.brozzler_stop_crawl(['brozzler-stop-crawl', '--site=123'])
brozzler.cli.brozzler_stop_crawl(["brozzler-stop-crawl", "--site=123"])
out, err = capsys.readouterr()
assert err.endswith('site not found with id=123\n')
assert out == ''
assert err.endswith("site not found with id=123\n")
assert out == ""
with pytest.raises(SystemExit):
brozzler.cli.brozzler_stop_crawl(['brozzler-stop-crawl', '--job=abc'])
brozzler.cli.brozzler_stop_crawl(["brozzler-stop-crawl", "--job=abc"])
out, err = capsys.readouterr()
assert err.endswith('''job not found with id='abc'\n''')
assert out == ''
assert err.endswith("""job not found with id='abc'\n""")
assert out == ""

File diff suppressed because it is too large

File diff suppressed because it is too large

test_units.py

@ -1,5 +1,5 @@
#!/usr/bin/env python
'''
"""
test_units.py - some unit tests for parts of brozzler amenable to that
Copyright (C) 2016-2017 Internet Archive
@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import pytest
import http.server
@ -37,99 +37,131 @@ import threading
from unittest import mock
logging.basicConfig(
stream=sys.stderr, level=logging.INFO, format=(
'%(asctime)s %(process)d %(levelname)s %(threadName)s '
'%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s'))
stream=sys.stderr,
level=logging.INFO,
format=(
"%(asctime)s %(process)d %(levelname)s %(threadName)s "
"%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s"
),
)
@pytest.fixture(scope='module')
@pytest.fixture(scope="module")
def httpd(request):
# SimpleHTTPRequestHandler always uses CWD so we have to chdir
os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
os.chdir(os.path.join(os.path.dirname(__file__), "htdocs"))
httpd = http.server.HTTPServer(
('localhost', 0), http.server.SimpleHTTPRequestHandler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
("localhost", 0), http.server.SimpleHTTPRequestHandler
)
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
httpd_thread.start()
def fin():
httpd.shutdown()
httpd.server_close()
httpd_thread.join()
request.addfinalizer(fin)
return httpd
def test_robots(httpd):
'''
"""
Basic test of robots.txt user-agent substring matching.
'''
url = 'http://localhost:%s/' % httpd.server_port
site = brozzler.Site(None, {'seed':url,'user_agent':'im/a/GoOdbot/yep'})
"""
url = "http://localhost:%s/" % httpd.server_port
site = brozzler.Site(None, {"seed": url, "user_agent": "im/a/GoOdbot/yep"})
assert brozzler.is_permitted_by_robots(site, url)
site = brozzler.Site(None, {'seed':url,'user_agent':'im/a bAdBOt/uh huh'})
site = brozzler.Site(None, {"seed": url, "user_agent": "im/a bAdBOt/uh huh"})
assert not brozzler.is_permitted_by_robots(site, url)
def test_robots_http_statuses():
for status in (
200, 204, 400, 401, 402, 403, 404, 405,
500, 501, 502, 503, 504, 505):
200,
204,
400,
401,
402,
403,
404,
405,
500,
501,
502,
503,
504,
505,
):
class Handler(http.server.BaseHTTPRequestHandler):
def do_GET(self):
response = (('HTTP/1.1 %s Meaningless message\r\n'
+ 'Content-length: 0\r\n'
+ '\r\n') % status).encode('utf-8')
response = (
(
"HTTP/1.1 %s Meaningless message\r\n"
+ "Content-length: 0\r\n"
+ "\r\n"
)
% status
).encode("utf-8")
self.connection.sendall(response)
# self.send_response(status)
# self.end_headers()
httpd = http.server.HTTPServer(('localhost', 0), Handler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
httpd = http.server.HTTPServer(("localhost", 0), Handler)
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
httpd_thread.start()
try:
url = 'http://localhost:%s/' % httpd.server_port
site = brozzler.Site(None, {'seed': url})
url = "http://localhost:%s/" % httpd.server_port
site = brozzler.Site(None, {"seed": url})
assert brozzler.is_permitted_by_robots(site, url)
finally:
httpd.shutdown()
httpd.server_close()
httpd_thread.join()
def test_robots_empty_response():
class Handler(http.server.BaseHTTPRequestHandler):
def do_GET(self):
self.connection.shutdown(socket.SHUT_RDWR)
self.connection.close()
httpd = http.server.HTTPServer(('localhost', 0), Handler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
httpd = http.server.HTTPServer(("localhost", 0), Handler)
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
httpd_thread.start()
try:
url = 'http://localhost:%s/' % httpd.server_port
site = brozzler.Site(None, {'seed': url})
url = "http://localhost:%s/" % httpd.server_port
site = brozzler.Site(None, {"seed": url})
assert brozzler.is_permitted_by_robots(site, url)
finally:
httpd.shutdown()
httpd.server_close()
httpd_thread.join()
def test_robots_socket_timeout():
stop_hanging = threading.Event()
class Handler(http.server.BaseHTTPRequestHandler):
def do_GET(self):
stop_hanging.wait(60)
self.connection.sendall(
b'HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n')
self.connection.sendall(b"HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n")
orig_timeout = brozzler.robots._SessionRaiseOn420.timeout
httpd = http.server.HTTPServer(('localhost', 0), Handler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
httpd = http.server.HTTPServer(("localhost", 0), Handler)
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
httpd_thread.start()
try:
url = 'http://localhost:%s/' % httpd.server_port
site = brozzler.Site(None, {'seed': url})
url = "http://localhost:%s/" % httpd.server_port
site = brozzler.Site(None, {"seed": url})
brozzler.robots._SessionRaiseOn420.timeout = 2
assert brozzler.is_permitted_by_robots(site, url)
finally:
@ -139,20 +171,24 @@ def test_robots_socket_timeout():
httpd.server_close()
httpd_thread.join()
def test_robots_dns_failure():
# .invalid. is guaranteed nonexistent per rfc 6761
url = 'http://whatever.invalid./'
site = brozzler.Site(None, {'seed': url})
url = "http://whatever.invalid./"
site = brozzler.Site(None, {"seed": url})
assert brozzler.is_permitted_by_robots(site, url)
def test_robots_connection_failure():
# .invalid. is guaranteed nonexistent per rfc 6761
url = 'http://localhost:4/' # nobody listens on port 4
site = brozzler.Site(None, {'seed': url})
url = "http://localhost:4/" # nobody listens on port 4
site = brozzler.Site(None, {"seed": url})
assert brozzler.is_permitted_by_robots(site, url)
def test_scoping():
test_scope = yaml.safe_load('''
test_scope = yaml.safe_load(
"""
max_hops: 100
accepts:
- url_match: REGEX_MATCH
@ -169,40 +205,73 @@ blocks:
- domain: twitter.com
url_match: REGEX_MATCH
value: ^.*lang=(?!en).*$
''')
"""
)
site = brozzler.Site(None, {
'id': 1, 'seed': 'http://example.com/foo/bar?baz=quux#monkey',
'scope': test_scope})
page = brozzler.Page(None, {
'url': 'http://example.com/foo/bar?baz=quux#monkey',
'site_id': site.id})
site = brozzler.Site(
None,
{
"id": 1,
"seed": "http://example.com/foo/bar?baz=quux#monkey",
"scope": test_scope,
},
)
page = brozzler.Page(
None, {"url": "http://example.com/foo/bar?baz=quux#monkey", "site_id": site.id}
)
assert site.accept_reject_or_neither('http://example.com/foo/bar', page) is True
assert site.accept_reject_or_neither('http://example.com/foo/baz', page) is None
assert site.accept_reject_or_neither("http://example.com/foo/bar", page) is True
assert site.accept_reject_or_neither("http://example.com/foo/baz", page) is None
assert site.accept_reject_or_neither('http://foo.com/some.mp3', page) is None
assert site.accept_reject_or_neither('http://foo.com/blah/audio_file/some.mp3', page) is True
assert site.accept_reject_or_neither("http://foo.com/some.mp3", page) is None
assert (
site.accept_reject_or_neither("http://foo.com/blah/audio_file/some.mp3", page)
is True
)
assert site.accept_reject_or_neither('http://a.b.vimeocdn.com/blahblah', page) is True
assert site.accept_reject_or_neither('https://a.b.vimeocdn.com/blahblah', page) is None
assert (
site.accept_reject_or_neither("http://a.b.vimeocdn.com/blahblah", page) is True
)
assert (
site.accept_reject_or_neither("https://a.b.vimeocdn.com/blahblah", page) is None
)
assert site.accept_reject_or_neither('https://twitter.com/twit', page) is True
assert site.accept_reject_or_neither('https://twitter.com/twit?lang=en', page) is True
assert site.accept_reject_or_neither('https://twitter.com/twit?lang=es', page) is False
assert site.accept_reject_or_neither("https://twitter.com/twit", page) is True
assert (
site.accept_reject_or_neither("https://twitter.com/twit?lang=en", page) is True
)
assert (
site.accept_reject_or_neither("https://twitter.com/twit?lang=es", page) is False
)
assert site.accept_reject_or_neither('https://www.facebook.com/whatevz', page) is True
assert (
site.accept_reject_or_neither("https://www.facebook.com/whatevz", page) is True
)
assert (
site.accept_reject_or_neither(
"https://www.youtube.com/watch?v=dUIn5OAPS5s", page
)
is None
)
yt_user_page = brozzler.Page(
None,
{
"url": "https://www.youtube.com/user/SonoraSantaneraVEVO",
"site_id": site.id,
"hops_from_seed": 10,
},
)
assert (
site.accept_reject_or_neither(
"https://www.youtube.com/watch?v=dUIn5OAPS5s", yt_user_page
)
is True
)
assert site.accept_reject_or_neither(
'https://www.youtube.com/watch?v=dUIn5OAPS5s', page) is None
yt_user_page = brozzler.Page(None, {
'url': 'https://www.youtube.com/user/SonoraSantaneraVEVO',
'site_id': site.id, 'hops_from_seed': 10})
assert site.accept_reject_or_neither(
'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page) is True
def test_proxy_down():
'''
"""
Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.
This test needs to cover every possible fetch through the proxy other than
@ -211,24 +280,24 @@ def test_proxy_down():
Tests two different kinds of connection error:
- nothing listening on the port (nobody listens on port 4 :))
- port bound but not accepting connections
'''
"""
sock = socket.socket()
sock.bind(('127.0.0.1', 0))
for not_listening_proxy in (
'127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]):
worker = brozzler.BrozzlerWorker(
frontier=None, proxy=not_listening_proxy)
site = brozzler.Site(None, {
'id': str(uuid.uuid4()), 'seed': 'http://example.com/'})
page = brozzler.Page(None, {'url': 'http://example.com/'})
sock.bind(("127.0.0.1", 0))
for not_listening_proxy in ("127.0.0.1:4", "127.0.0.1:%s" % sock.getsockname()[1]):
worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
site = brozzler.Site(
None, {"id": str(uuid.uuid4()), "seed": "http://example.com/"}
)
page = brozzler.Page(None, {"url": "http://example.com/"})
# robots.txt fetch
with pytest.raises(brozzler.ProxyError):
brozzler.is_permitted_by_robots(
site, 'http://example.com/', proxy=not_listening_proxy)
site, "http://example.com/", proxy=not_listening_proxy
)
# youtube-dl fetch
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
with pytest.raises(brozzler.ProxyError):
brozzler.ydl.do_youtube_dl(worker, site, page)
@ -239,47 +308,58 @@ def test_proxy_down():
# WARCPROX_WRITE_RECORD
with pytest.raises(brozzler.ProxyError):
worker._warcprox_write_record(
warcprox_address=not_listening_proxy,
url='test://proxy_down/warcprox_write_record',
warc_type='metadata',
content_type='text/plain',
payload=b'''payload doesn't matter here''')
warcprox_address=not_listening_proxy,
url="test://proxy_down/warcprox_write_record",
warc_type="metadata",
content_type="text/plain",
payload=b"""payload doesn't matter here""",
)
def test_start_stop_backwards_compat():
site = brozzler.Site(None, {'seed': 'http://example.com/'})
site = brozzler.Site(None, {"seed": "http://example.com/"})
assert len(site.starts_and_stops) == 1
assert site.starts_and_stops[0]['start']
assert site.starts_and_stops[0]['stop'] is None
assert not 'start_time' in site
assert site.starts_and_stops[0]["start"]
assert site.starts_and_stops[0]["stop"] is None
assert not "start_time" in site
site = brozzler.Site(None, {
'seed': 'http://example.com/',
'start_time': datetime.datetime(2017,1,1)})
site = brozzler.Site(
None,
{"seed": "http://example.com/", "start_time": datetime.datetime(2017, 1, 1)},
)
assert len(site.starts_and_stops) == 1
assert site.starts_and_stops[0]['start'] == datetime.datetime(2017, 1, 1)
assert site.starts_and_stops[0]['stop'] is None
assert not 'start_time' in site
assert site.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
assert site.starts_and_stops[0]["stop"] is None
assert not "start_time" in site
job = brozzler.Job(None, {'seeds': [{'url':'https://example.com/'}]})
assert job.starts_and_stops[0]['start']
assert job.starts_and_stops[0]['stop'] is None
assert not 'started' in job
assert not 'finished' in job
job = brozzler.Job(None, {"seeds": [{"url": "https://example.com/"}]})
assert job.starts_and_stops[0]["start"]
assert job.starts_and_stops[0]["stop"] is None
assert not "started" in job
assert not "finished" in job
job = brozzler.Job(
None,
{
"seeds": [{"url": "https://example.com/"}],
"started": datetime.datetime(2017, 1, 1),
"finished": datetime.datetime(2017, 1, 2),
},
)
assert job.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
assert job.starts_and_stops[0]["stop"] == datetime.datetime(2017, 1, 2)
assert not "started" in job
assert not "finished" in job
job = brozzler.Job(None, {
'seeds': [{'url':'https://example.com/'}],
'started': datetime.datetime(2017, 1, 1),
'finished': datetime.datetime(2017, 1, 2)})
assert job.starts_and_stops[0]['start'] == datetime.datetime(2017, 1, 1)
assert job.starts_and_stops[0]['stop'] == datetime.datetime(2017, 1, 2)
assert not 'started' in job
assert not 'finished' in job
class Exception1(Exception):
pass
class Exception2(Exception):
pass
def test_thread_raise_not_accept():
def never_accept():
try:
@ -297,6 +377,7 @@ def test_thread_raise_not_accept():
th.join()
assert thread_caught_exception is None
def test_thread_raise_immediate():
def accept_immediately():
try:
@ -317,13 +398,17 @@ def test_thread_raise_immediate():
assert isinstance(thread_caught_exception, Exception1)
assert time.time() - start < 1.0
def test_thread_raise_safe_exit():
def delay_context_exit():
gate = brozzler.thread_accept_exceptions()
orig_exit = type(gate).__exit__
try:
type(gate).__exit__ = lambda self, et, ev, t: (
brozzler.sleep(2), orig_exit(self, et, ev, t), False)[-1]
brozzler.sleep(2),
orig_exit(self, et, ev, t),
False,
)[-1]
with brozzler.thread_accept_exceptions() as gate:
brozzler.sleep(2)
except Exception as e:
@@ -345,6 +430,7 @@ def test_thread_raise_safe_exit():
assert thread_caught_exception
assert isinstance(thread_caught_exception, Exception1)
def test_thread_raise_pending_exception():
def accept_eventually():
try:
@@ -365,16 +451,17 @@ def test_thread_raise_pending_exception():
assert isinstance(thread_caught_exception, Exception1)
assert time.time() - start > 1.0
def test_thread_raise_second_with_block():
def two_with_blocks():
try:
with brozzler.thread_accept_exceptions():
time.sleep(2)
return # test fails
return # test fails
except Exception1 as e:
pass
except:
return # fail test
return # fail test
try:
with brozzler.thread_accept_exceptions():
@@ -393,52 +480,79 @@ def test_thread_raise_second_with_block():
th.join()
assert isinstance(thread_caught_exception, Exception2)
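
The thread_raise tests above exercise brozzler's cooperative cross-thread exception delivery: an exception lands in a target thread only while that thread is inside a brozzler.thread_accept_exceptions() block. A minimal usage sketch, reusing Exception1 from above and assuming brozzler.thread_raise(thread, exc_type) as the sending side:

    import threading
    import time
    import brozzler

    caught = []

    def worker():
        try:
            with brozzler.thread_accept_exceptions():
                brozzler.sleep(30)  # brozzler's interruptible sleep, as used above
        except Exception1 as e:
            caught.append(e)  # delivered because the accept block was active

    th = threading.Thread(target=worker)
    th.start()
    time.sleep(0.5)  # give the worker time to enter the accept block
    brozzler.thread_raise(th, Exception1)  # assumed signature: (thread, exception class)
    th.join()
    assert isinstance(caught[0], Exception1)
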
def test_needs_browsing():
# only one test case here right now, which exposed a bug
class ConvenientHeaders(http.client.HTTPMessage):
def __init__(self, headers):
http.client.HTTPMessage.__init__(self)
for (k, v) in headers.items():
for k, v in headers.items():
self.add_header(k, v)
page = brozzler.Page(None, {
'url':'http://example.com/a'})
page = brozzler.Page(None, {"url": "http://example.com/a"})
spy = brozzler.ydl.YoutubeDLSpy()
spy.fetches.append({
'url': 'http://example.com/a',
'method': 'HEAD',
'response_code': 301,
'response_headers': ConvenientHeaders({'Location': '/b'})})
spy.fetches.append({
'url': 'http://example.com/b',
'method': 'GET',
'response_code': 200,
'response_headers': ConvenientHeaders({
'Content-Type': 'application/pdf'})})
spy.fetches.append(
{
"url": "http://example.com/a",
"method": "HEAD",
"response_code": 301,
"response_headers": ConvenientHeaders({"Location": "/b"}),
}
)
spy.fetches.append(
{
"url": "http://example.com/b",
"method": "GET",
"response_code": 200,
"response_headers": ConvenientHeaders({"Content-Type": "application/pdf"}),
}
)
assert not brozzler.worker.BrozzlerWorker._needs_browsing(None, page, spy.fetches)
assert not brozzler.worker.BrozzlerWorker._needs_browsing(
None, page, spy.fetches)
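
test_needs_browsing feeds _needs_browsing a redirect chain that ends in a PDF and expects False. As a rough illustration of that kind of content-type check (an assumption for clarity, not brozzler's actual implementation):

    def looks_like_it_needs_browsing(fetches):
        # If the probe ended on an HTML document, hand the page to a real
        # browser; for a PDF or other non-HTML response the plain fetch is enough.
        if not fetches:
            return True
        content_type = fetches[-1]["response_headers"].get_content_type()
        return content_type in ("text/html", "application/xhtml+xml")

    # With the fetches built above (301, then application/pdf), this returns False.
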
def test_seed_redirect():
site = brozzler.Site(None, {'seed': 'http://foo.com/'})
site.note_seed_redirect('https://foo.com/a/b/c')
assert site.scope == {'accepts': [
{'ssurt': 'com,foo,//http:/',},
{'ssurt': 'com,foo,//https:/',}]}
site = brozzler.Site(None, {"seed": "http://foo.com/"})
site.note_seed_redirect("https://foo.com/a/b/c")
assert site.scope == {
"accepts": [
{
"ssurt": "com,foo,//http:/",
},
{
"ssurt": "com,foo,//https:/",
},
]
}
site = brozzler.Site(None, {'seed': 'https://foo.com/'})
site.note_seed_redirect('http://foo.com/a/b/c')
assert site.scope == {'accepts': [
{'ssurt': 'com,foo,//https:/',},
{'ssurt': 'com,foo,//http:/',}]}
site = brozzler.Site(None, {"seed": "https://foo.com/"})
site.note_seed_redirect("http://foo.com/a/b/c")
assert site.scope == {
"accepts": [
{
"ssurt": "com,foo,//https:/",
},
{
"ssurt": "com,foo,//http:/",
},
]
}
site = brozzler.Site(None, {"seed": "http://foo.com/"})
site.note_seed_redirect("https://bar.com/a/b/c")
assert site.scope == {
"accepts": [
{
"ssurt": "com,foo,//http:/",
},
{
"ssurt": "com,bar,//https:/a/b/c",
},
]
}
site = brozzler.Site(None, {'seed': 'http://foo.com/'})
site.note_seed_redirect('https://bar.com/a/b/c')
assert site.scope == {'accepts': [
{'ssurt': 'com,foo,//http:/',},
{'ssurt': 'com,bar,//https:/a/b/c',}]}
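
The ssurt values asserted here come from urlcanon, the canonicalization library brozzler's scoping is built on. One of them can be reproduced directly, assuming urlcanon's semantic canonicalizer and ParsedUrl.ssurt():

    import urlcanon

    url = urlcanon.semantic(urlcanon.parse_url("https://bar.com/a/b/c"))
    print(url.ssurt())  # expected: b'com,bar,//https:/a/b/c', matching the accept rule above
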
def test_limit_failures():
page = mock.Mock()
@@ -446,9 +560,9 @@ def test_limit_failures():
page.brozzle_count = 0
site = mock.Mock()
site.status = 'ACTIVE'
site.status = "ACTIVE"
site.active_brozzling_time = 0
site.starts_and_stops = [{'start':datetime.datetime.utcnow()}]
site.starts_and_stops = [{"start": datetime.datetime.utcnow()}]
rr = mock.Mock()
rr.servers = [mock.Mock()]
@@ -456,11 +570,12 @@ def test_limit_failures():
rr.db_list = mock.Mock(return_value=rethink_query)
rr.table_list = mock.Mock(return_value=rethink_query)
rr.table = mock.Mock(
return_value=mock.Mock(
between=mock.Mock(
return_value=mock.Mock(
limit=mock.Mock(
return_value=rethink_query)))))
return_value=mock.Mock(
between=mock.Mock(
return_value=mock.Mock(limit=mock.Mock(return_value=rethink_query))
)
)
)
assert rr.table().between().limit().run() == []
frontier = brozzler.RethinkDbFrontier(rr)
frontier.enforce_time_limit = mock.Mock()
@@ -475,20 +590,19 @@ def test_limit_failures():
assert page.failed_attempts is None
assert page.brozzle_count == 0
assert site.status == 'ACTIVE'
assert site.status == "ACTIVE"
worker.brozzle_site(browser, site)
assert page.failed_attempts == 1
assert page.brozzle_count == 0
assert site.status == 'ACTIVE'
assert site.status == "ACTIVE"
worker.brozzle_site(browser, site)
assert page.failed_attempts == 2
assert page.brozzle_count == 0
assert site.status == 'ACTIVE'
assert site.status == "ACTIVE"
worker.brozzle_site(browser, site)
assert page.failed_attempts == 3
assert page.brozzle_count == 1
assert site.status == 'FINISHED'
assert site.status == "FINISHED"
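
The nested mock.Mock(return_value=...) chain earlier in this test stubs RethinkDB's fluent query interface so that rr.table().between().limit().run() returns an empty batch. A MagicMock expresses the same stub in one line (an alternative sketch, not part of this change):

    from unittest import mock

    rr = mock.MagicMock()
    rr.table.return_value.between.return_value.limit.return_value.run.return_value = []
    assert rr.table().between().limit().run() == []
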

vagrant-brozzler-new-job.py
View File

@@ -1,5 +1,5 @@
#!/usr/bin/env python
'''
"""
vagrant-brozzler-new-job.py - runs brozzler-new-job inside the vagrant vm to
queue a job for your vagrant brozzler deployment.
@@ -20,30 +20,39 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import sys
import os
import argparse
import subprocess
def main(argv=[]):
arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0]))
arg_parser.add_argument(
'job_conf_file', metavar='JOB_CONF_FILE',
help='brozzler job configuration file in yaml')
"job_conf_file",
metavar="JOB_CONF_FILE",
help="brozzler job configuration file in yaml",
)
args = arg_parser.parse_args(args=argv[1:])
# cd to path with Vagrantfile so "vagrant ssh" knows what to do
os.chdir(os.path.dirname(__file__))
with open(args.job_conf_file, 'rb') as f:
subprocess.call([
'vagrant', 'ssh', '--',
'f=`mktemp` && cat > $f && '
'/home/vagrant/brozzler-ve3/bin/python '
'/home/vagrant/brozzler-ve3/bin/brozzler-new-job $f'],
stdin=f)
with open(args.job_conf_file, "rb") as f:
subprocess.call(
[
"vagrant",
"ssh",
"--",
"f=`mktemp` && cat > $f && "
"/home/vagrant/brozzler-ve3/bin/python "
"/home/vagrant/brozzler-ve3/bin/brozzler-new-job $f",
],
stdin=f,
)
if __name__ == '__main__':
if __name__ == "__main__":
main(sys.argv)

vagrant-brozzler-new-site.py
View File

@@ -1,5 +1,5 @@
#!/usr/bin/env python
'''
"""
vagrant-brozzler-new-site.py - runs brozzler-new-site inside the vagrant vm to
queue a site for your vagrant brozzler deployment.
@@ -23,61 +23,69 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import sys
import os
import argparse
import subprocess
try:
from shlex import quote
except:
from pipes import quote
def main(argv=[]):
arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0]))
arg_parser.add_argument('seed', metavar='SEED', help='seed url')
arg_parser.add_argument("seed", metavar="SEED", help="seed url")
arg_parser.add_argument(
'--time-limit', dest='time_limit', default=None,
help='time limit in seconds for this site')
"--time-limit",
dest="time_limit",
default=None,
help="time limit in seconds for this site",
)
arg_parser.add_argument(
'--ignore-robots', dest='ignore_robots', action='store_true',
help='ignore robots.txt for this site')
"--ignore-robots",
dest="ignore_robots",
action="store_true",
help="ignore robots.txt for this site",
)
arg_parser.add_argument(
'--warcprox-meta', dest='warcprox_meta',
help=(
'Warcprox-Meta http request header to send with each request; '
'must be a json blob, ignored unless warcprox features are '
'enabled'))
arg_parser.add_argument(
'-q', '--quiet', dest='quiet', action='store_true')
arg_parser.add_argument(
'-v', '--verbose', dest='verbose', action='store_true')
"--warcprox-meta",
dest="warcprox_meta",
help=(
"Warcprox-Meta http request header to send with each request; "
"must be a json blob, ignored unless warcprox features are "
"enabled"
),
)
arg_parser.add_argument("-q", "--quiet", dest="quiet", action="store_true")
arg_parser.add_argument("-v", "--verbose", dest="verbose", action="store_true")
args = arg_parser.parse_args(args=argv[1:])
options = []
if args.time_limit:
options.append('--time-limit=%s' % args.time_limit)
options.append("--time-limit=%s" % args.time_limit)
if args.ignore_robots:
options.append('--ignore-robots')
options.append("--ignore-robots")
if args.warcprox_meta:
# I think this shell escaping is correct?
options.append(
'--warcprox-meta=%s' % quote(args.warcprox_meta))
options.append("--warcprox-meta=%s" % quote(args.warcprox_meta))
if args.quiet:
options.append('--quiet')
options.append("--quiet")
if args.verbose:
options.append('--verbose')
options.append("--verbose")
# cd to path with Vagrantfile so "vagrant ssh" knows what to do
os.chdir(os.path.dirname(__file__))
cmd = (
'/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site '
'%s %s') % (' '.join(options), args.seed)
subprocess.call(['vagrant', 'ssh', '--', cmd])
"/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site " "%s %s"
) % (" ".join(options), args.seed)
subprocess.call(["vagrant", "ssh", "--", cmd])
if __name__ == '__main__':
if __name__ == "__main__":
main(sys.argv)
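
Because the whole remote command is handed to "vagrant ssh --" as a single string, the --warcprox-meta JSON has to survive a round of shell parsing inside the VM; that is what the shlex.quote fallback above is for. A small sketch of the effect (the header value is a made-up example):

    from shlex import quote

    warcprox_meta = '{"warc-prefix": "test-crawl"}'
    option = "--warcprox-meta=%s" % quote(warcprox_meta)
    # quote() wraps the JSON in single quotes, so the remote shell passes the
    # blob through to brozzler-new-site as a single argument.
    print(option)  # --warcprox-meta='{"warc-prefix": "test-crawl"}'
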