Merge branch 'blacked' into qa

This commit is contained in:
Barbara Miller 2024-02-08 12:31:39 -08:00
commit 8f14dc1aec
23 changed files with 4048 additions and 2796 deletions

31
.github/workflows/python-formatting.yml vendored Normal file
View file

@ -0,0 +1,31 @@
name: Python Formatting Check
on:
push:
branches:
- main
- master
pull_request:
branches:
- main
- master
jobs:
formatting:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Python 3.8
uses: actions/setup-python@v5
with:
python-version: '3.8'
- name: Create virtual environment
run: python -m venv venv
- name: Install black
run: |
./venv/bin/pip install --upgrade pip
./venv/bin/pip install black
- name: Run formatting check
run: make ck-format

2
.gitignore vendored
View file

@ -2,3 +2,5 @@
*.diff *.diff
.*.sw* .*.sw*
/brozzler.egg-info/ /brozzler.egg-info/
venv
.idea

7
Makefile Normal file
View file

@ -0,0 +1,7 @@
.PHONY: format
format:
venv/bin/black -t py35 -t py36 -t py37 -t py38 -t py39 -t py310 -t py311 -t py312 .
.PHONY: ck-format
ck-format:
venv/bin/black --check .

View file

@ -19,33 +19,41 @@ limitations under the License.
import logging import logging
from pkg_resources import get_distribution as _get_distribution from pkg_resources import get_distribution as _get_distribution
__version__ = _get_distribution('brozzler').version
__version__ = _get_distribution("brozzler").version
class ShutdownRequested(Exception): class ShutdownRequested(Exception):
pass pass
class NothingToClaim(Exception): class NothingToClaim(Exception):
pass pass
class CrawlStopped(Exception): class CrawlStopped(Exception):
pass pass
class PageInterstitialShown(Exception): class PageInterstitialShown(Exception):
pass pass
class ProxyError(Exception): class ProxyError(Exception):
pass pass
class ReachedTimeLimit(Exception): class ReachedTimeLimit(Exception):
pass pass
class ReachedLimit(Exception): class ReachedLimit(Exception):
def __init__(self, http_error=None, warcprox_meta=None, http_payload=None): def __init__(self, http_error=None, warcprox_meta=None, http_payload=None):
import json import json
if http_error: if http_error:
if "warcprox-meta" in http_error.headers: if "warcprox-meta" in http_error.headers:
self.warcprox_meta = json.loads( self.warcprox_meta = json.loads(http_error.headers["warcprox-meta"])
http_error.headers["warcprox-meta"])
else: else:
self.warcprox_meta = None self.warcprox_meta = None
self.http_payload = http_error.read() self.http_payload = http_error.read()
@ -55,28 +63,39 @@ class ReachedLimit(Exception):
def __repr__(self): def __repr__(self):
return "ReachedLimit(warcprox_meta=%r,http_payload=%r)" % ( return "ReachedLimit(warcprox_meta=%r,http_payload=%r)" % (
self.warcprox_meta if hasattr(self, 'warcprox_meta') else None, self.warcprox_meta if hasattr(self, "warcprox_meta") else None,
self.http_payload if hasattr(self, 'http_payload') else None) self.http_payload if hasattr(self, "http_payload") else None,
)
def __str__(self): def __str__(self):
return self.__repr__() return self.__repr__()
# monkey-patch log levels TRACE and NOTICE # monkey-patch log levels TRACE and NOTICE
logging.TRACE = (logging.NOTSET + logging.DEBUG) // 2 logging.TRACE = (logging.NOTSET + logging.DEBUG) // 2
def _logger_trace(self, msg, *args, **kwargs): def _logger_trace(self, msg, *args, **kwargs):
if self.isEnabledFor(logging.TRACE): if self.isEnabledFor(logging.TRACE):
self._log(logging.TRACE, msg, args, **kwargs) self._log(logging.TRACE, msg, args, **kwargs)
logging.Logger.trace = _logger_trace logging.Logger.trace = _logger_trace
logging.trace = logging.root.trace logging.trace = logging.root.trace
logging.addLevelName(logging.TRACE, 'TRACE') logging.addLevelName(logging.TRACE, "TRACE")
logging.NOTICE = (logging.INFO + logging.WARN) // 2 logging.NOTICE = (logging.INFO + logging.WARN) // 2
def _logger_notice(self, msg, *args, **kwargs): def _logger_notice(self, msg, *args, **kwargs):
if self.isEnabledFor(logging.NOTICE): if self.isEnabledFor(logging.NOTICE):
self._log(logging.NOTICE, msg, args, **kwargs) self._log(logging.NOTICE, msg, args, **kwargs)
logging.Logger.notice = _logger_notice logging.Logger.notice = _logger_notice
logging.notice = logging.root.notice logging.notice = logging.root.notice
logging.addLevelName(logging.NOTICE, 'NOTICE') logging.addLevelName(logging.NOTICE, "NOTICE")
# see https://github.com/internetarchive/brozzler/issues/91 # see https://github.com/internetarchive/brozzler/issues/91
def _logging_handler_handle(self, record): def _logging_handler_handle(self, record):
@ -91,9 +110,13 @@ def _logging_handler_handle(self, record):
except: except:
pass pass
return rv return rv
logging.Handler.handle = _logging_handler_handle logging.Handler.handle = _logging_handler_handle
_behaviors = None _behaviors = None
def behaviors(behaviors_dir=None): def behaviors(behaviors_dir=None):
"""Return list of JS behaviors loaded from YAML file. """Return list of JS behaviors loaded from YAML file.
@ -101,35 +124,43 @@ def behaviors(behaviors_dir=None):
`js-templates/`. Defaults to brozzler dir. `js-templates/`. Defaults to brozzler dir.
""" """
import os, yaml, string import os, yaml, string
global _behaviors global _behaviors
if _behaviors is None: if _behaviors is None:
d = behaviors_dir or os.path.dirname(__file__) d = behaviors_dir or os.path.dirname(__file__)
behaviors_yaml = os.path.join(d, 'behaviors.yaml') behaviors_yaml = os.path.join(d, "behaviors.yaml")
with open(behaviors_yaml) as fin: with open(behaviors_yaml) as fin:
_behaviors = yaml.safe_load(fin) _behaviors = yaml.safe_load(fin)
return _behaviors return _behaviors
def behavior_script(url, template_parameters=None, behaviors_dir=None): def behavior_script(url, template_parameters=None, behaviors_dir=None):
''' """
Returns the javascript behavior string populated with template_parameters. Returns the javascript behavior string populated with template_parameters.
''' """
import re, logging, json import re, logging, json
for behavior in behaviors(behaviors_dir=behaviors_dir): for behavior in behaviors(behaviors_dir=behaviors_dir):
if re.match(behavior['url_regex'], url): if re.match(behavior["url_regex"], url):
parameters = dict() parameters = dict()
if 'default_parameters' in behavior: if "default_parameters" in behavior:
parameters.update(behavior['default_parameters']) parameters.update(behavior["default_parameters"])
if template_parameters: if template_parameters:
parameters.update(template_parameters) parameters.update(template_parameters)
template = jinja2_environment(behaviors_dir).get_template( template = jinja2_environment(behaviors_dir).get_template(
behavior['behavior_js_template']) behavior["behavior_js_template"]
)
script = template.render(parameters) script = template.render(parameters)
logging.info( logging.info(
'using template=%r populated with parameters=%r for %r', "using template=%r populated with parameters=%r for %r",
behavior['behavior_js_template'], json.dumps(parameters), url) behavior["behavior_js_template"],
json.dumps(parameters),
url,
)
return script return script
return None return None
class ThreadExceptionGate: class ThreadExceptionGate:
logger = logging.getLogger(__module__ + "." + __qualname__) logger = logging.getLogger(__module__ + "." + __qualname__)
@ -142,8 +173,7 @@ class ThreadExceptionGate:
def __enter__(self): def __enter__(self):
assert self.thread == threading.current_thread() assert self.thread == threading.current_thread()
if self.pending_exception: if self.pending_exception:
self.logger.info( self.logger.info("raising pending exception %s", self.pending_exception)
'raising pending exception %s', self.pending_exception)
tmp = self.pending_exception tmp = self.pending_exception
self.pending_exception = None self.pending_exception = None
raise tmp raise tmp
@ -160,19 +190,26 @@ class ThreadExceptionGate:
with self.lock: with self.lock:
if self.pending_exception: if self.pending_exception:
self.logger.warning( self.logger.warning(
'%r already pending for thread %r, discarding %r', "%r already pending for thread %r, discarding %r",
self.pending_exception, self.thread, e) self.pending_exception,
self.thread,
e,
)
else: else:
self.pending_exception = e self.pending_exception = e
def __repr__(self): def __repr__(self):
return '<ThreadExceptionGate(%s)>' % self.thread return "<ThreadExceptionGate(%s)>" % self.thread
import threading import threading
_thread_exception_gates = {} _thread_exception_gates = {}
_thread_exception_gates_lock = threading.Lock() _thread_exception_gates_lock = threading.Lock()
def thread_exception_gate(thread=None): def thread_exception_gate(thread=None):
''' """
Returns a `ThreadExceptionGate` for `thread` (current thread by default). Returns a `ThreadExceptionGate` for `thread` (current thread by default).
`ThreadExceptionGate` is a context manager which allows exceptions to be `ThreadExceptionGate` is a context manager which allows exceptions to be
@ -191,7 +228,7 @@ def thread_exception_gate(thread=None):
is queued, and raised immediately if and when the thread enters the is queued, and raised immediately if and when the thread enters the
context. Only one exception will be queued this way at a time, others are context. Only one exception will be queued this way at a time, others are
discarded. discarded.
''' """
if not thread: if not thread:
thread = threading.current_thread() thread = threading.current_thread()
@ -201,10 +238,12 @@ def thread_exception_gate(thread=None):
return _thread_exception_gates[thread] return _thread_exception_gates[thread]
thread_accept_exceptions = thread_exception_gate thread_accept_exceptions = thread_exception_gate
def thread_raise(thread, exctype): def thread_raise(thread, exctype):
''' """
Raises or queues the exception `exctype` for the thread `thread`. Raises or queues the exception `exctype` for the thread `thread`.
See the documentation on the function `thread_exception_gate()` for more See the documentation on the function `thread_exception_gate()` for more
@ -218,40 +257,43 @@ def thread_raise(thread, exctype):
Raises: Raises:
TypeError if `exctype` is not a class TypeError if `exctype` is not a class
ValueError, SystemError in case of unexpected problems ValueError, SystemError in case of unexpected problems
''' """
import ctypes, inspect, threading, logging import ctypes, inspect, threading, logging
if not inspect.isclass(exctype): if not inspect.isclass(exctype):
raise TypeError( raise TypeError(
'cannot raise %s, only exception types can be raised (not ' "cannot raise %s, only exception types can be raised (not "
'instances)' % exctype) "instances)" % exctype
)
gate = thread_exception_gate(thread) gate = thread_exception_gate(thread)
with gate.lock: with gate.lock:
if gate.ok_to_raise.is_set() and thread.is_alive(): if gate.ok_to_raise.is_set() and thread.is_alive():
gate.ok_to_raise.clear() gate.ok_to_raise.clear()
logging.info('raising %s in thread %s', exctype, thread) logging.info("raising %s in thread %s", exctype, thread)
res = ctypes.pythonapi.PyThreadState_SetAsyncExc( res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
ctypes.c_long(thread.ident), ctypes.py_object(exctype)) ctypes.c_long(thread.ident), ctypes.py_object(exctype)
)
if res == 0: if res == 0:
raise ValueError( raise ValueError("invalid thread id? thread.ident=%s" % thread.ident)
'invalid thread id? thread.ident=%s' % thread.ident)
elif res != 1: elif res != 1:
# if it returns a number greater than one, you're in trouble, # if it returns a number greater than one, you're in trouble,
# and you should call it again with exc=NULL to revert the effect # and you should call it again with exc=NULL to revert the effect
ctypes.pythonapi.PyThreadState_SetAsyncExc(thread.ident, 0) ctypes.pythonapi.PyThreadState_SetAsyncExc(thread.ident, 0)
raise SystemError('PyThreadState_SetAsyncExc failed') raise SystemError("PyThreadState_SetAsyncExc failed")
else: else:
logging.info('queueing %s for thread %s', exctype, thread) logging.info("queueing %s for thread %s", exctype, thread)
gate.queue_exception(exctype) gate.queue_exception(exctype)
def sleep(duration): def sleep(duration):
''' """
Sleeps for duration seconds in increments of 0.5 seconds. Sleeps for duration seconds in increments of 0.5 seconds.
Use this so that the sleep can be interrupted by thread_raise(). Use this so that the sleep can be interrupted by thread_raise().
''' """
import time import time
start = time.time() start = time.time()
while True: while True:
elapsed = time.time() - start elapsed = time.time() - start
@ -259,32 +301,41 @@ def sleep(duration):
break break
time.sleep(min(duration - elapsed, 0.5)) time.sleep(min(duration - elapsed, 0.5))
_jinja2_env = None _jinja2_env = None
def jinja2_environment(behaviors_dir=None): def jinja2_environment(behaviors_dir=None):
global _jinja2_env global _jinja2_env
if not _jinja2_env: if not _jinja2_env:
import os, jinja2, json import os, jinja2, json
if behaviors_dir: if behaviors_dir:
_loader = jinja2.FileSystemLoader(os.path.join(behaviors_dir, _loader = jinja2.FileSystemLoader(
'js-templates')) os.path.join(behaviors_dir, "js-templates")
)
else: else:
_loader=jinja2.PackageLoader('brozzler', 'js-templates') _loader = jinja2.PackageLoader("brozzler", "js-templates")
_jinja2_env = jinja2.Environment(loader=_loader, auto_reload=False) _jinja2_env = jinja2.Environment(loader=_loader, auto_reload=False)
_jinja2_env.filters['json'] = json.dumps _jinja2_env.filters["json"] = json.dumps
return _jinja2_env return _jinja2_env
import urlcanon import urlcanon
def _remove_query(url): def _remove_query(url):
url.question_mark = b'' url.question_mark = b""
url.query = b'' url.query = b""
# XXX chop off path after last slash?? # XXX chop off path after last slash??
site_surt_canon = urlcanon.Canonicalizer( site_surt_canon = urlcanon.Canonicalizer(urlcanon.semantic.steps + [_remove_query])
urlcanon.semantic.steps + [_remove_query])
import doublethink import doublethink
import datetime import datetime
EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(
tzinfo=doublethink.UTC) EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=doublethink.UTC)
# we could make this configurable if there's a good reason # we could make this configurable if there's a good reason
MAX_PAGE_FAILURES = 3 MAX_PAGE_FAILURES = 3
@ -294,10 +345,31 @@ from brozzler.robots import is_permitted_by_robots
from brozzler.frontier import RethinkDbFrontier from brozzler.frontier import RethinkDbFrontier
from brozzler.browser import Browser, BrowserPool, BrowsingException from brozzler.browser import Browser, BrowserPool, BrowsingException
from brozzler.model import ( from brozzler.model import (
new_job, new_job_file, new_site, Job, Page, Site, InvalidJobConf) new_job,
new_job_file,
new_site,
Job,
Page,
Site,
InvalidJobConf,
)
from brozzler.cli import suggest_default_chrome_exe from brozzler.cli import suggest_default_chrome_exe
__all__ = ['Page', 'Site', 'BrozzlerWorker', 'is_permitted_by_robots', __all__ = [
'RethinkDbFrontier', 'Browser', 'BrowserPool', 'BrowsingException', "Page",
'new_job', 'new_site', 'Job', 'new_job_file', 'InvalidJobConf', "Site",
'sleep', 'thread_accept_exceptions', 'thread_raise'] "BrozzlerWorker",
"is_permitted_by_robots",
"RethinkDbFrontier",
"Browser",
"BrowserPool",
"BrowsingException",
"new_job",
"new_site",
"Job",
"new_job_file",
"InvalidJobConf",
"sleep",
"thread_accept_exceptions",
"thread_raise",
]

View file

@ -1,4 +1,4 @@
''' """
brozzler/browser.py - manages the browsers for brozzler brozzler/browser.py - manages the browsers for brozzler
Copyright (C) 2014-2023 Internet Archive Copyright (C) 2014-2023 Internet Archive
@ -14,7 +14,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
''' """
import logging import logging
import time import time
@ -33,30 +33,35 @@ from brozzler.chrome import Chrome
import socket import socket
import urlcanon import urlcanon
class BrowsingException(Exception): class BrowsingException(Exception):
pass pass
class NoBrowsersAvailable(Exception): class NoBrowsersAvailable(Exception):
pass pass
class BrowsingTimeout(BrowsingException): class BrowsingTimeout(BrowsingException):
pass pass
class BrowserPool: class BrowserPool:
''' """
Manages pool of browsers. Automatically chooses available port for the Manages pool of browsers. Automatically chooses available port for the
debugging protocol. debugging protocol.
''' """
logger = logging.getLogger(__module__ + '.' + __qualname__)
logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self, size=3, **kwargs): def __init__(self, size=3, **kwargs):
''' """
Initializes the pool. Initializes the pool.
Args: Args:
size: size of pool (default 3) size: size of pool (default 3)
**kwargs: arguments for Browser(...) **kwargs: arguments for Browser(...)
''' """
self.size = size self.size = size
self.kwargs = kwargs self.kwargs = kwargs
self._in_use = set() self._in_use = set()
@ -65,7 +70,7 @@ class BrowserPool:
def _fresh_browser(self): def _fresh_browser(self):
# choose available port # choose available port
sock = socket.socket() sock = socket.socket()
sock.bind(('0.0.0.0', 0)) sock.bind(("0.0.0.0", 0))
port = sock.getsockname()[1] port = sock.getsockname()[1]
sock.close() sock.close()
@ -73,12 +78,12 @@ class BrowserPool:
return browser return browser
def acquire_multi(self, n=1): def acquire_multi(self, n=1):
''' """
Returns a list of up to `n` browsers. Returns a list of up to `n` browsers.
Raises: Raises:
NoBrowsersAvailable if none available NoBrowsersAvailable if none available
''' """
browsers = [] browsers = []
with self._lock: with self._lock:
if len(self._in_use) >= self.size: if len(self._in_use) >= self.size:
@ -90,7 +95,7 @@ class BrowserPool:
return browsers return browsers
def acquire(self): def acquire(self):
''' """
Returns an available instance. Returns an available instance.
Returns: Returns:
@ -98,7 +103,7 @@ class BrowserPool:
Raises: Raises:
NoBrowsersAvailable if none available NoBrowsersAvailable if none available
''' """
with self._lock: with self._lock:
if len(self._in_use) >= self.size: if len(self._in_use) >= self.size:
raise NoBrowsersAvailable raise NoBrowsersAvailable
@ -120,8 +125,8 @@ class BrowserPool:
def shutdown_now(self): def shutdown_now(self):
self.logger.info( self.logger.info(
'shutting down browser pool (%s browsers in use)', "shutting down browser pool (%s browsers in use)", len(self._in_use)
len(self._in_use)) )
with self._lock: with self._lock:
for browser in self._in_use: for browser in self._in_use:
browser.stop() browser.stop()
@ -132,8 +137,9 @@ class BrowserPool:
def num_in_use(self): def num_in_use(self):
return len(self._in_use) return len(self._in_use)
class WebsockReceiverThread(threading.Thread): class WebsockReceiverThread(threading.Thread):
logger = logging.getLogger(__module__ + '.' + __qualname__) logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self, websock, name=None, daemon=True): def __init__(self, websock, name=None, daemon=True):
super().__init__(name=name, daemon=daemon) super().__init__(name=name, daemon=daemon)
@ -175,50 +181,54 @@ class WebsockReceiverThread(threading.Thread):
self.is_open = True self.is_open = True
def _on_error(self, websock, e): def _on_error(self, websock, e):
''' """
Raises BrowsingException in the thread that created this instance. Raises BrowsingException in the thread that created this instance.
''' """
if isinstance(e, ( if isinstance(
websocket.WebSocketConnectionClosedException, e, (websocket.WebSocketConnectionClosedException, ConnectionResetError)
ConnectionResetError)): ):
self.logger.error('websocket closed, did chrome die?') self.logger.error("websocket closed, did chrome die?")
else: else:
self.logger.error( self.logger.error("exception from websocket receiver thread", exc_info=1)
'exception from websocket receiver thread',
exc_info=1)
brozzler.thread_raise(self.calling_thread, BrowsingException) brozzler.thread_raise(self.calling_thread, BrowsingException)
def run(self): def run(self):
# ping_timeout is used as the timeout for the call to select.select() # ping_timeout is used as the timeout for the call to select.select()
# in addition to its documented purpose, and must have a value to avoid # in addition to its documented purpose, and must have a value to avoid
# hangs in certain situations # hangs in certain situations
self.websock.run_forever(sockopt=((socket.IPPROTO_TCP, socket.TCP_NODELAY, 1),), self.websock.run_forever(
ping_timeout=0.5) sockopt=((socket.IPPROTO_TCP, socket.TCP_NODELAY, 1),), ping_timeout=0.5
)
def _on_message(self, websock, message): def _on_message(self, websock, message):
try: try:
self._handle_message(websock, message) self._handle_message(websock, message)
except: except:
self.logger.error( self.logger.error(
'uncaught exception in _handle_message message=%s', "uncaught exception in _handle_message message=%s",
message, exc_info=True) message,
exc_info=True,
)
def _network_response_received(self, message): def _network_response_received(self, message):
status = message['params']['response'].get('status') status = message["params"]["response"].get("status")
if (status == 420 and 'Warcprox-Meta' in CaseInsensitiveDict( if status == 420 and "Warcprox-Meta" in CaseInsensitiveDict(
message['params']['response']['headers'])): message["params"]["response"]["headers"]
):
if not self.reached_limit: if not self.reached_limit:
warcprox_meta = json.loads(CaseInsensitiveDict( warcprox_meta = json.loads(
message['params']['response']['headers'])['Warcprox-Meta']) CaseInsensitiveDict(message["params"]["response"]["headers"])[
self.reached_limit = brozzler.ReachedLimit( "Warcprox-Meta"
warcprox_meta=warcprox_meta) ]
self.logger.info('reached limit %s', self.reached_limit) )
brozzler.thread_raise( self.reached_limit = brozzler.ReachedLimit(warcprox_meta=warcprox_meta)
self.calling_thread, brozzler.ReachedLimit) self.logger.info("reached limit %s", self.reached_limit)
brozzler.thread_raise(self.calling_thread, brozzler.ReachedLimit)
else: else:
self.logger.info( self.logger.info(
'reached limit but self.reached_limit is already set, ' "reached limit but self.reached_limit is already set, "
'assuming the calling thread is already handling this') "assuming the calling thread is already handling this"
)
if self.on_response: if self.on_response:
self.on_response(message) self.on_response(message)
@ -226,75 +236,92 @@ class WebsockReceiverThread(threading.Thread):
self.page_status = status self.page_status = status
def _javascript_dialog_opening(self, message): def _javascript_dialog_opening(self, message):
self.logger.info('javascript dialog opened: %s', message) self.logger.info("javascript dialog opened: %s", message)
if message['params']['type'] == 'alert': if message["params"]["type"] == "alert":
accept = True accept = True
else: else:
accept = False accept = False
self.websock.send( self.websock.send(
json.dumps(dict( json.dumps(
id=0, method='Page.handleJavaScriptDialog', dict(
params={'accept': accept}), separators=',:')) id=0,
method="Page.handleJavaScriptDialog",
params={"accept": accept},
),
separators=",:",
)
)
def _handle_message(self, websock, json_message): def _handle_message(self, websock, json_message):
message = json.loads(json_message) message = json.loads(json_message)
if 'method' in message: if "method" in message:
if message['method'] == 'Page.loadEventFired': if message["method"] == "Page.loadEventFired":
self.got_page_load_event = datetime.datetime.utcnow() self.got_page_load_event = datetime.datetime.utcnow()
elif message['method'] == 'Network.responseReceived': elif message["method"] == "Network.responseReceived":
self._network_response_received(message) self._network_response_received(message)
elif message['method'] == 'Network.requestWillBeSent': elif message["method"] == "Network.requestWillBeSent":
if self.on_request: if self.on_request:
self.on_request(message) self.on_request(message)
elif message['method'] == 'Page.interstitialShown': elif message["method"] == "Page.interstitialShown":
# AITFIVE-1529: handle http auth # AITFIVE-1529: handle http auth
# we should kill the browser when we receive Page.interstitialShown and # we should kill the browser when we receive Page.interstitialShown and
# consider the page finished, until this is fixed: # consider the page finished, until this is fixed:
# https://bugs.chromium.org/p/chromium/issues/detail?id=764505 # https://bugs.chromium.org/p/chromium/issues/detail?id=764505
self.logger.info('Page.interstialShown (likely unsupported http auth request)') self.logger.info(
brozzler.thread_raise(self.calling_thread, brozzler.PageInterstitialShown) "Page.interstialShown (likely unsupported http auth request)"
elif message['method'] == 'Inspector.targetCrashed': )
self.logger.error( brozzler.thread_raise(
'''chrome tab went "aw snap" or "he's dead jim"!''') self.calling_thread, brozzler.PageInterstitialShown
)
elif message["method"] == "Inspector.targetCrashed":
self.logger.error("""chrome tab went "aw snap" or "he's dead jim"!""")
brozzler.thread_raise(self.calling_thread, BrowsingException) brozzler.thread_raise(self.calling_thread, BrowsingException)
elif message['method'] == 'Console.messageAdded': elif message["method"] == "Console.messageAdded":
self.logger.debug( self.logger.debug(
'console.%s %s', message['params']['message']['level'], "console.%s %s",
message['params']['message']['text']) message["params"]["message"]["level"],
elif message['method'] == 'Runtime.exceptionThrown': message["params"]["message"]["text"],
self.logger.debug('uncaught exception: %s', message) )
elif message['method'] == 'Page.javascriptDialogOpening': elif message["method"] == "Runtime.exceptionThrown":
self.logger.debug("uncaught exception: %s", message)
elif message["method"] == "Page.javascriptDialogOpening":
self._javascript_dialog_opening(message) self._javascript_dialog_opening(message)
elif (message['method'] == 'Network.loadingFailed' elif (
and 'params' in message and 'errorText' in message['params'] message["method"] == "Network.loadingFailed"
and message['params']['errorText'] == 'net::ERR_PROXY_CONNECTION_FAILED'): and "params" in message
and "errorText" in message["params"]
and message["params"]["errorText"] == "net::ERR_PROXY_CONNECTION_FAILED"
):
brozzler.thread_raise(self.calling_thread, brozzler.ProxyError) brozzler.thread_raise(self.calling_thread, brozzler.ProxyError)
elif message['method'] == 'ServiceWorker.workerVersionUpdated': elif message["method"] == "ServiceWorker.workerVersionUpdated":
if self.on_service_worker_version_updated: if self.on_service_worker_version_updated:
self.on_service_worker_version_updated(message) self.on_service_worker_version_updated(message)
# else: # else:
# self.logger.debug("%s %s", message["method"], json_message) # self.logger.debug("%s %s", message["method"], json_message)
elif 'result' in message: elif "result" in message:
if message['id'] in self._result_messages: if message["id"] in self._result_messages:
self._result_messages[message['id']] = message self._result_messages[message["id"]] = message
# else: # else:
# self.logger.debug("%s", json_message) # self.logger.debug("%s", json_message)
# else: # else:
# self.logger.debug("%s", json_message) # self.logger.debug("%s", json_message)
class Browser: class Browser:
''' """
Manages an instance of Chrome for browsing pages. Manages an instance of Chrome for browsing pages.
''' """
logger = logging.getLogger(__module__ + '.' + __qualname__)
logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self, **kwargs): def __init__(self, **kwargs):
''' """
Initializes the Browser. Initializes the Browser.
Args: Args:
**kwargs: arguments for Chrome(...) **kwargs: arguments for Chrome(...)
''' """
self.chrome = Chrome(**kwargs) self.chrome = Chrome(**kwargs)
self.websock_url = None self.websock_url = None
self.websock = None self.websock = None
@ -311,9 +338,9 @@ class Browser:
self.stop() self.stop()
def _wait_for(self, callback, timeout=None): def _wait_for(self, callback, timeout=None):
''' """
Spins until callback() returns truthy. Spins until callback() returns truthy.
''' """
start = time.time() start = time.time()
while True: while True:
if callback(): if callback():
@ -321,112 +348,140 @@ class Browser:
elapsed = time.time() - start elapsed = time.time() - start
if timeout and elapsed > timeout: if timeout and elapsed > timeout:
raise BrowsingTimeout( raise BrowsingTimeout(
'timed out after %.1fs waiting for: %s' % ( "timed out after %.1fs waiting for: %s" % (elapsed, callback)
elapsed, callback)) )
brozzler.sleep(self._wait_interval) brozzler.sleep(self._wait_interval)
def send_to_chrome(self, suppress_logging=False, **kwargs): def send_to_chrome(self, suppress_logging=False, **kwargs):
msg_id = next(self._command_id) msg_id = next(self._command_id)
kwargs['id'] = msg_id kwargs["id"] = msg_id
msg = json.dumps(kwargs, separators=',:') msg = json.dumps(kwargs, separators=",:")
logging.log( logging.log(
logging.TRACE if suppress_logging else logging.DEBUG, logging.TRACE if suppress_logging else logging.DEBUG,
'sending message to %s: %s', self.websock, msg) "sending message to %s: %s",
self.websock,
msg,
)
self.websock.send(msg) self.websock.send(msg)
return msg_id return msg_id
def start(self, **kwargs): def start(self, **kwargs):
''' """
Starts chrome if it's not running. Starts chrome if it's not running.
Args: Args:
**kwargs: arguments for self.chrome.start(...) **kwargs: arguments for self.chrome.start(...)
''' """
if not self.is_running(): if not self.is_running():
self.websock_url = self.chrome.start(**kwargs) self.websock_url = self.chrome.start(**kwargs)
self.websock = websocket.WebSocketApp(self.websock_url) self.websock = websocket.WebSocketApp(self.websock_url)
self.websock_thread = WebsockReceiverThread( self.websock_thread = WebsockReceiverThread(
self.websock, name='WebsockThread:%s' % self.chrome.port) self.websock, name="WebsockThread:%s" % self.chrome.port
)
self.websock_thread.start() self.websock_thread.start()
self._wait_for(lambda: self.websock_thread.is_open, timeout=30) self._wait_for(lambda: self.websock_thread.is_open, timeout=30)
# tell browser to send us messages we're interested in # tell browser to send us messages we're interested in
self.send_to_chrome(method='Network.enable') self.send_to_chrome(method="Network.enable")
self.send_to_chrome(method='Page.enable') self.send_to_chrome(method="Page.enable")
# Enable Console & Runtime output only when debugging. # Enable Console & Runtime output only when debugging.
# After all, we just print these events with debug(), we don't use # After all, we just print these events with debug(), we don't use
# them in Brozzler logic. # them in Brozzler logic.
if self.logger.isEnabledFor(logging.DEBUG): if self.logger.isEnabledFor(logging.DEBUG):
self.send_to_chrome(method='Console.enable') self.send_to_chrome(method="Console.enable")
self.send_to_chrome(method='Runtime.enable') self.send_to_chrome(method="Runtime.enable")
self.send_to_chrome(method='ServiceWorker.enable') self.send_to_chrome(method="ServiceWorker.enable")
self.send_to_chrome(method='ServiceWorker.setForceUpdateOnPageLoad') self.send_to_chrome(method="ServiceWorker.setForceUpdateOnPageLoad")
# disable google analytics and amp analytics # disable google analytics and amp analytics
self.send_to_chrome( self.send_to_chrome(
method='Network.setBlockedURLs', method="Network.setBlockedURLs",
params={'urls': ['*google-analytics.com/analytics.js*', params={
'*google-analytics.com/ga.js*', "urls": [
'*google-analytics.com/ga_exp.js*', "*google-analytics.com/analytics.js*",
'*google-analytics.com/urchin.js*', "*google-analytics.com/ga.js*",
'*google-analytics.com/collect*', "*google-analytics.com/ga_exp.js*",
'*google-analytics.com/r/collect*', "*google-analytics.com/urchin.js*",
'*google-analytics.com/__utm.gif*', "*google-analytics.com/collect*",
'*google-analytics.com/gtm/js?*', "*google-analytics.com/r/collect*",
'*google-analytics.com/cx/api.js*', "*google-analytics.com/__utm.gif*",
'*cdn.ampproject.org/*/amp-analytics*.js']}) "*google-analytics.com/gtm/js?*",
"*google-analytics.com/cx/api.js*",
"*cdn.ampproject.org/*/amp-analytics*.js",
]
},
)
def stop(self): def stop(self):
''' """
Stops chrome if it's running. Stops chrome if it's running.
''' """
try: try:
if (self.websock and self.websock.sock if self.websock and self.websock.sock and self.websock.sock.connected:
and self.websock.sock.connected): self.logger.info("shutting down websocket connection")
self.logger.info('shutting down websocket connection')
try: try:
self.websock.close() self.websock.close()
except BaseException as e: except BaseException as e:
self.logger.error( self.logger.error(
'exception closing websocket %s - %s', "exception closing websocket %s - %s", self.websock, e
self.websock, e) )
self.chrome.stop() self.chrome.stop()
if self.websock_thread and ( if self.websock_thread and (
self.websock_thread != threading.current_thread()): self.websock_thread != threading.current_thread()
):
self.websock_thread.join(timeout=30) self.websock_thread.join(timeout=30)
if self.websock_thread.is_alive(): if self.websock_thread.is_alive():
self.logger.error( self.logger.error(
'%s still alive 30 seconds after closing %s, will ' "%s still alive 30 seconds after closing %s, will "
'forcefully nudge it again', self.websock_thread, "forcefully nudge it again",
self.websock) self.websock_thread,
self.websock,
)
self.websock.keep_running = False self.websock.keep_running = False
self.websock_thread.join(timeout=30) self.websock_thread.join(timeout=30)
if self.websock_thread.is_alive(): if self.websock_thread.is_alive():
self.logger.critical( self.logger.critical(
'%s still alive 60 seconds after closing %s', "%s still alive 60 seconds after closing %s",
self.websock_thread, self.websock) self.websock_thread,
self.websock,
)
self.websock_url = None self.websock_url = None
except: except:
self.logger.error('problem stopping', exc_info=True) self.logger.error("problem stopping", exc_info=True)
def is_running(self): def is_running(self):
return self.websock_url is not None return self.websock_url is not None
def browse_page( def browse_page(
self, page_url, extra_headers=None, self,
user_agent=None, behavior_parameters=None, behaviors_dir=None, page_url,
on_request=None, on_response=None, extra_headers=None,
on_service_worker_version_updated=None, on_screenshot=None, user_agent=None,
username=None, password=None, hashtags=None, behavior_parameters=None,
screenshot_full_page=False, skip_extract_outlinks=False, behaviors_dir=None,
skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False, on_request=None,
page_timeout=300, behavior_timeout=900, on_response=None,
extract_outlinks_timeout=60, download_throughput=-1, stealth=False): on_service_worker_version_updated=None,
''' on_screenshot=None,
username=None,
password=None,
hashtags=None,
screenshot_full_page=False,
skip_extract_outlinks=False,
skip_visit_hashtags=False,
skip_youtube_dl=False,
simpler404=False,
page_timeout=300,
behavior_timeout=900,
extract_outlinks_timeout=60,
download_throughput=-1,
stealth=False,
):
"""
Browses page in browser. Browses page in browser.
Browser should already be running, i.e. start() should have been Browser should already be running, i.e. start() should have been
@ -473,54 +528,60 @@ class Browser:
Raises: Raises:
brozzler.ProxyError: in case of proxy connection error brozzler.ProxyError: in case of proxy connection error
BrowsingException: if browsing the page fails in some other way BrowsingException: if browsing the page fails in some other way
''' """
if not self.is_running(): if not self.is_running():
raise BrowsingException('browser has not been started') raise BrowsingException("browser has not been started")
if self.is_browsing: if self.is_browsing:
raise BrowsingException('browser is already busy browsing a page') raise BrowsingException("browser is already busy browsing a page")
self.is_browsing = True self.is_browsing = True
if on_request: if on_request:
self.websock_thread.on_request = on_request self.websock_thread.on_request = on_request
if on_response: if on_response:
self.websock_thread.on_response = on_response self.websock_thread.on_response = on_response
if on_service_worker_version_updated: if on_service_worker_version_updated:
self.websock_thread.on_service_worker_version_updated = \ self.websock_thread.on_service_worker_version_updated = (
on_service_worker_version_updated on_service_worker_version_updated
)
try: try:
with brozzler.thread_accept_exceptions(): with brozzler.thread_accept_exceptions():
self.configure_browser( self.configure_browser(
extra_headers=extra_headers, extra_headers=extra_headers,
user_agent=user_agent, user_agent=user_agent,
download_throughput=download_throughput, download_throughput=download_throughput,
stealth=stealth) stealth=stealth,
)
self.navigate_to_page(page_url, timeout=page_timeout) self.navigate_to_page(page_url, timeout=page_timeout)
if password: if password:
self.try_login(username, password, timeout=page_timeout) self.try_login(username, password, timeout=page_timeout)
# if login redirected us, return to page_url # if login redirected us, return to page_url
if page_url != self.url().split('#')[0]: if page_url != self.url().split("#")[0]:
self.logger.debug( self.logger.debug(
'login navigated away from %s; returning!', "login navigated away from %s; returning!", page_url
page_url) )
self.navigate_to_page(page_url, timeout=page_timeout) self.navigate_to_page(page_url, timeout=page_timeout)
# If the target page HTTP status is 4xx/5xx, there is no point # If the target page HTTP status is 4xx/5xx, there is no point
# in running behaviors, screenshot, outlink and hashtag # in running behaviors, screenshot, outlink and hashtag
# extraction as we didn't get a valid page. # extraction as we didn't get a valid page.
# This is only enabled with option `simpler404`. # This is only enabled with option `simpler404`.
run_behaviors = True run_behaviors = True
if simpler404 and (self.websock_thread.page_status is None or if simpler404 and (
self.websock_thread.page_status >= 400): self.websock_thread.page_status is None
or self.websock_thread.page_status >= 400
):
run_behaviors = False run_behaviors = False
if run_behaviors and behavior_timeout > 0: if run_behaviors and behavior_timeout > 0:
behavior_script = brozzler.behavior_script( behavior_script = brozzler.behavior_script(
page_url, behavior_parameters, page_url, behavior_parameters, behaviors_dir=behaviors_dir
behaviors_dir=behaviors_dir) )
self.run_behavior(behavior_script, timeout=behavior_timeout) self.run_behavior(behavior_script, timeout=behavior_timeout)
final_page_url = self.url() final_page_url = self.url()
if on_screenshot: if on_screenshot:
if simpler404: if simpler404:
if self.websock_thread.page_status and \ if (
self.websock_thread.page_status < 400: self.websock_thread.page_status
and self.websock_thread.page_status < 400
):
self._try_screenshot(on_screenshot, screenshot_full_page) self._try_screenshot(on_screenshot, screenshot_full_page)
else: else:
self._try_screenshot(on_screenshot, screenshot_full_page) self._try_screenshot(on_screenshot, screenshot_full_page)
@ -528,9 +589,7 @@ class Browser:
if not run_behaviors or skip_extract_outlinks: if not run_behaviors or skip_extract_outlinks:
outlinks = [] outlinks = []
else: else:
outlinks = self.extract_outlinks( outlinks = self.extract_outlinks(timeout=extract_outlinks_timeout)
timeout=extract_outlinks_timeout
)
if run_behaviors and not skip_visit_hashtags: if run_behaviors and not skip_visit_hashtags:
self.visit_hashtags(final_page_url, hashtags, outlinks) self.visit_hashtags(final_page_url, hashtags, outlinks)
return final_page_url, outlinks return final_page_url, outlinks
@ -539,7 +598,7 @@ class Browser:
# more information, raise that one # more information, raise that one
raise self.websock_thread.reached_limit raise self.websock_thread.reached_limit
except websocket.WebSocketConnectionClosedException as e: except websocket.WebSocketConnectionClosedException as e:
self.logger.error('websocket closed, did chrome die?') self.logger.error("websocket closed, did chrome die?")
raise BrowsingException(e) raise BrowsingException(e)
finally: finally:
self.is_browsing = False self.is_browsing = False
@ -550,21 +609,24 @@ class Browser:
"""The browser instance must be scrolled to the top of the page before """The browser instance must be scrolled to the top of the page before
trying to get a screenshot. trying to get a screenshot.
""" """
self.send_to_chrome(method='Runtime.evaluate', suppress_logging=True, self.send_to_chrome(
params={'expression': 'window.scroll(0,0)'}) method="Runtime.evaluate",
suppress_logging=True,
params={"expression": "window.scroll(0,0)"},
)
for i in range(3): for i in range(3):
try: try:
jpeg_bytes = self.screenshot(full_page) jpeg_bytes = self.screenshot(full_page)
on_screenshot(jpeg_bytes) on_screenshot(jpeg_bytes)
return return
except BrowsingTimeout as e: except BrowsingTimeout as e:
logging.error('attempt %s/3: %s', i+1, e) logging.error("attempt %s/3: %s", i + 1, e)
def visit_hashtags(self, page_url, hashtags, outlinks): def visit_hashtags(self, page_url, hashtags, outlinks):
_hashtags = set(hashtags or []) _hashtags = set(hashtags or [])
for outlink in outlinks: for outlink in outlinks:
url = urlcanon.whatwg(outlink) url = urlcanon.whatwg(outlink)
hashtag = (url.hash_sign + url.fragment).decode('utf-8') hashtag = (url.hash_sign + url.fragment).decode("utf-8")
urlcanon.canon.remove_fragment(url) urlcanon.canon.remove_fragment(url)
if hashtag and str(url) == page_url: if hashtag and str(url) == page_url:
_hashtags.add(hashtag) _hashtags.add(hashtag)
@ -572,84 +634,85 @@ class Browser:
# out which hashtags were visited already and skip those # out which hashtags were visited already and skip those
for hashtag in _hashtags: for hashtag in _hashtags:
# navigate_to_hashtag (nothing to wait for so no timeout?) # navigate_to_hashtag (nothing to wait for so no timeout?)
self.logger.debug('navigating to hashtag %s', hashtag) self.logger.debug("navigating to hashtag %s", hashtag)
url = urlcanon.whatwg(page_url) url = urlcanon.whatwg(page_url)
url.hash_sign = b'#' url.hash_sign = b"#"
url.fragment = hashtag[1:].encode('utf-8') url.fragment = hashtag[1:].encode("utf-8")
self.send_to_chrome( self.send_to_chrome(method="Page.navigate", params={"url": str(url)})
method='Page.navigate', params={'url': str(url)})
time.sleep(5) # um.. wait for idleness or something? time.sleep(5) # um.. wait for idleness or something?
# take another screenshot? # take another screenshot?
# run behavior again with short timeout? # run behavior again with short timeout?
# retrieve outlinks again and append to list? # retrieve outlinks again and append to list?
def configure_browser(self, extra_headers=None, user_agent=None, def configure_browser(
download_throughput=-1, stealth=False): self, extra_headers=None, user_agent=None, download_throughput=-1, stealth=False
):
headers = extra_headers or {} headers = extra_headers or {}
headers['Accept-Encoding'] = 'gzip' # avoid encodings br, sdch headers["Accept-Encoding"] = "gzip" # avoid encodings br, sdch
self.websock_thread.expect_result(self._command_id.peek()) self.websock_thread.expect_result(self._command_id.peek())
msg_id = self.send_to_chrome( msg_id = self.send_to_chrome(
method='Network.setExtraHTTPHeaders', method="Network.setExtraHTTPHeaders", params={"headers": headers}
params={'headers': headers}) )
self._wait_for( self._wait_for(lambda: self.websock_thread.received_result(msg_id), timeout=10)
lambda: self.websock_thread.received_result(msg_id),
timeout=10)
if user_agent: if user_agent:
msg_id = self.send_to_chrome( msg_id = self.send_to_chrome(
method='Network.setUserAgentOverride', method="Network.setUserAgentOverride", params={"userAgent": user_agent}
params={'userAgent': user_agent}) )
if download_throughput > -1: if download_throughput > -1:
# traffic shaping already used by SPN2 to aid warcprox resilience # traffic shaping already used by SPN2 to aid warcprox resilience
# parameter value as bytes/second, or -1 to disable (default) # parameter value as bytes/second, or -1 to disable (default)
msg_id = self.send_to_chrome(method='Network.emulateNetworkConditions', msg_id = self.send_to_chrome(
params={'downloadThroughput': download_throughput}) method="Network.emulateNetworkConditions",
params={"downloadThroughput": download_throughput},
)
if stealth: if stealth:
self.websock_thread.expect_result(self._command_id.peek()) self.websock_thread.expect_result(self._command_id.peek())
js = brozzler.jinja2_environment().get_template('stealth.js').render() js = brozzler.jinja2_environment().get_template("stealth.js").render()
msg_id = self.send_to_chrome( msg_id = self.send_to_chrome(
method='Page.addScriptToEvaluateOnNewDocument', method="Page.addScriptToEvaluateOnNewDocument", params={"source": js}
params={'source': js}) )
self._wait_for( self._wait_for(
lambda: self.websock_thread.received_result(msg_id), lambda: self.websock_thread.received_result(msg_id), timeout=10
timeout=10) )
def navigate_to_page(self, page_url, timeout=300): def navigate_to_page(self, page_url, timeout=300):
self.logger.info('navigating to page %s', page_url) self.logger.info("navigating to page %s", page_url)
self.websock_thread.got_page_load_event = None self.websock_thread.got_page_load_event = None
self.websock_thread.page_status = None self.websock_thread.page_status = None
self.send_to_chrome(method='Page.navigate', params={'url': page_url}) self.send_to_chrome(method="Page.navigate", params={"url": page_url})
self._wait_for( self._wait_for(lambda: self.websock_thread.got_page_load_event, timeout=timeout)
lambda: self.websock_thread.got_page_load_event,
timeout=timeout)
def extract_outlinks(self, timeout=60): def extract_outlinks(self, timeout=60):
self.logger.info('extracting outlinks') self.logger.info("extracting outlinks")
self.websock_thread.expect_result(self._command_id.peek()) self.websock_thread.expect_result(self._command_id.peek())
js = brozzler.jinja2_environment().get_template( js = brozzler.jinja2_environment().get_template("extract-outlinks.js").render()
'extract-outlinks.js').render()
msg_id = self.send_to_chrome( msg_id = self.send_to_chrome(
method='Runtime.evaluate', params={'expression': js}) method="Runtime.evaluate", params={"expression": js}
)
self._wait_for( self._wait_for(
lambda: self.websock_thread.received_result(msg_id), lambda: self.websock_thread.received_result(msg_id), timeout=timeout
timeout=timeout) )
message = self.websock_thread.pop_result(msg_id) message = self.websock_thread.pop_result(msg_id)
if ('result' in message and 'result' in message['result'] if (
and 'value' in message['result']['result']): "result" in message
if message['result']['result']['value']: and "result" in message["result"]
and "value" in message["result"]["result"]
):
if message["result"]["result"]["value"]:
out = [] out = []
for link in message['result']['result']['value'].split('\n'): for link in message["result"]["result"]["value"].split("\n"):
try: try:
out.append(str(urlcanon.whatwg(link))) out.append(str(urlcanon.whatwg(link)))
except AddressValueError: except AddressValueError:
self.logger.warning('skip invalid outlink: %s', link) self.logger.warning("skip invalid outlink: %s", link)
return frozenset(out) return frozenset(out)
else: else:
# no links found # no links found
return frozenset() return frozenset()
else: else:
self.logger.error( self.logger.error(
'problem extracting outlinks, result message: %s', message) "problem extracting outlinks, result message: %s", message
)
return frozenset() return frozenset()
def screenshot(self, full_page=False, timeout=45): def screenshot(self, full_page=False, timeout=45):
@ -657,121 +720,141 @@ class Browser:
inspiration: inspiration:
https://github.com/GoogleChrome/puppeteer/blob/master/lib/Page.js#L898 https://github.com/GoogleChrome/puppeteer/blob/master/lib/Page.js#L898
""" """
self.logger.info('taking screenshot') self.logger.info("taking screenshot")
if full_page: if full_page:
self.websock_thread.expect_result(self._command_id.peek()) self.websock_thread.expect_result(self._command_id.peek())
msg_id = self.send_to_chrome(method='Page.getLayoutMetrics') msg_id = self.send_to_chrome(method="Page.getLayoutMetrics")
self._wait_for( self._wait_for(
lambda: self.websock_thread.received_result(msg_id), lambda: self.websock_thread.received_result(msg_id), timeout=timeout
timeout=timeout) )
message = self.websock_thread.pop_result(msg_id) message = self.websock_thread.pop_result(msg_id)
width = message['result']['contentSize']['width'] width = message["result"]["contentSize"]["width"]
height = message['result']['contentSize']['height'] height = message["result"]["contentSize"]["height"]
clip = dict(x=0, y=0, width=width, height=height, scale=1) clip = dict(x=0, y=0, width=width, height=height, scale=1)
deviceScaleFactor = 1 deviceScaleFactor = 1
screenOrientation = {'angle': 0, 'type': 'portraitPrimary'} screenOrientation = {"angle": 0, "type": "portraitPrimary"}
self.send_to_chrome( self.send_to_chrome(
method='Emulation.setDeviceMetricsOverride', method="Emulation.setDeviceMetricsOverride",
params=dict(mobile=False, width=width, height=height, params=dict(
mobile=False,
width=width,
height=height,
deviceScaleFactor=deviceScaleFactor, deviceScaleFactor=deviceScaleFactor,
screenOrientation=screenOrientation) screenOrientation=screenOrientation,
),
) )
capture_params = {'format': 'jpeg', 'quality': 95, 'clip': clip} capture_params = {"format": "jpeg", "quality": 95, "clip": clip}
else: else:
capture_params = {'format': 'jpeg', 'quality': 95} capture_params = {"format": "jpeg", "quality": 95}
self.websock_thread.expect_result(self._command_id.peek()) self.websock_thread.expect_result(self._command_id.peek())
msg_id = self.send_to_chrome(method='Page.captureScreenshot', msg_id = self.send_to_chrome(
params=capture_params) method="Page.captureScreenshot", params=capture_params
)
self._wait_for( self._wait_for(
lambda: self.websock_thread.received_result(msg_id), lambda: self.websock_thread.received_result(msg_id), timeout=timeout
timeout=timeout) )
message = self.websock_thread.pop_result(msg_id) message = self.websock_thread.pop_result(msg_id)
jpeg_bytes = base64.b64decode(message['result']['data']) jpeg_bytes = base64.b64decode(message["result"]["data"])
return jpeg_bytes return jpeg_bytes
def url(self, timeout=30): def url(self, timeout=30):
''' """
Returns value of document.URL from the browser. Returns value of document.URL from the browser.
''' """
self.websock_thread.expect_result(self._command_id.peek()) self.websock_thread.expect_result(self._command_id.peek())
msg_id = self.send_to_chrome( msg_id = self.send_to_chrome(
method='Runtime.evaluate', method="Runtime.evaluate", params={"expression": "document.URL"}
params={'expression': 'document.URL'}) )
self._wait_for( self._wait_for(
lambda: self.websock_thread.received_result(msg_id), lambda: self.websock_thread.received_result(msg_id), timeout=timeout
timeout=timeout) )
message = self.websock_thread.pop_result(msg_id) message = self.websock_thread.pop_result(msg_id)
return message['result']['result']['value'] return message["result"]["result"]["value"]
def run_behavior(self, behavior_script, timeout=900): def run_behavior(self, behavior_script, timeout=900):
self.send_to_chrome( self.send_to_chrome(
method='Runtime.evaluate', suppress_logging=True, method="Runtime.evaluate",
params={'expression': behavior_script}) suppress_logging=True,
params={"expression": behavior_script},
)
check_interval = min(timeout, 7) check_interval = min(timeout, 7)
start = time.time() start = time.time()
while True: while True:
elapsed = time.time() - start elapsed = time.time() - start
if elapsed > timeout: if elapsed > timeout:
logging.info( logging.info("behavior reached hard timeout after %.1fs", elapsed)
'behavior reached hard timeout after %.1fs', elapsed)
return return
brozzler.sleep(check_interval) brozzler.sleep(check_interval)
self.websock_thread.expect_result(self._command_id.peek()) self.websock_thread.expect_result(self._command_id.peek())
msg_id = self.send_to_chrome( msg_id = self.send_to_chrome(
method='Runtime.evaluate', suppress_logging=True, method="Runtime.evaluate",
params={'expression': 'umbraBehaviorFinished()'}) suppress_logging=True,
params={"expression": "umbraBehaviorFinished()"},
)
try: try:
self._wait_for( self._wait_for(
lambda: self.websock_thread.received_result(msg_id), lambda: self.websock_thread.received_result(msg_id), timeout=5
timeout=5) )
msg = self.websock_thread.pop_result(msg_id) msg = self.websock_thread.pop_result(msg_id)
if (msg and 'result' in msg if (
and not ('exceptionDetails' in msg['result']) msg
and not ('wasThrown' in msg['result'] and "result" in msg
and msg['result']['wasThrown']) and not ("exceptionDetails" in msg["result"])
                        and not (
                            "wasThrown" in msg["result"] and msg["result"]["wasThrown"]
                        )
                        and "result" in msg["result"]
                        and type(msg["result"]["result"]["value"]) == bool
                        and msg["result"]["result"]["value"]
                    ):
                        self.logger.info("behavior decided it has finished")
                        return
            except BrowsingTimeout:
                pass

    def try_login(self, username, password, timeout=300):
        try_login_js = (
            brozzler.jinja2_environment()
            .get_template("try-login.js.j2")
            .render(username=username, password=password)
        )

        self.websock_thread.got_page_load_event = None
        self.send_to_chrome(
            method="Runtime.evaluate",
            suppress_logging=True,
            params={"expression": try_login_js},
        )

        # wait for tryLogin to finish trying (should be very very quick)
        start = time.time()
        while True:
            self.websock_thread.expect_result(self._command_id.peek())
            msg_id = self.send_to_chrome(
                method="Runtime.evaluate",
                params={
                    "expression": 'try { __brzl_tryLoginState } catch (e) { "maybe-submitted-form" }'
                },
            )
            try:
                self._wait_for(
                    lambda: self.websock_thread.received_result(msg_id), timeout=5
                )
                msg = self.websock_thread.pop_result(msg_id)
                if msg and "result" in msg and "result" in msg["result"]:
                    result = msg["result"]["result"]["value"]
                    if result == "login-form-not-found":
                        # we're done
                        return
                    elif result in ("submitted-form", "maybe-submitted-form"):
                        # wait for page load event below
                        self.logger.info(
                            "submitted a login form, waiting for another "
                            "page load event"
                        )
                        break
                    # else try again to get __brzl_tryLoginState
@@ -780,23 +863,23 @@ class Browser:
            if time.time() - start > 30:
                raise BrowsingException(
                    "timed out trying to check if tryLogin finished"
                )

        # if we get here, we submitted a form, now we wait for another page
        # load event
        self._wait_for(lambda: self.websock_thread.got_page_load_event, timeout=timeout)


class Counter:
    def __init__(self):
        self.next_value = 0

    def __next__(self):
        try:
            return self.next_value
        finally:
            self.next_value += 1

    def peek(self):
        return self.next_value
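Aside (illustrative, not part of this commit): a Counter like the one above appears to back the self._command_id.peek() calls in try_login, handing out sequential chrome devtools command ids. A minimal sketch of that behavior, assuming only the Counter class shown above:

    counter = Counter()
    assert counter.peek() == 0   # peek() shows the id the next command would get
    first_id = next(counter)     # __next__ returns 0, then advances
    second_id = next(counter)    # returns 1
    assert (first_id, second_id, counter.peek()) == (0, 1, 2)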
@@ -1,4 +1,4 @@
"""
brozzler/chrome.py - manages the chrome/chromium browser for brozzler
Copyright (C) 2014-2023 Internet Archive
@@ -14,7 +14,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import logging
import urllib.request
@@ -31,12 +31,13 @@ import json
import tempfile
import sys


def check_version(chrome_exe):
    """
    Raises SystemExit if `chrome_exe` is not a supported browser version.

    Must run in the main thread to have the desired effect.
    """
    # mac$ /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --version
    # Google Chrome 64.0.3282.140
    # mac$ /Applications/Google\ Chrome\ Canary.app/Contents/MacOS/Google\ Chrome\ Canary --version
@@ -45,25 +46,28 @@ def check_version(chrome_exe):
    # Using PPAPI flash.
    #  --ppapi-flash-path=/usr/lib/adobe-flashplugin/libpepflashplayer.so --ppapi-flash-version=
    # Chromium 61.0.3163.100 Built on Ubuntu , running on Ubuntu 16.04
    cmd = [chrome_exe, "--version"]
    out = subprocess.check_output(cmd, timeout=60)
    m = re.search(rb"(Chromium|Google Chrome) ([\d.]+)", out)
    if not m:
        sys.exit(
            "unable to parse browser version from output of "
            "%r: %r" % (subprocess.list2cmdline(cmd), out)
        )
    version_str = m.group(2).decode()
    major_version = int(version_str.split(".")[0])
    if major_version < 64:
        sys.exit(
            "brozzler requires chrome/chromium version 64 or "
            "later but %s reports version %s" % (chrome_exe, version_str)
        )


class Chrome:
    logger = logging.getLogger(__module__ + "." + __qualname__)

    def __init__(self, chrome_exe, port=9222, ignore_cert_errors=False):
        """
        Initializes instance of this class.

        Doesn't start the browser, start() does that.
@@ -73,7 +77,7 @@ class Chrome:
            port: chrome debugging protocol port (default 9222)
            ignore_cert_errors: configure chrome to accept all certs (default
                False)
        """
        self.port = port
        self.chrome_exe = chrome_exe
        self.ignore_cert_errors = ignore_cert_errors
@@ -81,63 +85,72 @@ class Chrome:
        self.chrome_process = None

    def __enter__(self):
        """
        Returns websocket url to chrome window with about:blank loaded.
        """
        return self.start()

    def __exit__(self, *args):
        self.stop()

    def _init_cookie_db(self, cookie_db):
        cookie_dir = os.path.join(self._chrome_user_data_dir, "Default")
        cookie_location = os.path.join(cookie_dir, "Cookies")
        self.logger.debug("cookie DB provided, writing to %s", cookie_location)
        os.makedirs(cookie_dir, exist_ok=True)
        try:
            with open(cookie_location, "wb") as cookie_file:
                cookie_file.write(cookie_db)
        except OSError:
            self.logger.error(
                "exception writing cookie file at %s", cookie_location, exc_info=True
            )

    def persist_and_read_cookie_db(self):
        cookie_location = os.path.join(self._chrome_user_data_dir, "Default", "Cookies")
        self.logger.debug(
            "marking cookies persistent then reading file into memory: %s",
            cookie_location,
        )
        try:
            with sqlite3.connect(cookie_location) as conn:
                cur = conn.cursor()
                cur.execute("UPDATE cookies SET is_persistent = 1")
        except sqlite3.Error:
            try:
                # db schema changed around version 66, this is the old schema
                with sqlite3.connect(cookie_location) as conn:
                    cur = conn.cursor()
                    cur.execute("UPDATE cookies SET persistent = 1")
            except sqlite3.Error:
                self.logger.error(
                    "exception updating cookie DB %s", cookie_location, exc_info=True
                )
        cookie_db = None
        try:
            with open(cookie_location, "rb") as cookie_file:
                cookie_db = cookie_file.read()
        except OSError:
            self.logger.error(
                "exception reading from cookie DB file %s",
                cookie_location,
                exc_info=True,
            )
        return cookie_db

    def start(
        self,
        proxy=None,
        cookie_db=None,
        disk_cache_dir=None,
        disk_cache_size=None,
        websocket_timeout=60,
        window_height=900,
        window_width=1400,
    ):
        """
        Starts chrome/chromium process.

        Args:
@@ -154,103 +167,126 @@ class Chrome:
            window_height, window_width: window height and width, in pixels
        Returns:
            websocket url to chrome window with about:blank loaded
        """
        # these can raise exceptions
        self._home_tmpdir = tempfile.TemporaryDirectory()
        self._chrome_user_data_dir = os.path.join(
            self._home_tmpdir.name, "chrome-user-data"
        )
        if cookie_db:
            self._init_cookie_db(cookie_db)
        self._shutdown.clear()

        new_env = os.environ.copy()
        new_env["HOME"] = self._home_tmpdir.name
        chrome_args = [
            self.chrome_exe,
            "-v",
            "--headless",
            "--remote-debugging-port=%s" % self.port,
            "--use-mock-keychain",  # mac thing
            "--user-data-dir=%s" % self._chrome_user_data_dir,
            "--disable-background-networking",
            "--disable-breakpad",
            "--disable-renderer-backgrounding",
            "--disable-hang-monitor",
            "--disable-background-timer-throttling",
            "--mute-audio",
            "--disable-web-sockets",
            f"--window-size={window_width},{window_height}",
            "--no-default-browser-check",
            "--disable-first-run-ui",
            "--no-first-run",
            "--homepage=about:blank",
            "--disable-direct-npapi-requests",
            "--disable-web-security",
            "--disable-notifications",
            "--disable-extensions",
            "--disable-save-password-bubble",
            "--disable-sync",
        ]

        extra_chrome_args = os.environ.get("BROZZLER_EXTRA_CHROME_ARGS")
        if extra_chrome_args:
            chrome_args.extend(extra_chrome_args.split())
        if disk_cache_dir:
            chrome_args.append("--disk-cache-dir=%s" % disk_cache_dir)
        if disk_cache_size:
            chrome_args.append("--disk-cache-size=%s" % disk_cache_size)
        if self.ignore_cert_errors:
            chrome_args.append("--ignore-certificate-errors")
        if proxy:
            chrome_args.append("--proxy-server=%s" % proxy)
        chrome_args.append("about:blank")
        self.logger.info("running: %r", subprocess.list2cmdline(chrome_args))
        # start_new_session - new process group so we can kill the whole group
        self.chrome_process = subprocess.Popen(
            chrome_args,
            env=new_env,
            start_new_session=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            bufsize=0,
        )
        self._out_reader_thread = threading.Thread(
            target=self._read_stderr_stdout,
            name="ChromeOutReaderThread:%s" % self.port,
            daemon=True,
        )
        self._out_reader_thread.start()
        self.logger.info("chrome running, pid %s" % self.chrome_process.pid)

        return self._websocket_url(timeout_sec=websocket_timeout)

    def _websocket_url(self, timeout_sec=60):
        json_url = "http://localhost:%s/json" % self.port
        # make this a member variable so that kill -QUIT reports it
        self._start = time.time()
        self._last_warning = self._start
        while True:
            try:
                raw_json = urllib.request.urlopen(json_url, timeout=30).read()
                all_debug_info = json.loads(raw_json.decode("utf-8"))
                debug_info = [x for x in all_debug_info if x["url"] == "about:blank"]

                if debug_info and "webSocketDebuggerUrl" in debug_info[0]:
                    self.logger.debug("%s returned %s", json_url, raw_json)
                    url = debug_info[0]["webSocketDebuggerUrl"]
                    self.logger.info(
                        "got chrome window websocket debug url %s from %s",
                        url,
                        json_url,
                    )
                    return url
            except brozzler.ShutdownRequested:
                raise
            except Exception as e:
                if time.time() - self._last_warning > 30:
                    self.logger.warning(
                        "problem with %s (will keep trying until timeout "
                        "of %d seconds): %s",
                        json_url,
                        timeout_sec,
                        e,
                    )
                    self._last_warning = time.time()
            finally:
                e = None
                if self.chrome_process:
                    if time.time() - self._start > timeout_sec:
                        e = Exception(
                            "killing chrome, failed to retrieve %s after "
                            "%s seconds" % (json_url, time.time() - self._start)
                        )
                    elif self.chrome_process.poll() is not None:
                        e = Exception(
                            "chrome process died with status %s"
                            % self.chrome_process.poll()
                        )
                    else:
                        time.sleep(0.5)
                else:
                    e = Exception("??? self.chrome_process is not set ???")
                if e:
                    self.stop()
                    raise e
@@ -258,11 +294,13 @@ class Chrome:
    def _read_stderr_stdout(self):
        # XXX select doesn't work on windows
        def readline_nonblock(f):
            buf = b""
            try:
                while (
                    not self._shutdown.is_set()
                    and (len(buf) == 0 or buf[-1] != 0xA)
                    and select.select([f], [], [], 0.5)[0]
                ):
                    buf += f.read(1)
            except (ValueError, OSError):
                # When the chrome process crashes, stdout & stderr are closed
@@ -276,16 +314,16 @@ class Chrome:
                buf = readline_nonblock(self.chrome_process.stdout)
                if buf:
                    self.logger.trace(
                        "chrome pid %s STDOUT %s", self.chrome_process.pid, buf
                    )
                buf = readline_nonblock(self.chrome_process.stderr)
                if buf:
                    self.logger.trace(
                        "chrome pid %s STDERR %s", self.chrome_process.pid, buf
                    )
        except:
            self.logger.error("unexpected exception", exc_info=True)

    def stop(self):
        if not self.chrome_process or self._shutdown.is_set():
@@ -294,8 +332,7 @@ class Chrome:
        timeout_sec = 300
        if self.chrome_process.poll() is None:
            self.logger.info("terminating chrome pgid %s", self.chrome_process.pid)

            os.killpg(self.chrome_process.pid, signal.SIGTERM)
        t0 = time.time()
@@ -306,12 +343,14 @@ class Chrome:
                if status is not None:
                    if status == 0:
                        self.logger.info(
                            "chrome pid %s exited normally", self.chrome_process.pid
                        )
                    else:
                        self.logger.warning(
                            "chrome pid %s exited with nonzero status %s",
                            self.chrome_process.pid,
                            status,
                        )

                    # XXX I would like to forcefully kill the process group
                    # here to guarantee no orphaned chromium subprocesses hang
@@ -321,14 +360,18 @@ class Chrome:
                time.sleep(0.5)

            self.logger.warning(
                "chrome pid %s still alive %.1f seconds after sending "
                "SIGTERM, sending SIGKILL",
                self.chrome_process.pid,
                time.time() - t0,
            )
            os.killpg(self.chrome_process.pid, signal.SIGKILL)
            status = self.chrome_process.wait()
            self.logger.warning(
                "chrome pid %s reaped (status=%s) after killing with " "SIGKILL",
                self.chrome_process.pid,
                status,
            )
        finally:
            self.chrome_process.stdout.close()
@@ -337,8 +380,7 @@ class Chrome:
                self._home_tmpdir.cleanup()
            except:
                self.logger.error(
                    "exception deleting %s", self._home_tmpdir, exc_info=True
                )
            self._out_reader_thread.join()
            self.chrome_process = None
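Aside (illustrative, not part of this commit): because __enter__ returns start() and __exit__ calls stop(), the Chrome class above is naturally driven as a context manager. A minimal sketch under that assumption; the chromium path and the extra flag below are placeholders, not values taken from this diff:

    import os

    # start() reads optional extra flags from this environment variable
    os.environ["BROZZLER_EXTRA_CHROME_ARGS"] = "--blink-settings=imagesEnabled=false"  # placeholder flag

    chrome = Chrome("/usr/bin/chromium-browser", port=9222)  # hypothetical executable path
    with chrome as websocket_url:  # __enter__ -> start(), returns the devtools websocket url
        print("chrome devtools websocket url:", websocket_url)
    # leaving the block calls stop(), which SIGTERMs the process group and
    # falls back to SIGKILL if chrome has not exited within the timeout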
File diff suppressed because it is too large
@@ -1,4 +1,4 @@
"""
brozzler/dashboard/__init__.py - flask app for brozzler dashboard, defines api
endspoints etc
@@ -15,17 +15,20 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import logging
import sys

try:
    import flask
except ImportError as e:
    logging.critical(
        '%s: %s\n\nYou might need to run "pip install '
        'brozzler[dashboard]".\nSee README.rst for more information.',
        type(e).__name__,
        e,
    )
    sys.exit(1)
import doublethink
import json
@@ -41,33 +44,44 @@ app = flask.Flask(__name__)

# configure with environment variables
SETTINGS = {
    "RETHINKDB_SERVERS": os.environ.get(
        "BROZZLER_RETHINKDB_SERVERS", "localhost"
    ).split(","),
    "RETHINKDB_DB": os.environ.get("BROZZLER_RETHINKDB_DB", "brozzler"),
    "WAYBACK_BASEURL": os.environ.get(
        "WAYBACK_BASEURL", "http://localhost:8880/brozzler"
    ),
    "DASHBOARD_PORT": os.environ.get("DASHBOARD_PORT", "8000"),
    "DASHBOARD_INTERFACE": os.environ.get("DASHBOARD_INTERFACE", "localhost"),
}
rr = doublethink.Rethinker(SETTINGS["RETHINKDB_SERVERS"], db=SETTINGS["RETHINKDB_DB"])
_svc_reg = None


def service_registry():
    global _svc_reg
    if not _svc_reg:
        _svc_reg = doublethink.ServiceRegistry(rr)
    return _svc_reg


@app.route("/api/sites/<site_id>/queued_count")
@app.route("/api/site/<site_id>/queued_count")
def queued_count(site_id):
    reql = (
        rr.table("pages")
        .between(
            [site_id, 0, False, r.minval],
            [site_id, 0, False, r.maxval],
            index="priority_by_site",
        )
        .count()
    )
    logging.debug("querying rethinkdb: %s", reql)
    count = reql.run()
    return flask.jsonify(count=count)


@app.route("/api/sites/<site_id>/queue")
@app.route("/api/site/<site_id>/queue")
def queue(site_id):
@@ -75,38 +89,52 @@ def queue(site_id):
    start = flask.request.args.get("start", 0)
    end = flask.request.args.get("end", start + 90)
    reql = rr.table("pages").between(
        [site_id, 0, False, r.minval],
        [site_id, 0, False, r.maxval],
        index="priority_by_site",
    )[start:end]
    logging.debug("querying rethinkdb: %s", reql)
    queue_ = reql.run()
    return flask.jsonify(queue_=list(queue_))


@app.route("/api/sites/<site_id>/pages_count")
@app.route("/api/site/<site_id>/pages_count")
@app.route("/api/sites/<site_id>/page_count")
@app.route("/api/site/<site_id>/page_count")
def page_count(site_id):
    reql = (
        rr.table("pages")
        .between(
            [site_id, 1, False, r.minval],
            [site_id, r.maxval, False, r.maxval],
            index="priority_by_site",
        )
        .count()
    )
    logging.debug("querying rethinkdb: %s", reql)
    count = reql.run()
    return flask.jsonify(count=count)


@app.route("/api/sites/<site_id>/pages")
@app.route("/api/site/<site_id>/pages")
def pages(site_id):
    """Pages already crawled."""
    start = int(flask.request.args.get("start", 0))
    end = int(flask.request.args.get("end", start + 90))
    reql = (
        rr.table("pages")
        .between(
            [site_id, 1, r.minval], [site_id, r.maxval, r.maxval], index="least_hops"
        )
        .order_by(index="least_hops")[start:end]
    )
    logging.debug("querying rethinkdb: %s", reql)
    pages_ = reql.run()
    return flask.jsonify(pages=list(pages_))


@app.route("/api/pages/<page_id>")
@app.route("/api/page/<page_id>")
def page(page_id):
@@ -115,6 +143,7 @@ def page(page_id):
    page_ = reql.run()
    return flask.jsonify(page_)


@app.route("/api/pages/<page_id>/yaml")
@app.route("/api/page/<page_id>/yaml")
def page_yaml(page_id):
@@ -122,8 +151,9 @@ def page_yaml(page_id):
    logging.debug("querying rethinkdb: %s", reql)
    page_ = reql.run()
    return app.response_class(
        yaml.dump(page_, default_flow_style=False), mimetype="application/yaml"
    )


@app.route("/api/sites/<site_id>")
@app.route("/api/site/<site_id>")
@@ -135,6 +165,7 @@ def site(site_id):
        s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
    return flask.jsonify(s)


@app.route("/api/sites/<site_id>/yaml")
@app.route("/api/site/<site_id>/yaml")
def site_yaml(site_id):
@@ -142,8 +173,9 @@ def site_yaml(site_id):
    logging.debug("querying rethinkdb: %s", reql)
    site_ = reql.run()
    return app.response_class(
        yaml.dump(site_, default_flow_style=False), mimetype="application/yaml"
    )


@app.route("/api/stats/<bucket>")
def stats(bucket):
@@ -152,6 +184,7 @@ def stats(bucket):
    stats_ = reql.run()
    return flask.jsonify(stats_)


@app.route("/api/jobs/<job_id>/sites")
@app.route("/api/job/<job_id>/sites")
def sites(job_id):
@@ -168,6 +201,7 @@ def sites(job_id):
        s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
    return flask.jsonify(sites=sites_)


@app.route("/api/jobless-sites")
def jobless_sites():
    # XXX inefficient (unindexed) query
@@ -180,6 +214,7 @@ def jobless_sites():
        s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
    return flask.jsonify(sites=sites_)


@app.route("/api/jobs/<job_id>")
@app.route("/api/job/<job_id>")
def job(job_id):
@@ -192,6 +227,7 @@ def job(job_id):
    job_ = reql.run()
    return flask.jsonify(job_)


@app.route("/api/jobs/<job_id>/yaml")
@app.route("/api/job/<job_id>/yaml")
def job_yaml(job_id):
@@ -203,19 +239,22 @@ def job_yaml(job_id):
    logging.debug("querying rethinkdb: %s", reql)
    job_ = reql.run()
    return app.response_class(
        yaml.dump(job_, default_flow_style=False), mimetype="application/yaml"
    )


@app.route("/api/workers")
def workers():
    workers_ = service_registry().available_services("brozzler-worker")
    return flask.jsonify(workers=list(workers_))


@app.route("/api/services")
def services():
    services_ = service_registry().available_services()
    return flask.jsonify(services=list(services_))


@app.route("/api/jobs")
def jobs():
    reql = rr.table("jobs").order_by(r.desc("id"))
@@ -223,20 +262,24 @@ def jobs():
    jobs_ = list(reql.run())
    return flask.jsonify(jobs=jobs_)


@app.route("/api/config")
def config():
    return flask.jsonify(config=SETTINGS)


@app.route("/api/<path:path>")
@app.route("/api", defaults={"path": ""})
def api404(path):
    flask.abort(404)


@app.route("/", defaults={"path": ""})
@app.route("/<path:path>")
def root(path):
    return flask.render_template("index.html")


try:
    import gunicorn.app.base
    from gunicorn.six import iteritems
@@ -255,8 +298,12 @@ try:

        def load_config(self):
            config = dict(
                [
                    (key, value)
                    for key, value in iteritems(self.options)
                    if key in self.cfg.settings and value is not None
                ]
            )
            for key, value in iteritems(config):
                self.cfg.set(key.lower(), value)
            self.cfg.set("logger_class", BypassGunicornLogging)
@@ -270,37 +317,42 @@ try:
        GunicornBrozzlerDashboard(app, options).run()

except ImportError:

    def run():
        logging.info("running brozzler-dashboard using simple flask app.run")
        app.run(host=SETTINGS["DASHBOARD_INTERFACE"], port=SETTINGS["DASHBOARD_PORT"])


def main(argv=None):
    import argparse
    import brozzler.cli

    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=(
            "brozzler-dashboard - web application for viewing brozzler " "crawl status"
        ),
        epilog=(
            "brozzler-dashboard has no command line options, but can be "
            "configured using the following environment variables:\n\n"
            " BROZZLER_RETHINKDB_SERVERS rethinkdb servers, e.g. "
            "db0.foo.org,db0.foo.org:38015,db1.foo.org (default: "
            "localhost)\n"
            " BROZZLER_RETHINKDB_DB rethinkdb database name "
            "(default: brozzler)\n"
            " WAYBACK_BASEURL base url for constructing wayback "
            "links (default http://localhost:8880/brozzler)"
            " DASHBOARD_PORT brozzler-dashboard listening port (default: 8000)\n"
            " DASHBOARD_INTERFACE brozzler-dashboard network interface binding (default: localhost)"
        ),
    )
    brozzler.cli.add_common_options(arg_parser, argv)
    args = arg_parser.parse_args(args=argv[1:])
    brozzler.cli.configure_logging(args)
    run()


if __name__ == "__main__":
    main()
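Aside (illustrative, not part of this commit): as the epilog above says, the dashboard is configured only through environment variables, and SETTINGS is populated from os.environ at import time, so the variables have to be set before brozzler.dashboard is imported. A small sketch with made-up values:

    import os

    os.environ["BROZZLER_RETHINKDB_SERVERS"] = "db0.example.org,db1.example.org"  # hypothetical hosts
    os.environ["DASHBOARD_PORT"] = "8001"  # hypothetical port

    import brozzler.dashboard

    brozzler.dashboard.main(["brozzler-dashboard"])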
@@ -1,5 +1,5 @@
#!/usr/bin/env python
"""
brozzler-easy - brozzler-worker, warcprox, pywb, and brozzler-dashboard all
working together in a single process
@@ -16,10 +16,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import sys
import logging

try:
    import warcprox
    import warcprox.main
@@ -32,7 +33,9 @@ except ImportError as e:
    logging.critical(
        '%s: %s\n\nYou might need to run "pip install '
        'brozzler[easy]".\nSee README.rst for more information.',
        type(e).__name__,
        e,
    )
    sys.exit(1)
import argparse
import brozzler
@@ -46,76 +49,112 @@ import doublethink
import traceback
import socketserver


def _build_arg_parser(argv=None):
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        formatter_class=brozzler.cli.BetterArgumentDefaultsHelpFormatter,
        prog=os.path.basename(argv[0]),
        description=(
            "brozzler-easy - easy deployment of brozzler, with "
            "brozzler-worker, warcprox, pywb, and brozzler-dashboard all "
            "running in a single process"
        ),
    )

    # common args
    brozzler.cli.add_rethinkdb_options(arg_parser)
    arg_parser.add_argument(
        "-d",
        "--warcs-dir",
        dest="warcs_dir",
        default="./warcs",
        help="where to write warcs",
    )

    # warcprox args
    arg_parser.add_argument(
        "-c",
        "--cacert",
        dest="cacert",
        default="./%s-warcprox-ca.pem" % socket.gethostname(),
        help=(
            "warcprox CA certificate file; if file does not exist, it "
            "will be created"
        ),
    )
    arg_parser.add_argument(
        "--certs-dir",
        dest="certs_dir",
        default="./%s-warcprox-ca" % socket.gethostname(),
        help="where warcprox will store and load generated certificates",
    )
    arg_parser.add_argument(
        "--onion-tor-socks-proxy",
        dest="onion_tor_socks_proxy",
        default=None,
        help=("host:port of tor socks proxy, used only to connect to " ".onion sites"),
    )

    # brozzler-worker args
    arg_parser.add_argument(
        "-e",
        "--chrome-exe",
        dest="chrome_exe",
        default=brozzler.cli.suggest_default_chrome_exe(),
        help="executable to use to invoke chrome",
    )
    arg_parser.add_argument(
        "-n",
        "--max-browsers",
        dest="max_browsers",
        type=int,
        default=1,
        help=("max number of chrome instances simultaneously " "browsing pages"),
    )

    # pywb args
    arg_parser.add_argument(
        "--pywb-address",
        dest="pywb_address",
        default="0.0.0.0",
        help="pywb wayback address to listen on",
    )
    arg_parser.add_argument(
        "--pywb-port",
        dest="pywb_port",
        type=int,
        default=8880,
        help="pywb wayback port",
    )

    # dashboard args
    arg_parser.add_argument(
        "--dashboard-address",
        dest="dashboard_address",
        default="localhost",
        help="brozzler dashboard address to listen on",
    )
    arg_parser.add_argument(
        "--dashboard-port",
        dest="dashboard_port",
        type=int,
        default=8881,
        help="brozzler dashboard port",
    )

    # common at the bottom args
    brozzler.cli.add_common_options(arg_parser, argv)

    return arg_parser


class ThreadingWSGIServer(
    socketserver.ThreadingMixIn, wsgiref.simple_server.WSGIServer
):
    pass


class BrozzlerEasyController:
    logger = logging.getLogger(__module__ + "." + __qualname__)
@@ -123,25 +162,31 @@ class BrozzlerEasyController:
        self.stop = threading.Event()
        self.args = args
        self.warcprox_controller = warcprox.controller.WarcproxController(
            self._warcprox_opts(args)
        )
        self.brozzler_worker = self._init_brozzler_worker(args)
        self.pywb_httpd = self._init_pywb(args)
        self.dashboard_httpd = self._init_brozzler_dashboard(args)

    def _init_brozzler_dashboard(self, args):
        return wsgiref.simple_server.make_server(
            args.dashboard_address,
            args.dashboard_port,
            brozzler.dashboard.app,
            ThreadingWSGIServer,
        )

    def _init_brozzler_worker(self, args):
        rr = doublethink.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db)
        frontier = brozzler.RethinkDbFrontier(rr)
        service_registry = doublethink.ServiceRegistry(rr)
        worker = brozzler.worker.BrozzlerWorker(
            frontier,
            service_registry,
            chrome_exe=args.chrome_exe,
            proxy="%s:%s" % self.warcprox_controller.proxy.server_address,
            max_browsers=args.max_browsers,
        )
        return worker

    def _init_pywb(self, args):
@@ -152,66 +197,67 @@ class BrozzlerEasyController:
        brozzler.pywb.monkey_patch_fuzzy_query()
        brozzler.pywb.monkey_patch_calc_search_range()

        if args.warcs_dir.endswith("/"):
            warcs_dir = args.warcs_dir
        else:
            warcs_dir = args.warcs_dir + "/"

        conf = {
            "collections": {
                "brozzler": {
                    "index_paths": brozzler.pywb.RethinkCDXSource(
                        servers=args.rethinkdb_servers.split(","),
                        db=args.rethinkdb_db,
                        table="captures",
                    )
                },
            },
            # 'enable_http_proxy': True,
            # 'enable_memento': True,
            "archive_paths": warcs_dir,
            "enable_cdx_api": True,
            "framed_replay": True,
            "port": args.pywb_port,
            "enable_auto_colls": False,
        }
        wsgi_app = pywb.framework.wsgi_wrappers.init_app(
            pywb.webapp.pywb_init.create_wb_router, config=conf, load_yaml=False
        )
        # disable is_hop_by_hop restrictions
        wsgiref.handlers.is_hop_by_hop = lambda x: False
        return wsgiref.simple_server.make_server(
            args.pywb_address, args.pywb_port, wsgi_app, ThreadingWSGIServer
        )

    def start(self):
        self.logger.info("starting warcprox")
        self.warcprox_controller.start()
        # XXX wait til fully started?
        self.logger.info("starting brozzler-worker")
        self.brozzler_worker.start()
        self.logger.info("starting pywb at %s:%s", *self.pywb_httpd.server_address)
        threading.Thread(target=self.pywb_httpd.serve_forever).start()
        self.logger.info(
            "starting brozzler-dashboard at %s:%s", *self.dashboard_httpd.server_address
        )
        threading.Thread(target=self.dashboard_httpd.serve_forever).start()

    def shutdown(self):
        self.logger.info("shutting down brozzler-dashboard")
        self.dashboard_httpd.shutdown()
        self.logger.info("shutting down brozzler-worker")
        self.brozzler_worker.shutdown_now()
        # brozzler-worker is fully shut down at this point
        self.logger.info("shutting down pywb")
        self.pywb_httpd.shutdown()
        self.logger.info("shutting down warcprox")
        self.warcprox_controller.shutdown()

    def wait_for_shutdown_request(self):
@@ -222,14 +268,14 @@ class BrozzlerEasyController:
        self.shutdown()

    def _warcprox_opts(self, args):
        """
        Takes args as produced by the argument parser built by
        _build_arg_parser and builds warcprox arguments object suitable to pass
        to warcprox.main.init_controller. Copies some arguments, renames some,
        populates some with defaults appropriate for brozzler-easy, etc.
        """
        warcprox_opts = warcprox.Options()
        warcprox_opts.address = "localhost"
        # let the OS choose an available port; discover it later using
        # sock.getsockname()[1]
        warcprox_opts.port = 0
@@ -237,17 +283,18 @@ class BrozzlerEasyController:
        warcprox_opts.certs_dir = args.certs_dir
        warcprox_opts.directory = args.warcs_dir
        warcprox_opts.gzip = True
        warcprox_opts.prefix = "brozzler"
        warcprox_opts.size = 1000 * 1000 * 1000
        warcprox_opts.rollover_idle_time = 3 * 60
        warcprox_opts.digest_algorithm = "sha1"
        warcprox_opts.base32 = True
        warcprox_opts.stats_db_file = None
        warcprox_opts.playback_port = None
        warcprox_opts.playback_index_db_file = None
        warcprox_opts.rethinkdb_big_table_url = "rethinkdb://%s/%s/captures" % (
            args.rethinkdb_servers,
            args.rethinkdb_db,
        )
        warcprox_opts.queue_size = 500
        warcprox_opts.max_threads = None
        warcprox_opts.profile = False
@@ -259,9 +306,11 @@ class BrozzlerEasyController:
        for th in threading.enumerate():
            state_strs.append(str(th))
            stack = traceback.format_stack(sys._current_frames()[th.ident])
            state_strs.append("".join(stack))
        logging.warning(
            "dumping state (caught signal {})\n{}".format(signum, "\n".join(state_strs))
        )


def main(argv=None):
    argv = argv or sys.argv
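Aside (illustrative, not part of this commit): _warcprox_opts sets warcprox_opts.port = 0 so the OS picks a free port, and _init_brozzler_worker later reads the bound address back from self.warcprox_controller.proxy.server_address to build the proxy string. The same trick in isolation, as a self-contained sketch:

    import socket

    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind(("localhost", 0))      # port 0: let the OS choose an available port
    host, port = sock.getsockname()  # discover which port was actually bound
    proxy = "%s:%s" % (host, port)
    sock.close()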
@@ -1,4 +1,4 @@
"""
brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages
Copyright (C) 2014-2018 Internet Archive
@@ -14,7 +14,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import logging
import brozzler
@@ -27,9 +27,11 @@ import urlcanon

r = rdb.RethinkDB()


class UnexpectedDbResult(Exception):
    pass


class RethinkDbFrontier:
    logger = logging.getLogger(__module__ + "." + __qualname__)
@@ -47,40 +49,49 @@ class RethinkDbFrontier:
        tables = self.rr.table_list().run()
        if not "sites" in tables:
            self.logger.info(
                "creating rethinkdb table 'sites' in database %r", self.rr.dbname
            )
            self.rr.table_create(
                "sites", shards=self.shards, replicas=self.replicas
            ).run()
            self.rr.table("sites").index_create(
                "sites_last_disclaimed", [r.row["status"], r.row["last_disclaimed"]]
            ).run()
            self.rr.table("sites").index_create("job_id").run()
        if not "pages" in tables:
            self.logger.info(
                "creating rethinkdb table 'pages' in database %r", self.rr.dbname
            )
            self.rr.table_create(
                "pages", shards=self.shards, replicas=self.replicas
            ).run()
            self.rr.table("pages").index_create(
                "priority_by_site",
                [
                    r.row["site_id"],
                    r.row["brozzle_count"],
                    r.row["claimed"],
                    r.row["priority"],
                ],
            ).run()
            # this index is for displaying pages in a sensible order in the web
            # console
            self.rr.table("pages").index_create(
                "least_hops",
                [r.row["site_id"], r.row["brozzle_count"], r.row["hops_from_seed"]],
            ).run()
        if not "jobs" in tables:
            self.logger.info(
                "creating rethinkdb table 'jobs' in database %r", self.rr.dbname
            )
            self.rr.table_create(
                "jobs", shards=self.shards, replicas=self.replicas
            ).run()

    def _vet_result(self, result, **kwargs):
        # self.logger.debug("vetting expected=%s result=%s", kwargs, result)
        # {'replaced': 0, 'errors': 0, 'skipped': 0, 'inserted': 1, 'deleted': 0, 'generated_keys': ['292859c1-4926-4b27-9d87-b2c367667058'], 'unchanged': 0}
        for k in ["replaced", "errors", "skipped", "inserted", "deleted", "unchanged"]:
            if k in kwargs:
                expected = kwargs[k]
            else:
@@ -88,55 +99,81 @@ class RethinkDbFrontier:
            if isinstance(expected, list):
                if result.get(k) not in kwargs[k]:
                    raise UnexpectedDbResult(
                        "expected %r to be one of %r in %r" % (k, expected, result)
                    )
            else:
                if result.get(k) != expected:
                    raise UnexpectedDbResult(
                        "expected %r to be %r in %r" % (k, expected, result)
                    )

    def claim_sites(self, n=1):
        self.logger.trace("claiming up to %s sites to brozzle", n)
        result = (
            self.rr.table("sites")
            .get_all(
                r.args(
                    r.db(self.rr.dbname)
                    .table("sites", read_mode="majority")
                    .between(
                        ["ACTIVE", r.minval],
                        ["ACTIVE", r.maxval],
                        index="sites_last_disclaimed",
                    )
                    .order_by(r.desc("claimed"), "last_disclaimed")
                    .fold(
                        {},
                        lambda acc, site: acc.merge(
                            r.branch(
                                site.has_fields("job_id"),
                                r.object(
                                    site["job_id"].coerce_to("string"),
                                    acc[site["job_id"].coerce_to("string")]
                                    .default(0)
                                    .add(1),
                                ),
                                {},
                            )
                        ),
                        emit=lambda acc, site, new_acc: r.branch(
                            r.and_(
                                r.or_(
                                    site["claimed"].not_(),
                                    site["last_claimed"].lt(r.now().sub(60 * 60)),
                                ),
                                r.or_(
                                    site.has_fields("max_claimed_sites").not_(),
                                    new_acc[site["job_id"].coerce_to("string")].le(
                                        site["max_claimed_sites"]
                                    ),
                                ),
                            ),
                            [site["id"]],
                            [],
                        ),
                    )
                    .limit(n)
                )
            )
            .update(
                # try to avoid a race condition resulting in multiple
                # brozzler-workers claiming the same site
                # see https://github.com/rethinkdb/rethinkdb/issues/3235#issuecomment-60283038
                r.branch(
                    r.or_(
                        r.row["claimed"].not_(),
                        r.row["last_claimed"].lt(r.now().sub(60 * 60)),
                    ),
                    {"claimed": True, "last_claimed": r.now()},
                    {},
                ),
                return_changes=True,
            )
        ).run()
        self._vet_result(
            result, replaced=list(range(n + 1)), unchanged=list(range(n + 1))
        )
        sites = []
        for i in range(result["replaced"]):
            if result["changes"][i]["old_val"]["claimed"]:
@@ -145,24 +182,27 @@ class RethinkDbFrontier:
                    "because it was last claimed a long time ago "
                    "at %s, and presumably some error stopped it from "
                    "being disclaimed",
                    result["changes"][i]["old_val"]["last_claimed"],
                )
            site = brozzler.Site(self.rr, result["changes"][i]["new_val"])
            sites.append(site)
        self.logger.debug("claimed %s sites", len(sites))
        if sites:
            return sites
        else:
            raise brozzler.NothingToClaim

    def enforce_time_limit(self, site):
        """
        Raises `brozzler.ReachedTimeLimit` if appropriate.
        """
        if site.time_limit and site.time_limit > 0 and site.elapsed() > site.time_limit:
            self.logger.debug(
                "site FINISHED_TIME_LIMIT! time_limit=%s " "elapsed=%s %s",
                site.time_limit,
                site.elapsed(),
                site,
            )
            raise brozzler.ReachedTimeLimit

    def claim_page(self, site, worker_id):
@@ -170,15 +210,20 @@ class RethinkDbFrontier:
        # brozzler-worker can be working on a site at a time, and that would
        # have to be the worker calling this method, so if something is claimed
        # already, it must have been left that way because of some error
        result = (
            self.rr.table("pages")
            .between(
                [site.id, 0, r.minval, r.minval],
                [site.id, 0, r.maxval, r.maxval],
                index="priority_by_site",
            )
            .order_by(index=r.desc("priority_by_site"))
            .limit(1)
            .update(
                {"claimed": True, "last_claimed_by": worker_id}, return_changes="always"
            )
            .run()
        )
        self._vet_result(result, unchanged=[0, 1], replaced=[0, 1])
        if result["unchanged"] == 0 and result["replaced"] == 0:
            raise brozzler.NothingToClaim
@@ -186,10 +231,16 @@ class RethinkDbFrontier:
            return brozzler.Page(self.rr, result["changes"][0]["new_val"])

    def has_outstanding_pages(self, site):
        results_iter = (
            self.rr.table("pages")
            .between(
                [site.id, 0, r.minval, r.minval],
                [site.id, 0, r.maxval, r.maxval],
                index="priority_by_site",
            )
            .limit(1)
            .run()
        )
        return len(list(results_iter)) > 0

    def completed_page(self, site, page):
@@ -209,15 +260,17 @@ class RethinkDbFrontier:
    def honor_stop_request(self, site):
        """Raises brozzler.CrawlStopped if stop has been requested."""
        site.refresh()
        if site.stop_requested and site.stop_requested <= doublethink.utcnow():
            self.logger.info("stop requested for site %s", site.id)
            raise brozzler.CrawlStopped

        if site.job_id:
            job = brozzler.Job.load(self.rr, site.job_id)
            if (
                job
                and job.stop_requested
                and job.stop_requested <= doublethink.utcnow()
            ):
                self.logger.info("stop requested for job %s", site.job_id)
                raise brozzler.CrawlStopped
@@ -239,8 +292,7 @@ class RethinkDbFrontier:
                return False
            n += 1

        self.logger.info("all %s sites finished, job %s is FINISHED!", n, job.id)
        job.finish()
        job.save()
        return True
@@ -270,13 +322,11 @@ class RethinkDbFrontier:
    def resume_job(self, job):
        job.status = "ACTIVE"
        job.stop_requested = None
        job.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None})
        job.save()
        for site in self.job_sites(job.id):
            site.status = "ACTIVE"
            site.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None})
{"start":doublethink.utcnow(), "stop":None})
site.save() site.save()
def resume_site(self, site): def resume_site(self, site):
@ -285,51 +335,55 @@ class RethinkDbFrontier:
job = brozzler.Job.load(self.rr, site.job_id) job = brozzler.Job.load(self.rr, site.job_id)
job.status = "ACTIVE" job.status = "ACTIVE"
site.stop_requested = None site.stop_requested = None
job.starts_and_stops.append( job.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None})
{"start":doublethink.utcnow(), "stop":None})
job.save() job.save()
site.status = "ACTIVE" site.status = "ACTIVE"
site.starts_and_stops.append( site.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None})
{"start":doublethink.utcnow(), "stop":None})
site.save() site.save()
def _build_fresh_page(self, site, parent_page, url, hops_off=0): def _build_fresh_page(self, site, parent_page, url, hops_off=0):
url_for_scoping = urlcanon.semantic(url) url_for_scoping = urlcanon.semantic(url)
url_for_crawling = urlcanon.whatwg(url) url_for_crawling = urlcanon.whatwg(url)
hashtag = (url_for_crawling.hash_sign hashtag = (url_for_crawling.hash_sign + url_for_crawling.fragment).decode(
+ url_for_crawling.fragment).decode('utf-8') "utf-8"
)
urlcanon.canon.remove_fragment(url_for_crawling) urlcanon.canon.remove_fragment(url_for_crawling)
page = brozzler.Page(self.rr, { page = brozzler.Page(
'url': str(url_for_crawling), self.rr,
'site_id': site.id, {
'job_id': site.job_id, "url": str(url_for_crawling),
'hops_from_seed': parent_page.hops_from_seed + 1, "site_id": site.id,
'hop_path': str(parent_page.hop_path if parent_page.hop_path else "") + "L", "job_id": site.job_id,
'via_page_id': parent_page.id, "hops_from_seed": parent_page.hops_from_seed + 1,
'via_page_url': parent_page.url, "hop_path": str(parent_page.hop_path if parent_page.hop_path else "")
'hops_off_surt': hops_off, + "L",
'hashtags': [hashtag] if hashtag else []}) "via_page_id": parent_page.id,
"via_page_url": parent_page.url,
"hops_off_surt": hops_off,
"hashtags": [hashtag] if hashtag else [],
},
)
return page return page
def _merge_page(self, existing_page, fresh_page): def _merge_page(self, existing_page, fresh_page):
''' """
Utility method for merging info from `brozzler.Page` instances Utility method for merging info from `brozzler.Page` instances
representing the same url but with possibly different metadata. representing the same url but with possibly different metadata.
''' """
existing_page.priority += fresh_page.priority existing_page.priority += fresh_page.priority
existing_page.hashtags = list(set( existing_page.hashtags = list(
(existing_page.hashtags or []) + (fresh_page.hashtags or []))) set((existing_page.hashtags or []) + (fresh_page.hashtags or []))
existing_page.hops_off = min( )
existing_page.hops_off, fresh_page.hops_off) existing_page.hops_off = min(existing_page.hops_off, fresh_page.hops_off)
def _scope_and_enforce_robots(self, site, parent_page, outlinks): def _scope_and_enforce_robots(self, site, parent_page, outlinks):
''' """
Returns tuple ( Returns tuple (
dict of {page_id: Page} of fresh `brozzler.Page` representing in dict of {page_id: Page} of fresh `brozzler.Page` representing in
scope links accepted by robots policy, scope links accepted by robots policy,
set of in scope urls (canonicalized) blocked by robots policy, set of in scope urls (canonicalized) blocked by robots policy,
set of out-of-scope urls (canonicalized)). set of out-of-scope urls (canonicalized)).
''' """
pages = {} # {page_id: Page, ...} pages = {} # {page_id: Page, ...}
blocked = set() blocked = set()
out_of_scope = set() out_of_scope = set()
@ -337,17 +391,18 @@ class RethinkDbFrontier:
url_for_scoping = urlcanon.semantic(url) url_for_scoping = urlcanon.semantic(url)
url_for_crawling = urlcanon.whatwg(url) url_for_crawling = urlcanon.whatwg(url)
decision = site.accept_reject_or_neither( decision = site.accept_reject_or_neither(
url_for_scoping, parent_page=parent_page) url_for_scoping, parent_page=parent_page
)
if decision is True: if decision is True:
hops_off = 0 hops_off = 0
elif decision is None: elif decision is None:
decision = parent_page.hops_off < site.scope.get( decision = parent_page.hops_off < site.scope.get("max_hops_off", 0)
'max_hops_off', 0)
hops_off = parent_page.hops_off + 1 hops_off = parent_page.hops_off + 1
if decision is True: if decision is True:
if brozzler.is_permitted_by_robots(site, str(url_for_crawling)): if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
fresh_page = self._build_fresh_page( fresh_page = self._build_fresh_page(
site, parent_page, url, hops_off) site, parent_page, url, hops_off
)
if fresh_page.id in pages: if fresh_page.id in pages:
self._merge_page(pages[fresh_page.id], fresh_page) self._merge_page(pages[fresh_page.id], fresh_page)
else: else:
@ -359,31 +414,32 @@ class RethinkDbFrontier:
return pages, blocked, out_of_scope return pages, blocked, out_of_scope
def scope_and_schedule_outlinks(self, site, parent_page, outlinks): def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
decisions = {'accepted':set(),'blocked':set(),'rejected':set()} decisions = {"accepted": set(), "blocked": set(), "rejected": set()}
counts = {'added':0,'updated':0,'rejected':0,'blocked':0} counts = {"added": 0, "updated": 0, "rejected": 0, "blocked": 0}
fresh_pages, blocked, out_of_scope = self._scope_and_enforce_robots( fresh_pages, blocked, out_of_scope = self._scope_and_enforce_robots(
site, parent_page, outlinks) site, parent_page, outlinks
decisions['blocked'] = blocked )
decisions['rejected'] = out_of_scope decisions["blocked"] = blocked
counts['blocked'] += len(blocked) decisions["rejected"] = out_of_scope
counts['rejected'] += len(out_of_scope) counts["blocked"] += len(blocked)
counts["rejected"] += len(out_of_scope)
# get existing pages from rethinkdb # get existing pages from rethinkdb
results = self.rr.table('pages').get_all(*fresh_pages.keys()).run() results = self.rr.table("pages").get_all(*fresh_pages.keys()).run()
pages = {doc['id']: brozzler.Page(self.rr, doc) for doc in results} pages = {doc["id"]: brozzler.Page(self.rr, doc) for doc in results}
# build list of pages to save, consisting of new pages, and existing # build list of pages to save, consisting of new pages, and existing
# pages updated with higher priority and new hashtags # pages updated with higher priority and new hashtags
for fresh_page in fresh_pages.values(): for fresh_page in fresh_pages.values():
decisions['accepted'].add(fresh_page.url) decisions["accepted"].add(fresh_page.url)
if fresh_page.id in pages: if fresh_page.id in pages:
page = pages[fresh_page.id] page = pages[fresh_page.id]
self._merge_page(page, fresh_page) self._merge_page(page, fresh_page)
counts['updated'] += 1 counts["updated"] += 1
else: else:
pages[fresh_page.id] = fresh_page pages[fresh_page.id] = fresh_page
counts['added'] += 1 counts["added"] += 1
# make sure we're not stepping on our own toes in case we have a link # make sure we're not stepping on our own toes in case we have a link
# back to parent_page, which I think happens because of hashtags # back to parent_page, which I think happens because of hashtags
@ -398,17 +454,20 @@ class RethinkDbFrontier:
l = list(pages.values()) l = list(pages.values())
for batch in (l[i : i + 50] for i in range(0, len(l), 50)): for batch in (l[i : i + 50] for i in range(0, len(l), 50)):
try: try:
self.logger.debug( self.logger.debug("inserting/replacing batch of %s pages", len(batch))
'inserting/replacing batch of %s pages', len(batch)) reql = self.rr.table("pages").insert(batch, conflict="replace")
reql = self.rr.table('pages').insert(batch, conflict='replace')
self.logger.trace( self.logger.trace(
'running query self.rr.table("pages").insert(%r, ' 'running query self.rr.table("pages").insert(%r, '
'conflict="replace")', batch) 'conflict="replace")',
batch,
)
result = reql.run() result = reql.run()
except Exception as e: except Exception as e:
self.logger.error( self.logger.error(
'problem inserting/replacing batch of %s pages', "problem inserting/replacing batch of %s pages",
len(batch), exc_info=True) len(batch),
exc_info=True,
)
parent_page.outlinks = {} parent_page.outlinks = {}
for k in decisions: for k in decisions:
@ -416,43 +475,56 @@ class RethinkDbFrontier:
parent_page.save() parent_page.save()
self.logger.info( self.logger.info(
'%s new links added, %s existing links updated, %s links ' "%s new links added, %s existing links updated, %s links "
'rejected, %s links blocked by robots from %s', "rejected, %s links blocked by robots from %s",
counts['added'], counts['updated'], counts['rejected'], counts["added"],
counts['blocked'], parent_page) counts["updated"],
counts["rejected"],
counts["blocked"],
parent_page,
)
def reached_limit(self, site, e): def reached_limit(self, site, e):
self.logger.info("reached_limit site=%s e=%s", site, e) self.logger.info("reached_limit site=%s e=%s", site, e)
assert isinstance(e, brozzler.ReachedLimit) assert isinstance(e, brozzler.ReachedLimit)
if (site.reached_limit if (
and site.reached_limit != e.warcprox_meta["reached-limit"]): site.reached_limit
and site.reached_limit != e.warcprox_meta["reached-limit"]
):
self.logger.warning( self.logger.warning(
"reached limit %s but site had already reached limit %s", "reached limit %s but site had already reached limit %s",
e.warcprox_meta["reached-limit"], self.reached_limit) e.warcprox_meta["reached-limit"],
self.reached_limit,
)
else: else:
site.reached_limit = e.warcprox_meta["reached-limit"] site.reached_limit = e.warcprox_meta["reached-limit"]
self.finished(site, "FINISHED_REACHED_LIMIT") self.finished(site, "FINISHED_REACHED_LIMIT")
def job_sites(self, job_id): def job_sites(self, job_id):
results = self.rr.table('sites').get_all(job_id, index="job_id").run() results = self.rr.table("sites").get_all(job_id, index="job_id").run()
for result in results: for result in results:
yield brozzler.Site(self.rr, result) yield brozzler.Site(self.rr, result)
def seed_page(self, site_id): def seed_page(self, site_id):
results = self.rr.table("pages").between( results = (
self.rr.table("pages")
.between(
[site_id, r.minval, r.minval, r.minval], [site_id, r.minval, r.minval, r.minval],
[site_id, r.maxval, r.maxval, r.maxval], [site_id, r.maxval, r.maxval, r.maxval],
index="priority_by_site").filter({"hops_from_seed":0}).run() index="priority_by_site",
)
.filter({"hops_from_seed": 0})
.run()
)
pages = list(results) pages = list(results)
if len(pages) > 1: if len(pages) > 1:
self.logger.warning( self.logger.warning("more than one seed page for site_id %s ?", site_id)
"more than one seed page for site_id %s ?", site_id)
if len(pages) < 1: if len(pages) < 1:
return None return None
return brozzler.Page(self.rr, pages[0]) return brozzler.Page(self.rr, pages[0])
def site_pages(self, site_id, brozzled=None): def site_pages(self, site_id, brozzled=None):
''' """
Args: Args:
site_id (str or int): site_id (str or int):
brozzled (bool): if true, results include only pages that have brozzled (bool): if true, results include only pages that have
@ -460,16 +532,14 @@ class RethinkDbFrontier:
not been brozzled; and if None (the default), all pages not been brozzled; and if None (the default), all pages
Returns: Returns:
iterator of brozzler.Page iterator of brozzler.Page
''' """
query = self.rr.table("pages").between( query = self.rr.table("pages").between(
[site_id, 1 if brozzled is True else 0, [site_id, 1 if brozzled is True else 0, r.minval, r.minval],
r.minval, r.minval], [site_id, 0 if brozzled is False else r.maxval, r.maxval, r.maxval],
[site_id, 0 if brozzled is False else r.maxval, index="priority_by_site",
r.maxval, r.maxval], )
index="priority_by_site")
self.logger.trace("running query: %r", query) self.logger.trace("running query: %r", query)
results = query.run() results = query.run()
for result in results: for result in results:
self.logger.trace("yielding result: %r", result) self.logger.trace("yielding result: %r", result)
yield brozzler.Page(self.rr, result) yield brozzler.Page(self.rr, result)
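The between() bounds in site_pages() above encode the brozzled filter in the second component of the "priority_by_site" index key. A plain-Python restatement of the three cases (assuming that component is 0 for unbrozzled pages and 1 for brozzled ones):

def priority_by_site_bounds(site_id, brozzled, minval="MIN", maxval="MAX"):
    # mirrors the expressions passed to between() in site_pages() above
    lower = [site_id, 1 if brozzled is True else 0, minval, minval]
    upper = [site_id, 0 if brozzled is False else maxval, maxval, maxval]
    return lower, upper

# brozzled=True  -> second key spans 1..MAX  (only brozzled pages)
# brozzled=False -> second key spans 0..0    (only unbrozzled pages)
# brozzled=None  -> second key spans 0..MAX  (every page for the site)
print(priority_by_site_bounds("some-site-id", None))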

brozzler/models.py
View file

@ -1,4 +1,4 @@
''' """
brozzler/models.py - model classes representing jobs, sites, and pages, with brozzler/models.py - model classes representing jobs, sites, and pages, with
related logic related logic
@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
''' """
import brozzler import brozzler
import base64 import base64
@ -36,15 +36,18 @@ import yaml
import zlib import zlib
from typing import Optional from typing import Optional
def load_schema(): def load_schema():
schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yaml') schema_file = os.path.join(os.path.dirname(__file__), "job_schema.yaml")
with open(schema_file) as f: with open(schema_file) as f:
return yaml.safe_load(f) return yaml.safe_load(f)
class JobValidator(cerberus.Validator): class JobValidator(cerberus.Validator):
def _validate_type_url(self, value): def _validate_type_url(self, value):
url = urllib.parse.urlparse(value) url = urllib.parse.urlparse(value)
return url.scheme in ('http', 'https', 'ftp') return url.scheme in ("http", "https", "ftp")
class InvalidJobConf(Exception): class InvalidJobConf(Exception):
def __init__(self, validator): def __init__(self, validator):
@ -53,15 +56,17 @@ class InvalidJobConf(Exception):
# Cerberus does a nice job hiding the bad value. In the case I # Cerberus does a nice job hiding the bad value. In the case I
# debugged, I found it here. Maybe there's a better way to see it. # debugged, I found it here. Maybe there's a better way to see it.
value = validator._errors[0].info[0][0].info[0][0].value value = validator._errors[0].info[0][0].info[0][0].value
self.errors['bad value'] = value self.errors["bad value"] = value
except: except:
value = None value = None
def validate_conf(job_conf, schema=load_schema()): def validate_conf(job_conf, schema=load_schema()):
v = JobValidator(schema) v = JobValidator(schema)
if not v.validate(job_conf, normalize=False): if not v.validate(job_conf, normalize=False):
raise InvalidJobConf(v) raise InvalidJobConf(v)
def merge(a, b): def merge(a, b):
if isinstance(a, dict) and isinstance(b, dict): if isinstance(a, dict) and isinstance(b, dict):
merged = dict(a) merged = dict(a)
@ -75,19 +80,22 @@ def merge(a, b):
else: else:
return a return a
def new_job_file(frontier, job_conf_file): def new_job_file(frontier, job_conf_file):
'''Returns new Job.''' """Returns new Job."""
logging.info("loading %s", job_conf_file) logging.info("loading %s", job_conf_file)
with open(job_conf_file) as f: with open(job_conf_file) as f:
job_conf = yaml.safe_load(f) job_conf = yaml.safe_load(f)
return new_job(frontier, job_conf) return new_job(frontier, job_conf)
def new_job(frontier, job_conf): def new_job(frontier, job_conf):
'''Returns new Job.''' """Returns new Job."""
validate_conf(job_conf) validate_conf(job_conf)
job = Job(frontier.rr, { job = Job(
"conf": job_conf, "status": "ACTIVE", frontier.rr,
"started": doublethink.utcnow()}) {"conf": job_conf, "status": "ACTIVE", "started": doublethink.utcnow()},
)
if "id" in job_conf: if "id" in job_conf:
job.id = job_conf["id"] job.id = job_conf["id"]
if "max_claimed_sites" in job_conf: if "max_claimed_sites" in job_conf:
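new_job() above validates the conf against job_schema.yaml before creating the Job, its sites, and their seed pages. A hedged usage sketch; the "seeds"/"url" keys and the doublethink/frontier setup are assumptions about the surrounding project, not something this diff shows:

import brozzler
import doublethink

rr = doublethink.Rethinker(servers=["localhost"], db="brozzler")
frontier = brozzler.RethinkDbFrontier(rr)

job_conf = {
    "id": "example-job",  # optional, handled explicitly above
    "max_claimed_sites": 2,  # optional, handled explicitly above
    "seeds": [{"url": "https://example.com/"}],  # assumed schema keys
}
job = brozzler.new_job(frontier, job_conf)  # raises InvalidJobConf on bad input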
@ -109,31 +117,39 @@ def new_job(frontier, job_conf):
# insert in batches to avoid this error # insert in batches to avoid this error
# rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in: # rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:
for batch in (pages[i : i + 500] for i in range(0, len(pages), 500)): for batch in (pages[i : i + 500] for i in range(0, len(pages), 500)):
logging.info('inserting batch of %s pages', len(batch)) logging.info("inserting batch of %s pages", len(batch))
result = frontier.rr.table('pages').insert(batch).run() result = frontier.rr.table("pages").insert(batch).run()
for batch in (sites[i : i + 100] for i in range(0, len(sites), 100)): for batch in (sites[i : i + 100] for i in range(0, len(sites), 100)):
logging.info('inserting batch of %s sites', len(batch)) logging.info("inserting batch of %s sites", len(batch))
result = frontier.rr.table('sites').insert(batch).run() result = frontier.rr.table("sites").insert(batch).run()
logging.info('job %s fully started', job.id) logging.info("job %s fully started", job.id)
return job return job
def new_seed_page(frontier, site): def new_seed_page(frontier, site):
url = urlcanon.parse_url(site.seed) url = urlcanon.parse_url(site.seed)
hashtag = (url.hash_sign + url.fragment).decode("utf-8") hashtag = (url.hash_sign + url.fragment).decode("utf-8")
urlcanon.canon.remove_fragment(url) urlcanon.canon.remove_fragment(url)
page = brozzler.Page(frontier.rr, { page = brozzler.Page(
frontier.rr,
{
"url": str(url), "url": str(url),
"site_id": site.get("id"), "site_id": site.get("id"),
"job_id": site.get("job_id"), "job_id": site.get("job_id"),
"hops_from_seed": 0, "hops_from_seed": 0,
"priority": 1000, "priority": 1000,
"needs_robots_check": True, "needs_robots_check": True,
"hop_path": None}) "hop_path": None,
},
)
if hashtag: if hashtag:
page.hashtags = [hashtag,] page.hashtags = [
hashtag,
]
return page return page
def new_site(frontier, site): def new_site(frontier, site):
logging.info("new site %s", site) logging.info("new site %s", site)
site.id = site.id or str(uuid.uuid4()) site.id = site.id or str(uuid.uuid4())
@ -148,9 +164,10 @@ def new_site(frontier, site):
# finally block because we want to insert the Site no matter what # finally block because we want to insert the Site no matter what
site.save() site.save()
class ElapsedMixIn(object): class ElapsedMixIn(object):
def elapsed(self): def elapsed(self):
''' """
Returns elapsed crawl time as a float in seconds. Returns elapsed crawl time as a float in seconds.
This metric includes all the time that a site was in active rotation, This metric includes all the time that a site was in active rotation,
@ -158,21 +175,22 @@ class ElapsedMixIn(object):
In contrast `Site.active_brozzling_time` only counts time when a In contrast `Site.active_brozzling_time` only counts time when a
brozzler worker claimed the site and was actively brozzling it. brozzler worker claimed the site and was actively brozzling it.
''' """
dt = 0 dt = 0
for ss in self.starts_and_stops[:-1]: for ss in self.starts_and_stops[:-1]:
if ss['stop']: if ss["stop"]:
dt += (ss['stop'] - ss['start']).total_seconds() dt += (ss["stop"] - ss["start"]).total_seconds()
else: else:
self.logger.warning("missing expected ss['stop']") self.logger.warning("missing expected ss['stop']")
dt += (doublethink.utcnow() - ss['start']).total_seconds() dt += (doublethink.utcnow() - ss["start"]).total_seconds()
ss = self.starts_and_stops[-1] ss = self.starts_and_stops[-1]
if ss['stop']: if ss["stop"]:
dt += (ss['stop'] - ss['start']).total_seconds() dt += (ss["stop"] - ss["start"]).total_seconds()
else: # crawl is active else: # crawl is active
dt += (doublethink.utcnow() - ss['start']).total_seconds() dt += (doublethink.utcnow() - ss["start"]).total_seconds()
return dt return dt
class Job(doublethink.Document, ElapsedMixIn): class Job(doublethink.Document, ElapsedMixIn):
logger = logging.getLogger(__module__ + "." + __qualname__) logger = logging.getLogger(__module__ + "." + __qualname__)
table = "jobs" table = "jobs"
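A worked example of the interval bookkeeping elapsed() above performs over starts_and_stops (simplified: the missing-'stop' warning path becomes a plain fallback to "now"; the timestamps are made up but tz-aware, like doublethink.utcnow()):

import datetime

def total_elapsed(starts_and_stops, now):
    dt = 0
    for ss in starts_and_stops:
        stop = ss["stop"] or now
        dt += (stop - ss["start"]).total_seconds()
    return dt

utc = datetime.timezone.utc
history = [
    {
        "start": datetime.datetime(2024, 2, 8, 10, 0, tzinfo=utc),
        "stop": datetime.datetime(2024, 2, 8, 10, 30, tzinfo=utc),
    },
    {"start": datetime.datetime(2024, 2, 8, 11, 0, tzinfo=utc), "stop": None},
]
now = datetime.datetime(2024, 2, 8, 11, 15, tzinfo=utc)
assert total_elapsed(history, now) == 45 * 60.0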
@ -182,28 +200,29 @@ class Job(doublethink.Document, ElapsedMixIn):
self.status = "ACTIVE" self.status = "ACTIVE"
if not "starts_and_stops" in self: if not "starts_and_stops" in self:
if self.get("started"): # backward compatibility if self.get("started"): # backward compatibility
self.starts_and_stops = [{ self.starts_and_stops = [
"start": self.get("started"), {"start": self.get("started"), "stop": self.get("finished")}
"stop": self.get("finished")}] ]
del self["started"] del self["started"]
if "finished" in self: if "finished" in self:
del self["finished"] del self["finished"]
else: else:
self.starts_and_stops = [ self.starts_and_stops = [{"start": doublethink.utcnow(), "stop": None}]
{"start":doublethink.utcnow(),"stop":None}]
def finish(self): def finish(self):
if self.status == "FINISHED" or self.starts_and_stops[-1]["stop"]: if self.status == "FINISHED" or self.starts_and_stops[-1]["stop"]:
self.logger.error( self.logger.error(
"job is already finished status=%s " "job is already finished status=%s " "starts_and_stops[-1]['stop']=%s",
"starts_and_stops[-1]['stop']=%s", self.status, self.status,
self.starts_and_stops[-1]["stop"]) self.starts_and_stops[-1]["stop"],
)
self.status = "FINISHED" self.status = "FINISHED"
self.starts_and_stops[-1]["stop"] = doublethink.utcnow() self.starts_and_stops[-1]["stop"] = doublethink.utcnow()
class Site(doublethink.Document, ElapsedMixIn): class Site(doublethink.Document, ElapsedMixIn):
logger = logging.getLogger(__module__ + "." + __qualname__) logger = logging.getLogger(__module__ + "." + __qualname__)
table = 'sites' table = "sites"
def populate_defaults(self): def populate_defaults(self):
if not "status" in self: if not "status" in self:
@ -225,26 +244,26 @@ class Site(doublethink.Document, ElapsedMixIn):
del self.scope["surt"] del self.scope["surt"]
# backward compatibility # backward compatibility
if ("max_hops_off_surt" in self.scope if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope:
and not "max_hops_off" in self.scope):
self.scope["max_hops_off"] = self.scope["max_hops_off_surt"] self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
if "max_hops_off_surt" in self.scope: if "max_hops_off_surt" in self.scope:
del self.scope["max_hops_off_surt"] del self.scope["max_hops_off_surt"]
if self.seed: if self.seed:
self._accept_ssurt_if_not_redundant( self._accept_ssurt_if_not_redundant(
brozzler.site_surt_canon(self.seed).ssurt().decode('ascii')) brozzler.site_surt_canon(self.seed).ssurt().decode("ascii")
)
if not "starts_and_stops" in self: if not "starts_and_stops" in self:
if self.get("start_time"): # backward compatibility if self.get("start_time"): # backward compatibility
self.starts_and_stops = [{ self.starts_and_stops = [
"start":self.get("start_time"),"stop":None}] {"start": self.get("start_time"), "stop": None}
]
if self.get("status") != "ACTIVE": if self.get("status") != "ACTIVE":
self.starts_and_stops[0]["stop"] = self.last_disclaimed self.starts_and_stops[0]["stop"] = self.last_disclaimed
del self["start_time"] del self["start_time"]
else: else:
self.starts_and_stops = [ self.starts_and_stops = [{"start": doublethink.utcnow(), "stop": None}]
{"start":doublethink.utcnow(),"stop":None}]
def __str__(self): def __str__(self):
return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed) return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)
@ -253,11 +272,12 @@ class Site(doublethink.Document, ElapsedMixIn):
if not "accepts" in self.scope: if not "accepts" in self.scope:
self.scope["accepts"] = [] self.scope["accepts"] = []
simple_rule_ssurts = ( simple_rule_ssurts = (
rule["ssurt"] for rule in self.scope["accepts"] rule["ssurt"]
if set(rule.keys()) == {'ssurt'}) for rule in self.scope["accepts"]
if set(rule.keys()) == {"ssurt"}
)
if not any(ssurt.startswith(ss) for ss in simple_rule_ssurts): if not any(ssurt.startswith(ss) for ss in simple_rule_ssurts):
self.logger.info( self.logger.info("adding ssurt %s to scope accept rules", ssurt)
"adding ssurt %s to scope accept rules", ssurt)
self.scope["accepts"].append({"ssurt": ssurt}) self.scope["accepts"].append({"ssurt": ssurt})
def note_seed_redirect(self, url): def note_seed_redirect(self, url):
@ -266,14 +286,14 @@ class Site(doublethink.Document, ElapsedMixIn):
# if http://foo.com/ redirects to https://foo.com/a/b/c let's also # if http://foo.com/ redirects to https://foo.com/a/b/c let's also
# put all of https://foo.com/ in scope # put all of https://foo.com/ in scope
if (canon_seed_redirect.authority == canon_seed.authority if (
and canon_seed_redirect.scheme != canon_seed.scheme): canon_seed_redirect.authority == canon_seed.authority
and canon_seed_redirect.scheme != canon_seed.scheme
):
canon_seed.scheme = canon_seed_redirect.scheme canon_seed.scheme = canon_seed_redirect.scheme
self._accept_ssurt_if_not_redundant( self._accept_ssurt_if_not_redundant(canon_seed.ssurt().decode("ascii"))
canon_seed.ssurt().decode('ascii'))
self._accept_ssurt_if_not_redundant( self._accept_ssurt_if_not_redundant(canon_seed_redirect.ssurt().decode("ascii"))
canon_seed_redirect.ssurt().decode('ascii'))
def extra_headers(self, page: Optional["Page"] = None): def extra_headers(self, page: Optional["Page"] = None):
hdrs = {} hdrs = {}
@ -281,28 +301,34 @@ class Site(doublethink.Document, ElapsedMixIn):
temp_warcprox_meta = copy.deepcopy(self.warcprox_meta) temp_warcprox_meta = copy.deepcopy(self.warcprox_meta)
if "blocks" in self.warcprox_meta: if "blocks" in self.warcprox_meta:
# delete temp_warcprox_meta's 'blocks' (they may be big!) # delete temp_warcprox_meta's 'blocks' (they may be big!)
del temp_warcprox_meta['blocks'] del temp_warcprox_meta["blocks"]
# str-ify blocks # str-ify blocks
blocks_str = json.dumps(self.warcprox_meta['blocks'], separators=(',', ':')) blocks_str = json.dumps(
self.warcprox_meta["blocks"], separators=(",", ":")
)
# encode(), compress, b64encode, decode() # encode(), compress, b64encode, decode()
temp_warcprox_meta['compressed_blocks'] = base64.b64encode(zlib.compress(blocks_str.encode())).decode() temp_warcprox_meta["compressed_blocks"] = base64.b64encode(
zlib.compress(blocks_str.encode())
).decode()
if page is not None: if page is not None:
temp_warcprox_meta["metadata"]["hop_path"] = page.hop_path temp_warcprox_meta["metadata"]["hop_path"] = page.hop_path
temp_warcprox_meta["metadata"]["brozzled_url"] = page.url temp_warcprox_meta["metadata"]["brozzled_url"] = page.url
temp_warcprox_meta["metadata"]["hop_via_url"] = page.via_page_url temp_warcprox_meta["metadata"]["hop_via_url"] = page.via_page_url
hdrs["Warcprox-Meta"] = json.dumps(temp_warcprox_meta, separators=(',', ':')) hdrs["Warcprox-Meta"] = json.dumps(
temp_warcprox_meta, separators=(",", ":")
)
return hdrs return hdrs
def accept_reject_or_neither(self, url, parent_page=None): def accept_reject_or_neither(self, url, parent_page=None):
''' """
Returns `True` (accepted), `False` (rejected), or `None` (no decision). Returns `True` (accepted), `False` (rejected), or `None` (no decision).
`None` usually means rejected, unless `max_hops_off` comes into play. `None` usually means rejected, unless `max_hops_off` comes into play.
''' """
if not isinstance(url, urlcanon.ParsedUrl): if not isinstance(url, urlcanon.ParsedUrl):
url = urlcanon.semantic(url) url = urlcanon.semantic(url)
if not url.scheme in (b'http', b'https'): if not url.scheme in (b"http", b"https"):
# XXX doesn't belong here maybe (where? worker ignores unknown # XXX doesn't belong here maybe (where? worker ignores unknown
# schemes?) # schemes?)
return False return False
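For reference, the "compressed_blocks" value assembled in extra_headers() above is just json, then zlib, then base64; a consumer reverses the same three steps. Self-contained round-trip sketch (the block rule is a made-up placeholder, not warcprox's actual rule schema):

import base64
import json
import zlib

blocks = [{"domain": "example.com"}]  # placeholder rule
blocks_str = json.dumps(blocks, separators=(",", ":"))
compressed_blocks = base64.b64encode(zlib.compress(blocks_str.encode())).decode()

# the receiving side reverses the steps:
restored = json.loads(zlib.decompress(base64.b64decode(compressed_blocks)).decode())
assert restored == blocks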
@ -311,12 +337,14 @@ class Site(doublethink.Document, ElapsedMixIn):
if parent_page: if parent_page:
try_parent_urls.append(urlcanon.semantic(parent_page.url)) try_parent_urls.append(urlcanon.semantic(parent_page.url))
if parent_page.redirect_url: if parent_page.redirect_url:
try_parent_urls.append( try_parent_urls.append(urlcanon.semantic(parent_page.redirect_url))
urlcanon.semantic(parent_page.redirect_url))
# enforce max_hops # enforce max_hops
if (parent_page and "max_hops" in self.scope if (
and parent_page.hops_from_seed >= self.scope["max_hops"]): parent_page
and "max_hops" in self.scope
and parent_page.hops_from_seed >= self.scope["max_hops"]
):
return False return False
# enforce reject rules # enforce reject rules
@ -345,6 +373,7 @@ class Site(doublethink.Document, ElapsedMixIn):
# no decision if we reach here # no decision if we reach here
return None return None
class Page(doublethink.Document): class Page(doublethink.Document):
logger = logging.getLogger(__module__ + "." + __qualname__) logger = logging.getLogger(__module__ + "." + __qualname__)
table = "pages" table = "pages"
@ -398,4 +427,3 @@ class Page(doublethink.Document):
if self._canon_hurl is None: if self._canon_hurl is None:
self._canon_hurl = urlcanon.semantic(self.url) self._canon_hurl = urlcanon.semantic(self.url)
return str(self._canon_hurl) return str(self._canon_hurl)

brozzler/pywb.py
View file

@ -1,4 +1,4 @@
''' """
brozzler/pywb.py - pywb customizations for brozzler including rethinkdb index, brozzler/pywb.py - pywb customizations for brozzler including rethinkdb index,
loading from warcs still being written to, canonicalization rules matching loading from warcs still being written to, canonicalization rules matching
brozzler conventions, support for screenshot: and thumbnail: urls brozzler conventions, support for screenshot: and thumbnail: urls
@ -16,10 +16,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
''' """
import sys import sys
import logging import logging
try: try:
import pywb.apps.cli import pywb.apps.cli
import pywb.cdx.cdxdomainspecific import pywb.cdx.cdxdomainspecific
@ -32,7 +33,9 @@ except ImportError as e:
logging.critical( logging.critical(
'%s: %s\n\nYou might need to run "pip install ' '%s: %s\n\nYou might need to run "pip install '
'brozzler[easy]".\nSee README.rst for more information.', 'brozzler[easy]".\nSee README.rst for more information.',
type(e).__name__, e) type(e).__name__,
e,
)
sys.exit(1) sys.exit(1)
import doublethink import doublethink
import rethinkdb as rdb import rethinkdb as rdb
@ -43,6 +46,7 @@ import argparse
r = rdb.RethinkDB() r = rdb.RethinkDB()
class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource): class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
def __init__(self, servers, db, table): def __init__(self, servers, db, table):
self.servers = servers self.servers = servers
@ -67,70 +71,78 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
# XXX inefficient, it gets parsed later, figure out how to # XXX inefficient, it gets parsed later, figure out how to
# short-circuit this step and create the CDXObject directly # short-circuit this step and create the CDXObject directly
blob = { blob = {
'url': record['url'], "url": record["url"],
'status': str(record['response_code']), "status": str(record["response_code"]),
'digest': record['sha1base32'], "digest": record["sha1base32"],
'length': str(record.get('record_length', '-')), "length": str(record.get("record_length", "-")),
'offset': str(record['offset']), "offset": str(record["offset"]),
'filename': record['filename'], "filename": record["filename"],
} }
if record['warc_type'] != 'revisit': if record["warc_type"] != "revisit":
blob['mime'] = record['content_type'] or '-' blob["mime"] = record["content_type"] or "-"
else: else:
blob['mime'] = 'warc/revisit' blob["mime"] = "warc/revisit"
# b'org,archive)/ 20160427215530 {"url": "https://archive.org/", "mime": "text/html", "status": "200", "digest": "VILUFXZD232SLUA6XROZQIMEVUPW6EIE", "length": "16001", "offset": "90144", "filename": "ARCHIVEIT-261-ONE_TIME-JOB209607-20160427215508135-00000.warc.gz"}' # b'org,archive)/ 20160427215530 {"url": "https://archive.org/", "mime": "text/html", "status": "200", "digest": "VILUFXZD232SLUA6XROZQIMEVUPW6EIE", "length": "16001", "offset": "90144", "filename": "ARCHIVEIT-261-ONE_TIME-JOB209607-20160427215508135-00000.warc.gz"}'
cdx_line = '{} {:%Y%m%d%H%M%S} {}'.format( cdx_line = "{} {:%Y%m%d%H%M%S} {}".format(
record['canon_surt'], record['timestamp'], record["canon_surt"], record["timestamp"], json.dumps(blob)
json.dumps(blob)) )
yield cdx_line.encode('utf-8') yield cdx_line.encode("utf-8")
def _query_rethinkdb(self, cdx_query): def _query_rethinkdb(self, cdx_query):
start_key = cdx_query.key.decode('utf-8') start_key = cdx_query.key.decode("utf-8")
end_key = cdx_query.end_key.decode('utf-8') end_key = cdx_query.end_key.decode("utf-8")
reql = self.rr.table(self.table).between( reql = self.rr.table(self.table).between(
[start_key[:150], r.minval], [end_key[:150], r.maxval], [start_key[:150], r.minval],
index='abbr_canon_surt_timestamp', right_bound='closed') [end_key[:150], r.maxval],
reql = reql.order_by(index='abbr_canon_surt_timestamp') index="abbr_canon_surt_timestamp",
right_bound="closed",
)
reql = reql.order_by(index="abbr_canon_surt_timestamp")
# TODO support for POST, etc # TODO support for POST, etc
# http_method='WARCPROX_WRITE_RECORD' for screenshots, thumbnails # http_method='WARCPROX_WRITE_RECORD' for screenshots, thumbnails
reql = reql.filter( reql = reql.filter(
lambda capture: r.expr( lambda capture: r.expr(["WARCPROX_WRITE_RECORD", "GET"]).contains(
['WARCPROX_WRITE_RECORD','GET']).contains( capture["http_method"]
capture['http_method'])) )
)
reql = reql.filter( reql = reql.filter(
lambda capture: (capture['canon_surt'] >= start_key) lambda capture: (capture["canon_surt"] >= start_key)
& (capture['canon_surt'] < end_key)) & (capture["canon_surt"] < end_key)
)
if cdx_query.limit: if cdx_query.limit:
reql = reql.limit(cdx_query.limit) reql = reql.limit(cdx_query.limit)
logging.debug('rethinkdb query: %s', reql) logging.debug("rethinkdb query: %s", reql)
results = reql.run() results = reql.run()
return results return results
class TheGoodUrlCanonicalizer(object): class TheGoodUrlCanonicalizer(object):
''' """
Replacement for pywb.utils.canonicalize.UrlCanonicalizer that produces Replacement for pywb.utils.canonicalize.UrlCanonicalizer that produces
surts with scheme and with trailing comma, and does not "massage" surts with scheme and with trailing comma, and does not "massage"
www.foo.org into foo.org. www.foo.org into foo.org.
''' """
def __init__(self, surt_ordered=True): def __init__(self, surt_ordered=True):
'''We are always surt ordered (surt_ordered param is ignored)''' """We are always surt ordered (surt_ordered param is ignored)"""
self.surt_ordered = True self.surt_ordered = True
def __call__(self, url): def __call__(self, url):
try: try:
key = urlcanon.semantic(url).surt().decode('ascii') key = urlcanon.semantic(url).surt().decode("ascii")
# logging.debug('%s -> %s', url, key) # logging.debug('%s -> %s', url, key)
return key return key
except Exception as e: except Exception as e:
return url return url
def replace_default_canonicalizer(): def replace_default_canonicalizer():
'''Replace parent class of CustomUrlCanonicalizer with this class.''' """Replace parent class of CustomUrlCanonicalizer with this class."""
pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer.__bases__ = ( pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer.__bases__ = (
TheGoodUrlCanonicalizer,) TheGoodUrlCanonicalizer,
)
def good_surts_from_default(default_surt): def good_surts_from_default(default_surt):
''' """
Takes a standard surt without scheme and without trailing comma, and Takes a standard surt without scheme and without trailing comma, and
returns a list of "good" surts that together match the same set of returns a list of "good" surts that together match the same set of
urls. For example: urls. For example:
@ -144,59 +156,64 @@ class TheGoodUrlCanonicalizer(object):
'http://(com,example,www,)/path', 'http://(com,example,www,)/path',
'https://(com,example,www,)/path'] 'https://(com,example,www,)/path']
''' """
if default_surt == '': if default_surt == "":
return [''] return [""]
parts = default_surt.split(')', 1) parts = default_surt.split(")", 1)
if len(parts) == 2: if len(parts) == 2:
orig_host_part, path_part = parts orig_host_part, path_part = parts
good_surts = [ good_surts = [
'http://(%s,)%s' % (orig_host_part, path_part), "http://(%s,)%s" % (orig_host_part, path_part),
'https://(%s,)%s' % (orig_host_part, path_part), "https://(%s,)%s" % (orig_host_part, path_part),
'http://(%s,www,)%s' % (orig_host_part, path_part), "http://(%s,www,)%s" % (orig_host_part, path_part),
'https://(%s,www,)%s' % (orig_host_part, path_part), "https://(%s,www,)%s" % (orig_host_part, path_part),
] ]
else: # no path part else: # no path part
host_part = parts[0] host_part = parts[0]
good_surts = [ good_surts = [
'http://(%s' % host_part, "http://(%s" % host_part,
'https://(%s' % host_part, "https://(%s" % host_part,
] ]
return good_surts return good_surts
def monkey_patch_dsrules_init(): def monkey_patch_dsrules_init():
orig_init = pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__ orig_init = pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__
def cdx_dsrule_init(self, url_prefix, rules): def cdx_dsrule_init(self, url_prefix, rules):
good_surts = [] good_surts = []
url_prefixes = [url_prefix] if isinstance( url_prefixes = [url_prefix] if isinstance(url_prefix, str) else url_prefix
url_prefix, str) else url_prefix
for bad_surt in url_prefixes: for bad_surt in url_prefixes:
good_surts.extend( good_surts.extend(
TheGoodUrlCanonicalizer.good_surts_from_default( TheGoodUrlCanonicalizer.good_surts_from_default(bad_surt)
bad_surt)) )
if 'match' in rules and 'regex' in rules['match']: if "match" in rules and "regex" in rules["match"]:
rules['match']['regex'] = r'https?://\(' + rules['match']['regex'] rules["match"]["regex"] = r"https?://\(" + rules["match"]["regex"]
orig_init(self, good_surts, rules) orig_init(self, good_surts, rules)
pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__ = cdx_dsrule_init pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__ = cdx_dsrule_init
def support_in_progress_warcs(): def support_in_progress_warcs():
''' """
Monkey-patch pywb.warc.pathresolvers.PrefixResolver to include warcs still Monkey-patch pywb.warc.pathresolvers.PrefixResolver to include warcs still
being written to (warcs having ".open" suffix). This way if a cdx entry being written to (warcs having ".open" suffix). This way if a cdx entry
references foo.warc.gz, pywb will try both foo.warc.gz and references foo.warc.gz, pywb will try both foo.warc.gz and
foo.warc.gz.open. foo.warc.gz.open.
''' """
_orig_prefix_resolver_call = pywb.warc.pathresolvers.PrefixResolver.__call__ _orig_prefix_resolver_call = pywb.warc.pathresolvers.PrefixResolver.__call__
def _prefix_resolver_call(self, filename, cdx=None): def _prefix_resolver_call(self, filename, cdx=None):
raw_results = _orig_prefix_resolver_call(self, filename, cdx) raw_results = _orig_prefix_resolver_call(self, filename, cdx)
results = [] results = []
for warc_path in raw_results: for warc_path in raw_results:
results.append(warc_path) results.append(warc_path)
results.append('%s.open' % warc_path) results.append("%s.open" % warc_path)
return results return results
pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call
class SomeWbUrl(pywb.rewrite.wburl.WbUrl): class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
def __init__(self, orig_url): def __init__(self, orig_url):
import re import re
@ -211,14 +228,14 @@ class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
pywb.rewrite.wburl.BaseWbUrl.__init__(self) pywb.rewrite.wburl.BaseWbUrl.__init__(self)
if six.PY2 and isinstance(orig_url, six.text_type): if six.PY2 and isinstance(orig_url, six.text_type):
orig_url = orig_url.encode('utf-8') orig_url = orig_url.encode("utf-8")
orig_url = quote(orig_url) orig_url = quote(orig_url)
self._original_url = orig_url self._original_url = orig_url
if not self._init_query(orig_url): if not self._init_query(orig_url):
if not self._init_replay(orig_url): if not self._init_replay(orig_url):
raise Exception('Invalid WbUrl: ', orig_url) raise Exception("Invalid WbUrl: ", orig_url)
new_uri = WbUrl.to_uri(self.url) new_uri = WbUrl.to_uri(self.url)
@ -227,8 +244,11 @@ class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
self.url = new_uri self.url = new_uri
# begin brozzler changes # begin brozzler changes
if (self.url.startswith('urn:') or self.url.startswith('screenshot:') if (
or self.url.startswith('thumbnail:')): self.url.startswith("urn:")
or self.url.startswith("screenshot:")
or self.url.startswith("thumbnail:")
):
return return
# end brozzler changes # end brozzler changes
@ -253,27 +273,31 @@ class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
self.url = self.DEFAULT_SCHEME + self.url self.url = self.DEFAULT_SCHEME + self.url
else: else:
inx += 2 inx += 2
if inx < len(self.url) and self.url[inx] != '/': if inx < len(self.url) and self.url[inx] != "/":
self.url = self.url[:inx] + '/' + self.url[inx:] self.url = self.url[:inx] + "/" + self.url[inx:]
def _get_wburl_type(self): def _get_wburl_type(self):
return SomeWbUrl return SomeWbUrl
def monkey_patch_wburl(): def monkey_patch_wburl():
pywb.framework.basehandlers.WbUrlHandler.get_wburl_type = _get_wburl_type pywb.framework.basehandlers.WbUrlHandler.get_wburl_type = _get_wburl_type
class BrozzlerWaybackCli(pywb.apps.cli.WaybackCli): class BrozzlerWaybackCli(pywb.apps.cli.WaybackCli):
def _extend_parser(self, arg_parser): def _extend_parser(self, arg_parser):
super()._extend_parser(arg_parser) super()._extend_parser(arg_parser)
arg_parser._actions[4].help = argparse.SUPPRESS # --autoindex arg_parser._actions[4].help = argparse.SUPPRESS # --autoindex
arg_parser.formatter_class = argparse.RawDescriptionHelpFormatter arg_parser.formatter_class = argparse.RawDescriptionHelpFormatter
arg_parser.epilog = ''' arg_parser.epilog = """
Run pywb like so: Run pywb like so:
$ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback $ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback
See README.rst for more information. See README.rst for more information.
''' """
# copied and pasted from cdxdomainspecific.py, only changes are commented as # copied and pasted from cdxdomainspecific.py, only changes are commented as
# such below # such below
@ -284,7 +308,7 @@ def _fuzzy_query_call(self, query):
matched_rule = None matched_rule = None
urlkey = to_native_str(query.key, 'utf-8') urlkey = to_native_str(query.key, "utf-8")
url = query.url url = query.url
filter_ = query.filters filter_ = query.filters
output = query.output output = query.output
@ -306,7 +330,7 @@ def _fuzzy_query_call(self, query):
if not matched_rule: if not matched_rule:
return None return None
repl = '?' repl = "?"
if matched_rule.replace: if matched_rule.replace:
repl = matched_rule.replace repl = matched_rule.replace
@ -315,33 +339,33 @@ def _fuzzy_query_call(self, query):
url = url[: inx + len(repl)] url = url[: inx + len(repl)]
# begin brozzler changes # begin brozzler changes
if matched_rule.match_type == 'domain': if matched_rule.match_type == "domain":
orig_split_url = urlsplit(url) orig_split_url = urlsplit(url)
# remove the subdomain, path, query and fragment # remove the subdomain, path, query and fragment
host = orig_split_url.netloc.split('.', 1)[1] host = orig_split_url.netloc.split(".", 1)[1]
new_split_url = (orig_split_url.scheme, host, '', '', '') new_split_url = (orig_split_url.scheme, host, "", "", "")
url = urlunsplit(new_split_url) url = urlunsplit(new_split_url)
# end brozzler changes # end brozzler changes
params = query.params params = query.params
params.update({'url': url, params.update({"url": url, "matchType": matched_rule.match_type, "filter": filter_})
'matchType': matched_rule.match_type,
'filter': filter_})
if 'reverse' in params: if "reverse" in params:
del params['reverse'] del params["reverse"]
if 'closest' in params: if "closest" in params:
del params['closest'] del params["closest"]
if 'end_key' in params: if "end_key" in params:
del params['end_key'] del params["end_key"]
return params return params
def monkey_patch_fuzzy_query(): def monkey_patch_fuzzy_query():
pywb.cdx.cdxdomainspecific.FuzzyQuery.__call__ = _fuzzy_query_call pywb.cdx.cdxdomainspecific.FuzzyQuery.__call__ = _fuzzy_query_call
# copied and pasted from pywb/utils/canonicalize.py, only changes are commented # copied and pasted from pywb/utils/canonicalize.py, only changes are commented
# as such # as such
def _calc_search_range(url, match_type, surt_ordered=True, url_canon=None): def _calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
@ -361,54 +385,56 @@ def _calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
start_key = url_canon(url) start_key = url_canon(url)
if match_type == 'exact': if match_type == "exact":
end_key = start_key + '!' end_key = start_key + "!"
elif match_type == 'prefix': elif match_type == "prefix":
# add trailing slash if url has it # add trailing slash if url has it
if url.endswith('/') and not start_key.endswith('/'): if url.endswith("/") and not start_key.endswith("/"):
start_key += '/' start_key += "/"
end_key = inc_last_char(start_key) end_key = inc_last_char(start_key)
elif match_type == 'host': elif match_type == "host":
if surt_ordered: if surt_ordered:
host = start_key.split(')/')[0] host = start_key.split(")/")[0]
start_key = host + ')/' start_key = host + ")/"
end_key = host + '*' end_key = host + "*"
else: else:
host = urlparse.urlsplit(url).netloc host = urlparse.urlsplit(url).netloc
start_key = host + '/' start_key = host + "/"
end_key = host + '0' end_key = host + "0"
elif match_type == 'domain': elif match_type == "domain":
if not surt_ordered: if not surt_ordered:
msg = 'matchType=domain unsupported for non-surt' msg = "matchType=domain unsupported for non-surt"
raise UrlCanonicalizeException(msg) raise UrlCanonicalizeException(msg)
host = start_key.split(')/')[0] host = start_key.split(")/")[0]
# if tld, use com, as start_key # if tld, use com, as start_key
# otherwise, stick with com,example)/ # otherwise, stick with com,example)/
if ',' not in host: if "," not in host:
start_key = host + ',' start_key = host + ","
else: else:
start_key = host + ')/' start_key = host + ")/"
# begin brozzler changes # begin brozzler changes
end_key = host + '~' end_key = host + "~"
# end brozzler changes # end brozzler changes
else: else:
raise UrlCanonicalizeException('Invalid match_type: ' + match_type) raise UrlCanonicalizeException("Invalid match_type: " + match_type)
return (start_key, end_key) return (start_key, end_key)
def monkey_patch_calc_search_range(): def monkey_patch_calc_search_range():
pywb.utils.canonicalize.calc_search_range = _calc_search_range pywb.utils.canonicalize.calc_search_range = _calc_search_range
pywb.cdx.query.calc_search_range = _calc_search_range pywb.cdx.query.calc_search_range = _calc_search_range
def main(argv=sys.argv): def main(argv=sys.argv):
brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer() brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init() brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init()
@ -417,7 +443,10 @@ def main(argv=sys.argv):
brozzler.pywb.monkey_patch_fuzzy_query() brozzler.pywb.monkey_patch_fuzzy_query()
brozzler.pywb.monkey_patch_calc_search_range() brozzler.pywb.monkey_patch_calc_search_range()
wayback_cli = BrozzlerWaybackCli( wayback_cli = BrozzlerWaybackCli(
args=argv[1:], default_port=8880, args=argv[1:],
desc=('brozzler-wayback - pywb wayback (monkey-patched for use ' default_port=8880,
'with brozzler)')) desc=(
"brozzler-wayback - pywb wayback (monkey-patched for use " "with brozzler)"
),
)
wayback_cli.run() wayback_cli.run()

brozzler/robots.py
View file

@ -1,4 +1,4 @@
''' """
brozzler/robots.py - robots.txt support brozzler/robots.py - robots.txt support
Uses the reppy library version 0.3.4. Monkey-patches reppy to support substring Uses the reppy library version 0.3.4. Monkey-patches reppy to support substring
@ -20,7 +20,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
''' """
import json import json
import logging import logging
@ -34,30 +34,40 @@ __all__ = ["is_permitted_by_robots"]
# monkey-patch reppy to do substring user-agent matching, see top of file # monkey-patch reppy to do substring user-agent matching, see top of file
reppy.Utility.short_user_agent = lambda strng: strng reppy.Utility.short_user_agent = lambda strng: strng
def _reppy_rules_getitem(self, agent): def _reppy_rules_getitem(self, agent):
''' """
Find the user-agent token matching the supplied full user-agent, using Find the user-agent token matching the supplied full user-agent, using
a case-insensitive substring search. a case-insensitive substring search.
''' """
lc_agent = agent.lower() lc_agent = agent.lower()
for s in self.agents: for s in self.agents:
if s in lc_agent: if s in lc_agent:
return self.agents[s] return self.agents[s]
return self.agents.get('*') return self.agents.get("*")
reppy.parser.Rules.__getitem__ = _reppy_rules_getitem reppy.parser.Rules.__getitem__ = _reppy_rules_getitem
class _SessionRaiseOn420(requests.Session): class _SessionRaiseOn420(requests.Session):
timeout = 60 timeout = 60
def get(self, url, *args, **kwargs): def get(self, url, *args, **kwargs):
res = super().get(url, timeout=self.timeout, *args, **kwargs) res = super().get(url, timeout=self.timeout, *args, **kwargs)
if res.status_code == 420 and 'warcprox-meta' in res.headers: if res.status_code == 420 and "warcprox-meta" in res.headers:
raise brozzler.ReachedLimit( raise brozzler.ReachedLimit(
warcprox_meta=json.loads(res.headers['warcprox-meta']), warcprox_meta=json.loads(res.headers["warcprox-meta"]),
http_payload=res.text) http_payload=res.text,
)
else: else:
return res return res
_robots_caches = {} # {site_id:reppy.cache.RobotsCache} _robots_caches = {} # {site_id:reppy.cache.RobotsCache}
def _robots_cache(site, proxy=None): def _robots_cache(site, proxy=None):
if not site.id in _robots_caches: if not site.id in _robots_caches:
req_sesh = _SessionRaiseOn420() req_sesh = _SessionRaiseOn420()
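To make the patched Rules.__getitem__ above concrete: it walks the agent tokens parsed from robots.txt and returns the first one that appears, case-insensitively, as a substring of the full user-agent, falling back to "*". A plain-dict illustration (the rule values stand in for reppy's per-agent objects):

agents = {"brozzler": "rules-for-brozzler", "*": "default-rules"}

def rules_for(agents, full_user_agent):
    lc_agent = full_user_agent.lower()
    for token in agents:
        if token in lc_agent:
            return agents[token]
    return agents.get("*")

assert rules_for(agents, "Mozilla/5.0 (compatible; Brozzler/1.5)") == "rules-for-brozzler"
assert rules_for(agents, "SomeOtherBot/2.0") == "default-rules"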
@ -68,14 +78,16 @@ def _robots_cache(site, proxy=None):
if site.extra_headers(): if site.extra_headers():
req_sesh.headers.update(site.extra_headers()) req_sesh.headers.update(site.extra_headers())
if site.user_agent: if site.user_agent:
req_sesh.headers['User-Agent'] = site.user_agent req_sesh.headers["User-Agent"] = site.user_agent
_robots_caches[site.id] = reppy.cache.RobotsCache( _robots_caches[site.id] = reppy.cache.RobotsCache(
session=req_sesh, disallow_forbidden=False) session=req_sesh, disallow_forbidden=False
)
return _robots_caches[site.id] return _robots_caches[site.id]
def is_permitted_by_robots(site, url, proxy=None): def is_permitted_by_robots(site, url, proxy=None):
''' """
Checks if `url` is permitted by robots.txt. Checks if `url` is permitted by robots.txt.
Treats any kind of error fetching robots.txt as "allow all". See Treats any kind of error fetching robots.txt as "allow all". See
@ -89,25 +101,28 @@ def is_permitted_by_robots(site, url, proxy=None):
Raises: Raises:
brozzler.ReachedLimit: if warcprox responded with 420 Reached Limit brozzler.ReachedLimit: if warcprox responded with 420 Reached Limit
requests.exceptions.ProxyError: if the proxy is down requests.exceptions.ProxyError: if the proxy is down
''' """
if site.ignore_robots: if site.ignore_robots:
return True return True
try: try:
result = _robots_cache(site, proxy).allowed( result = _robots_cache(site, proxy).allowed(url, site.user_agent or "brozzler")
url, site.user_agent or "brozzler")
return result return result
except Exception as e: except Exception as e:
if isinstance(e, reppy.exceptions.ServerError) and isinstance( if isinstance(e, reppy.exceptions.ServerError) and isinstance(
e.args[0], brozzler.ReachedLimit): e.args[0], brozzler.ReachedLimit
):
raise e.args[0] raise e.args[0]
elif hasattr(e, 'args') and isinstance( elif hasattr(e, "args") and isinstance(
e.args[0], requests.exceptions.ProxyError): e.args[0], requests.exceptions.ProxyError
):
# reppy has wrapped an exception that we want to bubble up # reppy has wrapped an exception that we want to bubble up
raise brozzler.ProxyError(e) raise brozzler.ProxyError(e)
else: else:
logging.warning( logging.warning(
"returning true (permitted) after problem fetching " "returning true (permitted) after problem fetching "
"robots.txt for %r: %r", url, e) "robots.txt for %r: %r",
url,
e,
)
return True return True
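A small caller-side sketch of the error semantics documented above: robots.txt fetch problems default to permitted, while warcprox 420s and proxy outages surface as brozzler exceptions. The handling below is illustrative, not brozzler's actual worker policy:

import brozzler

def robots_precheck(site, url):
    try:
        return brozzler.is_permitted_by_robots(site, url)
    except brozzler.ReachedLimit:
        raise  # warcprox reported a crawl limit; let the frontier finish the site
    except brozzler.ProxyError:
        return False  # proxy is down; caller may want to retry later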

brozzler/worker.py
View file

@ -1,4 +1,4 @@
''' """
brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning
it runs yt-dlp on them, browses them and runs behaviors if appropriate, it runs yt-dlp on them, browses them and runs behaviors if appropriate,
scopes and adds outlinks to the frontier scopes and adds outlinks to the frontier
@ -16,7 +16,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
''' """
import logging import logging
import brozzler import brozzler
@ -39,6 +39,7 @@ from . import ydl
r = rdb.RethinkDB() r = rdb.RethinkDB()
class BrozzlerWorker: class BrozzlerWorker:
logger = logging.getLogger(__module__ + "." + __qualname__) logger = logging.getLogger(__module__ + "." + __qualname__)
@ -50,13 +51,26 @@ class BrozzlerWorker:
SITE_SESSION_MINUTES = 15 SITE_SESSION_MINUTES = 15
def __init__( def __init__(
self, frontier, service_registry=None, max_browsers=1, self,
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None, frontier,
skip_extract_outlinks=False, skip_visit_hashtags=False, service_registry=None,
skip_youtube_dl=False, simpler404=False, screenshot_full_page=False, max_browsers=1,
page_timeout=300, behavior_timeout=900, extract_outlinks_timeout=60, chrome_exe="chromium-browser",
download_throughput=-1, stealth=False, warcprox_auto=False,
window_height=900, window_width=1400): proxy=None,
skip_extract_outlinks=False,
skip_visit_hashtags=False,
skip_youtube_dl=False,
simpler404=False,
screenshot_full_page=False,
page_timeout=300,
behavior_timeout=900,
extract_outlinks_timeout=60,
download_throughput=-1,
stealth=False,
window_height=900,
window_width=1400,
):
self._frontier = frontier self._frontier = frontier
self._service_registry = service_registry self._service_registry = service_registry
self._max_browsers = max_browsers self._max_browsers = max_browsers
@ -79,7 +93,8 @@ class BrozzlerWorker:
self._stealth = stealth self._stealth = stealth
self._browser_pool = brozzler.browser.BrowserPool( self._browser_pool = brozzler.browser.BrowserPool(
max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True) max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True
)
self._browsing_threads = set() self._browsing_threads = set()
self._browsing_threads_lock = threading.Lock() self._browsing_threads_lock = threading.Lock()
@ -88,13 +103,20 @@ class BrozzlerWorker:
self._shutdown = threading.Event() self._shutdown = threading.Event()
def _choose_warcprox(self): def _choose_warcprox(self):
warcproxes = self._service_registry.available_services('warcprox') warcproxes = self._service_registry.available_services("warcprox")
if not warcproxes: if not warcproxes:
return None return None
# .group('proxy').count() makes this query about 99% more efficient # .group('proxy').count() makes this query about 99% more efficient
reql = self._frontier.rr.table('sites').between( reql = (
['ACTIVE', r.minval], ['ACTIVE', r.maxval], self._frontier.rr.table("sites")
index='sites_last_disclaimed').group('proxy').count() .between(
["ACTIVE", r.minval],
["ACTIVE", r.maxval],
index="sites_last_disclaimed",
)
.group("proxy")
.count()
)
# returns results like # returns results like
# { # {
# "wbgrp-svc030.us.archive.org:8000": 148, # "wbgrp-svc030.us.archive.org:8000": 148,
@ -102,10 +124,11 @@ class BrozzlerWorker:
# } # }
proxy_scoreboard = dict(reql.run()) proxy_scoreboard = dict(reql.run())
for warcprox in warcproxes: for warcprox in warcproxes:
address = '%s:%s' % (warcprox['host'], warcprox['port']) address = "%s:%s" % (warcprox["host"], warcprox["port"])
warcprox['assigned_sites'] = proxy_scoreboard.get(address, 0) warcprox["assigned_sites"] = proxy_scoreboard.get(address, 0)
warcproxes.sort(key=lambda warcprox: ( warcproxes.sort(
warcprox['assigned_sites'], warcprox['load'])) key=lambda warcprox: (warcprox["assigned_sites"], warcprox["load"])
)
# XXX make this heuristic more advanced? # XXX make this heuristic more advanced?
return warcproxes[0] return warcproxes[0]
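A self-contained sketch of the selection heuristic above, using invented service-registry entries and a scoreboard shaped like the ReQL result in the comment:

# fewest assigned sites wins; the warcprox-reported load breaks ties
warcproxes = [
    {"host": "wbgrp-svc030.us.archive.org", "port": 8000, "load": 0.7},
    {"host": "wbgrp-svc031.us.archive.org", "port": 8000, "load": 0.2},
]
proxy_scoreboard = {
    "wbgrp-svc030.us.archive.org:8000": 148,
    "wbgrp-svc031.us.archive.org:8000": 145,
}
for warcprox in warcproxes:
    address = "%s:%s" % (warcprox["host"], warcprox["port"])
    warcprox["assigned_sites"] = proxy_scoreboard.get(address, 0)
warcproxes.sort(key=lambda w: (w["assigned_sites"], w["load"]))
print("would choose", warcproxes[0]["host"])  # svc031: fewer assigned sites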
@ -118,13 +141,15 @@ class BrozzlerWorker:
svc = self._choose_warcprox() svc = self._choose_warcprox()
if svc is None: if svc is None:
raise brozzler.ProxyError( raise brozzler.ProxyError(
'no available instances of warcprox in the service ' "no available instances of warcprox in the service " "registry"
'registry') )
site.proxy = '%s:%s' % (svc['host'], svc['port']) site.proxy = "%s:%s" % (svc["host"], svc["port"])
site.save() site.save()
self.logger.info( self.logger.info(
'chose warcprox instance %r from service registry for %r', "chose warcprox instance %r from service registry for %r",
site.proxy, site) site.proxy,
site,
)
return site.proxy return site.proxy
return None return None
@ -132,14 +157,16 @@ class BrozzlerWorker:
if self._proxy: if self._proxy:
if self._proxy_is_warcprox is None: if self._proxy_is_warcprox is None:
try: try:
response = requests.get('http://%s/status' % self._proxy) response = requests.get("http://%s/status" % self._proxy)
status = json.loads(response.text) status = json.loads(response.text)
self._proxy_is_warcprox = (status['role'] == 'warcprox') self._proxy_is_warcprox = status["role"] == "warcprox"
except Exception as e: except Exception as e:
self._proxy_is_warcprox = False self._proxy_is_warcprox = False
logging.info( logging.info(
'%s %s warcprox', self._proxy, "%s %s warcprox",
'IS' if self._proxy_is_warcprox else 'IS NOT') self._proxy,
"IS" if self._proxy_is_warcprox else "IS NOT",
)
return self._proxy_is_warcprox return self._proxy_is_warcprox
else: else:
# I should have commented when I originally wrote this code, but I # I should have commented when I originally wrote this code, but I
@ -148,13 +175,20 @@ class BrozzlerWorker:
return bool(site.proxy or self._warcprox_auto) return bool(site.proxy or self._warcprox_auto)
def _warcprox_write_record( def _warcprox_write_record(
self, warcprox_address, url, warc_type, content_type, self,
payload, extra_headers=None): warcprox_address,
url,
warc_type,
content_type,
payload,
extra_headers=None,
):
headers = {"Content-Type": content_type, "WARC-Type": warc_type, "Host": "N/A"} headers = {"Content-Type": content_type, "WARC-Type": warc_type, "Host": "N/A"}
if extra_headers: if extra_headers:
headers.update(extra_headers) headers.update(extra_headers)
request = urllib.request.Request(url, method="WARCPROX_WRITE_RECORD", request = urllib.request.Request(
headers=headers, data=payload) url, method="WARCPROX_WRITE_RECORD", headers=headers, data=payload
)
# XXX setting request.type="http" is a hack to stop urllib from trying # XXX setting request.type="http" is a hack to stop urllib from trying
# to tunnel if url is https # to tunnel if url is https
@ -166,25 +200,30 @@ class BrozzlerWorker:
if response.getcode() != 204: if response.getcode() != 204:
self.logger.warning( self.logger.warning(
'got "%s %s" response on warcprox ' 'got "%s %s" response on warcprox '
'WARCPROX_WRITE_RECORD request (expected 204)', "WARCPROX_WRITE_RECORD request (expected 204)",
response.getcode(), response.reason) response.getcode(),
response.reason,
)
return request, response return request, response
except urllib.error.HTTPError as e: except urllib.error.HTTPError as e:
self.logger.warning( self.logger.warning(
'got "%s %s" response on warcprox ' 'got "%s %s" response on warcprox '
'WARCPROX_WRITE_RECORD request (expected 204)', "WARCPROX_WRITE_RECORD request (expected 204)",
e.getcode(), e.info()) e.getcode(),
e.info(),
)
return request, None return request, None
except urllib.error.URLError as e: except urllib.error.URLError as e:
raise brozzler.ProxyError( raise brozzler.ProxyError(
'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e "proxy error on WARCPROX_WRITE_RECORD %s" % url
) from e
except ConnectionError as e: except ConnectionError as e:
raise brozzler.ProxyError( raise brozzler.ProxyError(
'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e "proxy error on WARCPROX_WRITE_RECORD %s" % url
) from e
def thumb_jpeg(self, full_jpeg): def thumb_jpeg(self, full_jpeg):
"""Create JPEG thumbnail. """Create JPEG thumbnail."""
"""
img = PIL.Image.open(io.BytesIO(full_jpeg)) img = PIL.Image.open(io.BytesIO(full_jpeg))
thumb_width = 300 thumb_width = 300
thumb_height = (thumb_width / img.size[0]) * img.size[1] thumb_height = (thumb_width / img.size[0]) * img.size[1]
@ -193,8 +232,15 @@ class BrozzlerWorker:
img.save(out, "jpeg", quality=95) img.save(out, "jpeg", quality=95)
return out.getbuffer() return out.getbuffer()
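For illustration, the same scaling thumb_jpeg applies, run on a synthetic image (Pillow only; 1400x900 matches the default browser window size above):

import io
import PIL.Image

img = PIL.Image.new("RGB", (1400, 900), "white")  # stand-in for a screenshot
thumb_width = 300
thumb_height = int((thumb_width / img.size[0]) * img.size[1])  # 192 here
img.thumbnail((thumb_width, thumb_height))
out = io.BytesIO()
img.save(out, "jpeg", quality=95)
print(img.size, len(out.getbuffer()), "bytes")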
def brozzle_page(self, browser, site, page, on_screenshot=None, def brozzle_page(
on_request=None, enable_youtube_dl=True): self,
browser,
site,
page,
on_screenshot=None,
on_request=None,
enable_youtube_dl=True,
):
self.logger.info("brozzling {}".format(page)) self.logger.info("brozzling {}".format(page))
ydl_fetches = None ydl_fetches = None
outlinks = set() outlinks = set()
@ -208,31 +254,38 @@ class BrozzlerWorker:
except brozzler.ProxyError: except brozzler.ProxyError:
raise raise
except Exception as e: except Exception as e:
if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2 if (
and hasattr(e.exc_info[1], 'code') hasattr(e, "exc_info")
and e.exc_info[1].code == 430): and len(e.exc_info) >= 2
and hasattr(e.exc_info[1], "code")
and e.exc_info[1].code == 430
):
self.logger.info( self.logger.info(
'youtube-dl got %s %s processing %s', "youtube-dl got %s %s processing %s",
e.exc_info[1].code, e.exc_info[1].msg, page.url) e.exc_info[1].code,
e.exc_info[1].msg,
page.url,
)
else: else:
self.logger.error( self.logger.error(
'youtube_dl raised exception on %s', page, "youtube_dl raised exception on %s", page, exc_info=True
exc_info=True) )
if self._needs_browsing(page, ydl_fetches): if self._needs_browsing(page, ydl_fetches):
self.logger.info('needs browsing: %s', page) self.logger.info("needs browsing: %s", page)
try: try:
browser_outlinks = self._browse_page( browser_outlinks = self._browse_page(
browser, site, page, on_screenshot, on_request) browser, site, page, on_screenshot, on_request
)
outlinks.update(browser_outlinks) outlinks.update(browser_outlinks)
except brozzler.PageInterstitialShown: except brozzler.PageInterstitialShown:
self.logger.info('page interstitial shown (http auth): %s', page) self.logger.info("page interstitial shown (http auth): %s", page)
else: else:
if not self._already_fetched(page, ydl_fetches): if not self._already_fetched(page, ydl_fetches):
self.logger.info('needs fetch: %s', page) self.logger.info("needs fetch: %s", page)
self._fetch_url(site, page=page) self._fetch_url(site, page=page)
else: else:
self.logger.info('already fetched: %s', page) self.logger.info("already fetched: %s", page)
return outlinks return outlinks
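To make the control flow easier to follow, here is a condensed, non-authoritative sketch of what brozzle_page does once the logging and exception plumbing is stripped away (method names are the ones in this file; callbacks, robots handling, and error handling are omitted):

from brozzler import ydl

def brozzle_page_sketch(worker, browser, site, page, enable_youtube_dl=True):
    outlinks = set()
    ydl_fetches = None
    if enable_youtube_dl:
        # yt-dlp first: captures any media and reports what it fetched
        ydl_fetches, outlinks = ydl.do_youtube_dl(worker, site, page)
    if worker._needs_browsing(page, ydl_fetches):
        # html (or nothing fetched yet): load the page in a real browser
        outlinks.update(worker._browse_page(browser, site, page))
    elif not worker._already_fetched(page, ydl_fetches):
        # non-html that yt-dlp didn't capture: plain GET through the proxy
        worker._fetch_url(site, page=page)
    return outlinks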
@ -243,71 +296,88 @@ class BrozzlerWorker:
if self._using_warcprox(site): if self._using_warcprox(site):
self.logger.info( self.logger.info(
"sending WARCPROX_WRITE_RECORD request to %s with " "sending WARCPROX_WRITE_RECORD request to %s with "
"screenshot for %s", self._proxy_for(site), page) "screenshot for %s",
self._proxy_for(site),
page,
)
thumbnail_jpeg = self.thumb_jpeg(screenshot_jpeg) thumbnail_jpeg = self.thumb_jpeg(screenshot_jpeg)
self._warcprox_write_record( self._warcprox_write_record(
warcprox_address=self._proxy_for(site), warcprox_address=self._proxy_for(site),
url="screenshot:%s" % str(urlcanon.semantic(page.url)), url="screenshot:%s" % str(urlcanon.semantic(page.url)),
warc_type="resource", content_type="image/jpeg", warc_type="resource",
content_type="image/jpeg",
payload=screenshot_jpeg, payload=screenshot_jpeg,
extra_headers=site.extra_headers(page)) extra_headers=site.extra_headers(page),
)
self._warcprox_write_record( self._warcprox_write_record(
warcprox_address=self._proxy_for(site), warcprox_address=self._proxy_for(site),
url="thumbnail:%s" % str(urlcanon.semantic(page.url)), url="thumbnail:%s" % str(urlcanon.semantic(page.url)),
warc_type="resource", content_type="image/jpeg", warc_type="resource",
content_type="image/jpeg",
payload=thumbnail_jpeg, payload=thumbnail_jpeg,
extra_headers=site.extra_headers(page)) extra_headers=site.extra_headers(page),
)
def _on_response(chrome_msg): def _on_response(chrome_msg):
if ('params' in chrome_msg if (
and 'response' in chrome_msg['params'] "params" in chrome_msg
and 'mimeType' in chrome_msg['params']['response'] and "response" in chrome_msg["params"]
and chrome_msg['params']['response'].get('mimeType', '').startswith('video/') and "mimeType" in chrome_msg["params"]["response"]
and chrome_msg["params"]["response"]
.get("mimeType", "")
.startswith("video/")
# skip manifests of DASH segmented video - # skip manifests of DASH segmented video -
# see https://github.com/internetarchive/brozzler/pull/70 # see https://github.com/internetarchive/brozzler/pull/70
and chrome_msg['params']['response']['mimeType'] != 'video/vnd.mpeg.dash.mpd' and chrome_msg["params"]["response"]["mimeType"]
and chrome_msg['params']['response'].get('status') in (200, 206)): != "video/vnd.mpeg.dash.mpd"
and chrome_msg["params"]["response"].get("status") in (200, 206)
):
video = { video = {
'blame': 'browser', "blame": "browser",
'url': chrome_msg['params']['response'].get('url'), "url": chrome_msg["params"]["response"].get("url"),
'response_code': chrome_msg['params']['response']['status'], "response_code": chrome_msg["params"]["response"]["status"],
'content-type': chrome_msg['params']['response']['mimeType'], "content-type": chrome_msg["params"]["response"]["mimeType"],
} }
response_headers = CaseInsensitiveDict( response_headers = CaseInsensitiveDict(
chrome_msg['params']['response']['headers']) chrome_msg["params"]["response"]["headers"]
if 'content-length' in response_headers: )
video['content-length'] = int(response_headers['content-length']) if "content-length" in response_headers:
if 'content-range' in response_headers: video["content-length"] = int(response_headers["content-length"])
video['content-range'] = response_headers['content-range'] if "content-range" in response_headers:
logging.debug('embedded video %s', video) video["content-range"] = response_headers["content-range"]
if not 'videos' in page: logging.debug("embedded video %s", video)
if not "videos" in page:
page.videos = [] page.videos = []
page.videos.append(video) page.videos.append(video)
sw_fetched = set() sw_fetched = set()
def _on_service_worker_version_updated(chrome_msg): def _on_service_worker_version_updated(chrome_msg):
# https://github.com/internetarchive/brozzler/issues/140 # https://github.com/internetarchive/brozzler/issues/140
self.logger.trace('%r', chrome_msg) self.logger.trace("%r", chrome_msg)
if chrome_msg.get('params', {}).get('versions'): if chrome_msg.get("params", {}).get("versions"):
url = chrome_msg.get('params', {}).get('versions')[0]\ url = chrome_msg.get("params", {}).get("versions")[0].get("scriptURL")
.get('scriptURL')
if url and url not in sw_fetched: if url and url not in sw_fetched:
self.logger.info('fetching service worker script %s', url) self.logger.info("fetching service worker script %s", url)
self._fetch_url(site, url=url) self._fetch_url(site, url=url)
sw_fetched.add(url) sw_fetched.add(url)
if not browser.is_running(): if not browser.is_running():
browser.start( browser.start(
proxy=self._proxy_for(site), proxy=self._proxy_for(site),
cookie_db=site.get('cookie_db'), cookie_db=site.get("cookie_db"),
window_height=self._window_height, window_height=self._window_height,
window_width=self._window_width) window_width=self._window_width,
)
final_page_url, outlinks = browser.browse_page( final_page_url, outlinks = browser.browse_page(
page.url, extra_headers=site.extra_headers(page), page.url,
behavior_parameters=site.get('behavior_parameters'), extra_headers=site.extra_headers(page),
username=site.get('username'), password=site.get('password'), behavior_parameters=site.get("behavior_parameters"),
user_agent=site.get('user_agent'), username=site.get("username"),
on_screenshot=_on_screenshot, on_response=_on_response, password=site.get("password"),
user_agent=site.get("user_agent"),
on_screenshot=_on_screenshot,
on_response=_on_response,
on_request=on_request, on_request=on_request,
on_service_worker_version_updated=_on_service_worker_version_updated, on_service_worker_version_updated=_on_service_worker_version_updated,
hashtags=page.hashtags, hashtags=page.hashtags,
@ -320,7 +390,8 @@ class BrozzlerWorker:
behavior_timeout=self._behavior_timeout, behavior_timeout=self._behavior_timeout,
extract_outlinks_timeout=self._extract_outlinks_timeout, extract_outlinks_timeout=self._extract_outlinks_timeout,
download_throughput=self._download_throughput, download_throughput=self._download_throughput,
stealth=self._stealth) stealth=self._stealth,
)
if final_page_url != page.url: if final_page_url != page.url:
page.note_redirect(final_page_url) page.note_redirect(final_page_url)
return outlinks return outlinks
@ -331,19 +402,18 @@ class BrozzlerWorker:
url = page.url url = page.url
if self._proxy_for(site): if self._proxy_for(site):
proxies = { proxies = {
'http': 'http://%s' % self._proxy_for(site), "http": "http://%s" % self._proxy_for(site),
'https': 'http://%s' % self._proxy_for(site), "https": "http://%s" % self._proxy_for(site),
} }
self.logger.info('fetching %s', url) self.logger.info("fetching %s", url)
try: try:
# response is ignored # response is ignored
requests.get( requests.get(
url, proxies=proxies, headers=site.extra_headers(page), url, proxies=proxies, headers=site.extra_headers(page), verify=False
verify=False) )
except requests.exceptions.ProxyError as e: except requests.exceptions.ProxyError as e:
raise brozzler.ProxyError( raise brozzler.ProxyError("proxy error fetching %s" % url) from e
'proxy error fetching %s' % url) from e
def _needs_browsing(self, page, ydl_fetches): def _needs_browsing(self, page, ydl_fetches):
if ydl_fetches: if ydl_fetches:
@ -351,8 +421,10 @@ class BrozzlerWorker:
if not final_bounces: if not final_bounces:
return True return True
for txn in final_bounces: for txn in final_bounces:
if txn['response_headers'].get_content_type() in [ if txn["response_headers"].get_content_type() in [
'text/html', 'application/xhtml+xml']: "text/html",
"application/xhtml+xml",
]:
return True return True
return False return False
else: else:
@ -361,14 +433,13 @@ class BrozzlerWorker:
def _already_fetched(self, page, ydl_fetches): def _already_fetched(self, page, ydl_fetches):
if ydl_fetches: if ydl_fetches:
for fetch in ydl.final_bounces(ydl_fetches, page.url): for fetch in ydl.final_bounces(ydl_fetches, page.url):
if (fetch['method'] == 'GET' and fetch['response_code'] == 200): if fetch["method"] == "GET" and fetch["response_code"] == 200:
return True return True
return False return False
def brozzle_site(self, browser, site): def brozzle_site(self, browser, site):
try: try:
site.last_claimed_by = '%s:%s' % ( site.last_claimed_by = "%s:%s" % (socket.gethostname(), browser.chrome.port)
socket.gethostname(), browser.chrome.port)
site.save() site.save()
start = time.time() start = time.time()
page = None page = None
@ -377,28 +448,28 @@ class BrozzlerWorker:
# _proxy_for() call in log statement can raise brozzler.ProxyError # _proxy_for() call in log statement can raise brozzler.ProxyError
# which is why we honor time limit and stop request first☝🏻 # which is why we honor time limit and stop request first☝🏻
self.logger.info( self.logger.info(
"brozzling site (proxy=%r) %s", "brozzling site (proxy=%r) %s", self._proxy_for(site), site
self._proxy_for(site), site) )
while time.time() - start < self.SITE_SESSION_MINUTES * 60: while time.time() - start < self.SITE_SESSION_MINUTES * 60:
site.refresh() site.refresh()
self._frontier.enforce_time_limit(site) self._frontier.enforce_time_limit(site)
self._frontier.honor_stop_request(site) self._frontier.honor_stop_request(site)
page = self._frontier.claim_page(site, "%s:%s" % ( page = self._frontier.claim_page(
socket.gethostname(), browser.chrome.port)) site, "%s:%s" % (socket.gethostname(), browser.chrome.port)
)
if (page.needs_robots_check and if page.needs_robots_check and not brozzler.is_permitted_by_robots(
not brozzler.is_permitted_by_robots( site, page.url, self._proxy_for(site)
site, page.url, self._proxy_for(site))): ):
logging.warning("page %s is blocked by robots.txt", page.url) logging.warning("page %s is blocked by robots.txt", page.url)
page.blocked_by_robots = True page.blocked_by_robots = True
self._frontier.completed_page(site, page) self._frontier.completed_page(site, page)
else: else:
outlinks = self.brozzle_page( outlinks = self.brozzle_page(
browser, site, page, browser, site, page, enable_youtube_dl=not self._skip_youtube_dl
enable_youtube_dl=not self._skip_youtube_dl) )
self._frontier.completed_page(site, page) self._frontier.completed_page(site, page)
self._frontier.scope_and_schedule_outlinks( self._frontier.scope_and_schedule_outlinks(site, page, outlinks)
site, page, outlinks)
if browser.is_running(): if browser.is_running():
site.cookie_db = browser.chrome.persist_and_read_cookie_db() site.cookie_db = browser.chrome.persist_and_read_cookie_db()
@ -418,31 +489,36 @@ class BrozzlerWorker:
except brozzler.ProxyError as e: except brozzler.ProxyError as e:
if self._warcprox_auto: if self._warcprox_auto:
logging.error( logging.error(
'proxy error (site.proxy=%s), will try to choose a ' "proxy error (site.proxy=%s), will try to choose a "
'healthy instance next time site is brozzled: %s', "healthy instance next time site is brozzled: %s",
site.proxy, e) site.proxy,
e,
)
site.proxy = None site.proxy = None
else: else:
# using brozzler-worker --proxy, nothing to do but try the # using brozzler-worker --proxy, nothing to do but try the
# same proxy again next time # same proxy again next time
logging.error( logging.error("proxy error (self._proxy=%r)", self._proxy, exc_info=1)
'proxy error (self._proxy=%r)', self._proxy, exc_info=1)
except: except:
self.logger.error( self.logger.error(
'unexpected exception site=%r page=%r', site, page, "unexpected exception site=%r page=%r", site, page, exc_info=True
exc_info=True) )
if page: if page:
page.failed_attempts = (page.failed_attempts or 0) + 1 page.failed_attempts = (page.failed_attempts or 0) + 1
if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES: if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES:
self.logger.info( self.logger.info(
'marking page "completed" after %s unexpected ' 'marking page "completed" after %s unexpected '
'exceptions attempting to brozzle %s', "exceptions attempting to brozzle %s",
page.failed_attempts, page) page.failed_attempts,
page,
)
self._frontier.completed_page(site, page) self._frontier.completed_page(site, page)
page = None page = None
finally: finally:
if start: if start:
site.active_brozzling_time = (site.active_brozzling_time or 0) + time.time() - start site.active_brozzling_time = (
(site.active_brozzling_time or 0) + time.time() - start
)
self._frontier.disclaim_site(site, page) self._frontier.disclaim_site(site, page)
def _brozzle_site_thread_target(self, browser, site): def _brozzle_site_thread_target(self, browser, site):
@ -462,21 +538,25 @@ class BrozzlerWorker:
"role": "brozzler-worker", "role": "brozzler-worker",
"ttl": self.HEARTBEAT_INTERVAL * 3, "ttl": self.HEARTBEAT_INTERVAL * 3,
} }
status_info["load"] = 1.0 * self._browser_pool.num_in_use() / self._browser_pool.size status_info["load"] = (
1.0 * self._browser_pool.num_in_use() / self._browser_pool.size
)
status_info["browser_pool_size"] = self._browser_pool.size status_info["browser_pool_size"] = self._browser_pool.size
status_info["browsers_in_use"] = self._browser_pool.num_in_use() status_info["browsers_in_use"] = self._browser_pool.num_in_use()
try: try:
self.status_info = self._service_registry.heartbeat(status_info) self.status_info = self._service_registry.heartbeat(status_info)
self.logger.trace( self.logger.trace("status in service registry: %s", self.status_info)
"status in service registry: %s", self.status_info)
except r.ReqlError as e: except r.ReqlError as e:
self.logger.error( self.logger.error(
"failed to send heartbeat and update service registry " "failed to send heartbeat and update service registry "
"with info %s: %s", status_info, e) "with info %s: %s",
status_info,
e,
)
def _service_heartbeat_if_due(self): def _service_heartbeat_if_due(self):
'''Sends service registry heartbeat if due''' """Sends service registry heartbeat if due"""
due = False due = False
if self._service_registry: if self._service_registry:
if not hasattr(self, "status_info"): if not hasattr(self, "status_info"):
@ -489,15 +569,16 @@ class BrozzlerWorker:
self._service_heartbeat() self._service_heartbeat()
def _start_browsing_some_sites(self): def _start_browsing_some_sites(self):
''' """
Starts browsing some sites. Starts browsing some sites.
Raises: Raises:
NoBrowsersAvailable if none available NoBrowsersAvailable if none available
''' """
# acquire_multi() raises NoBrowsersAvailable if none available # acquire_multi() raises NoBrowsersAvailable if none available
browsers = self._browser_pool.acquire_multi( browsers = self._browser_pool.acquire_multi(
(self._browser_pool.num_available() + 1) // 2) (self._browser_pool.num_available() + 1) // 2
)
try: try:
sites = self._frontier.claim_sites(len(browsers)) sites = self._frontier.claim_sites(len(browsers))
except: except:
@ -510,7 +591,8 @@ class BrozzlerWorker:
target=self._brozzle_site_thread_target, target=self._brozzle_site_thread_target,
args=(browsers[i], sites[i]), args=(browsers[i], sites[i]),
name="BrozzlingThread:%s" % browsers[i].chrome.port, name="BrozzlingThread:%s" % browsers[i].chrome.port,
daemon=True) daemon=True,
)
with self._browsing_threads_lock: with self._browsing_threads_lock:
self._browsing_threads.add(th) self._browsing_threads.add(th)
th.start() th.start()
@ -519,7 +601,8 @@ class BrozzlerWorker:
def run(self): def run(self):
self.logger.notice( self.logger.notice(
'brozzler %s - brozzler-worker starting', brozzler.__version__) "brozzler %s - brozzler-worker starting", brozzler.__version__
)
last_nothing_to_claim = 0 last_nothing_to_claim = 0
try: try:
while not self._shutdown.is_set(): while not self._shutdown.is_set():
@ -528,39 +611,38 @@ class BrozzlerWorker:
try: try:
self._start_browsing_some_sites() self._start_browsing_some_sites()
except brozzler.browser.NoBrowsersAvailable: except brozzler.browser.NoBrowsersAvailable:
logging.trace( logging.trace("all %s browsers are in use", self._max_browsers)
"all %s browsers are in use",
self._max_browsers)
except brozzler.NothingToClaim: except brozzler.NothingToClaim:
last_nothing_to_claim = time.time() last_nothing_to_claim = time.time()
logging.trace( logging.trace(
"nothing to claim, all available active sites " "nothing to claim, all available active sites "
"are already claimed by a brozzler worker") "are already claimed by a brozzler worker"
)
time.sleep(0.5) time.sleep(0.5)
self.logger.notice("shutdown requested") self.logger.notice("shutdown requested")
except r.ReqlError as e: except r.ReqlError as e:
self.logger.error( self.logger.error(
"caught rethinkdb exception, will try to proceed", "caught rethinkdb exception, will try to proceed", exc_info=True
exc_info=True) )
except brozzler.ShutdownRequested: except brozzler.ShutdownRequested:
self.logger.info("shutdown requested") self.logger.info("shutdown requested")
except: except:
self.logger.critical( self.logger.critical(
"thread exiting due to unexpected exception", "thread exiting due to unexpected exception", exc_info=True
exc_info=True) )
finally: finally:
if self._service_registry and hasattr(self, "status_info"): if self._service_registry and hasattr(self, "status_info"):
try: try:
self._service_registry.unregister(self.status_info["id"]) self._service_registry.unregister(self.status_info["id"])
except: except:
self.logger.error( self.logger.error(
"failed to unregister from service registry", "failed to unregister from service registry", exc_info=True
exc_info=True) )
self.logger.info( self.logger.info(
'shutting down %s brozzling threads', "shutting down %s brozzling threads", len(self._browsing_threads)
len(self._browsing_threads)) )
with self._browsing_threads_lock: with self._browsing_threads_lock:
for th in self._browsing_threads: for th in self._browsing_threads:
if th.is_alive(): if th.is_alive():
@ -575,11 +657,10 @@ class BrozzlerWorker:
with self._start_stop_lock: with self._start_stop_lock:
if self._thread: if self._thread:
self.logger.warning( self.logger.warning(
'ignoring start request because self._thread is ' "ignoring start request because self._thread is " "not None"
'not None') )
return return
self._thread = threading.Thread( self._thread = threading.Thread(target=self.run, name="BrozzlerWorker")
target=self.run, name="BrozzlerWorker")
self._thread.start() self._thread.start()
def shutdown_now(self): def shutdown_now(self):
@ -590,4 +671,3 @@ class BrozzlerWorker:
def is_alive(self): def is_alive(self):
return self._thread and self._thread.is_alive() return self._thread and self._thread.is_alive()
brozzler/ydl.py
@ -1,4 +1,4 @@
''' """
brozzler/ydl.py - youtube-dl / yt-dlp support for brozzler brozzler/ydl.py - youtube-dl / yt-dlp support for brozzler
Copyright (C) 2023 Internet Archive Copyright (C) 2023 Internet Archive
@ -14,7 +14,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
''' """
import logging import logging
import yt_dlp import yt_dlp
@ -31,6 +31,7 @@ import threading
thread_local = threading.local() thread_local = threading.local()
class ExtraHeaderAdder(urllib.request.BaseHandler): class ExtraHeaderAdder(urllib.request.BaseHandler):
def __init__(self, extra_headers): def __init__(self, extra_headers):
self.extra_headers = extra_headers self.extra_headers = extra_headers
@ -43,6 +44,7 @@ class ExtraHeaderAdder(urllib.request.BaseHandler):
req.add_header(h, v) req.add_header(h, v)
return req return req
class YoutubeDLSpy(urllib.request.BaseHandler): class YoutubeDLSpy(urllib.request.BaseHandler):
logger = logging.getLogger(__module__ + "." + __qualname__) logger = logging.getLogger(__module__ + "." + __qualname__)
@ -51,10 +53,10 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
def _http_response(self, request, response): def _http_response(self, request, response):
fetch = { fetch = {
'url': request.full_url, "url": request.full_url,
'method': request.get_method(), "method": request.get_method(),
'response_code': response.code, "response_code": response.code,
'response_headers': response.headers, "response_headers": response.headers,
} }
self.fetches.append(fetch) self.fetches.append(fetch)
return response return response
@ -64,6 +66,7 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
def reset(self): def reset(self):
self.fetches = [] self.fetches = []
def final_bounces(fetches, url): def final_bounces(fetches, url):
""" """
Resolves redirect chains in `fetches` and returns a list of fetches Resolves redirect chains in `fetches` and returns a list of fetches
@ -75,24 +78,26 @@ def final_bounces(fetches, url):
for fetch in fetches: for fetch in fetches:
# XXX check http status 301,302,303,307? check for "uri" header # XXX check http status 301,302,303,307? check for "uri" header
# as well as "location"? see urllib.request.HTTPRedirectHandler # as well as "location"? see urllib.request.HTTPRedirectHandler
if 'location' in fetch['response_headers']: if "location" in fetch["response_headers"]:
redirects[fetch['url']] = fetch redirects[fetch["url"]] = fetch
final_url = url final_url = url
while final_url in redirects: while final_url in redirects:
fetch = redirects.pop(final_url) fetch = redirects.pop(final_url)
final_url = urllib.parse.urljoin( final_url = urllib.parse.urljoin(
fetch['url'], fetch['response_headers']['location']) fetch["url"], fetch["response_headers"]["location"]
)
final_bounces = [] final_bounces = []
for fetch in fetches: for fetch in fetches:
if fetch['url'] == final_url: if fetch["url"] == final_url:
final_bounces.append(fetch) final_bounces.append(fetch)
return final_bounces return final_bounces
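A small, self-contained example of final_bounces resolving a redirect chain; the fetch dicts are invented, and a plain dict stands in for the headers object that real fetches carry (final_bounces only needs key lookup):

from brozzler import ydl

# the first URL 302s to the second, which serves the actual video
fetches = [
    {
        "url": "https://example.com/video",
        "method": "GET",
        "response_code": 302,
        "response_headers": {"location": "https://cdn.example.com/video.mp4"},
    },
    {
        "url": "https://cdn.example.com/video.mp4",
        "method": "GET",
        "response_code": 200,
        "response_headers": {},
    },
]
print(ydl.final_bounces(fetches, "https://example.com/video"))
# -> only the fetch of https://cdn.example.com/video.mp4, the end of the chain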
def _build_youtube_dl(worker, destdir, site, page): def _build_youtube_dl(worker, destdir, site, page):
''' """
Builds a yt-dlp `yt_dlp.YoutubeDL` for brozzling `site` with `worker`. Builds a yt-dlp `yt_dlp.YoutubeDL` for brozzling `site` with `worker`.
The `YoutubeDL` instance does a few special brozzler-specific things: The `YoutubeDL` instance does a few special brozzler-specific things:
@ -109,7 +114,7 @@ def _build_youtube_dl(worker, destdir, site, page):
Returns: Returns:
a yt-dlp `yt_dlp.YoutubeDL` instance a yt-dlp `yt_dlp.YoutubeDL` instance
''' """
class _YoutubeDL(yt_dlp.YoutubeDL): class _YoutubeDL(yt_dlp.YoutubeDL):
logger = logging.getLogger(__module__ + "." + __qualname__) logger = logging.getLogger(__module__ + "." + __qualname__)
@ -117,31 +122,38 @@ def _build_youtube_dl(worker, destdir, site, page):
def add_default_extra_info(self, ie_result, ie, url): def add_default_extra_info(self, ie_result, ie, url):
# hook in some logging # hook in some logging
super().add_default_extra_info(ie_result, ie, url) super().add_default_extra_info(ie_result, ie, url)
if ie_result.get('_type') == 'playlist': if ie_result.get("_type") == "playlist":
self.logger.info( self.logger.info("extractor %r found playlist in %s", ie.IE_NAME, url)
'extractor %r found playlist in %s', ie.IE_NAME, url) if ie.IE_NAME in {
if ie.IE_NAME in {'youtube:playlist', 'youtube:tab', 'soundcloud:user', 'instagram:user'}: "youtube:playlist",
"youtube:tab",
"soundcloud:user",
"instagram:user",
}:
# At this point ie_result['entries'] is an iterator that # At this point ie_result['entries'] is an iterator that
# will fetch more metadata from youtube to list all the # will fetch more metadata from youtube to list all the
# videos. We unroll that iterator here partly because # videos. We unroll that iterator here partly because
# otherwise `process_ie_result()` will clobber it, and we # otherwise `process_ie_result()` will clobber it, and we
# use it later to extract the watch pages as outlinks. # use it later to extract the watch pages as outlinks.
try: try:
ie_result['entries_no_dl'] = list(ie_result['entries']) ie_result["entries_no_dl"] = list(ie_result["entries"])
except Exception as e: except Exception as e:
self.logger.warning( self.logger.warning(
"failed to unroll ie_result['entries']? for %s, %s; exception %s", "failed to unroll ie_result['entries']? for %s, %s; exception %s",
ie.IE_NAME, url, e) ie.IE_NAME,
ie_result['entries_no_dl'] =[] url,
ie_result['entries'] = [] e,
)
ie_result["entries_no_dl"] = []
ie_result["entries"] = []
self.logger.info( self.logger.info(
'not downloading %s media files from this ' "not downloading %s media files from this "
'playlist because we expect to capture them from ' "playlist because we expect to capture them from "
'individual watch/track/detail pages', "individual watch/track/detail pages",
len(ie_result['entries_no_dl'])) len(ie_result["entries_no_dl"]),
)
else: else:
self.logger.info( self.logger.info("extractor %r found a download in %s", ie.IE_NAME, url)
'extractor %r found a download in %s', ie.IE_NAME, url)
def _push_video_to_warcprox(self, site, info_dict, postprocessor): def _push_video_to_warcprox(self, site, info_dict, postprocessor):
# 220211 update: does yt-dlp supply content-type? no, not as such # 220211 update: does yt-dlp supply content-type? no, not as such
@ -150,73 +162,96 @@ def _build_youtube_dl(worker, destdir, site, page):
# youtube-dl produces a stitched-up video that /usr/bin/file fails # youtube-dl produces a stitched-up video that /usr/bin/file fails
# to identify (says "application/octet-stream"). `ffprobe` doesn't # to identify (says "application/octet-stream"). `ffprobe` doesn't
# give us a mimetype. # give us a mimetype.
if info_dict.get('ext') == 'mp4': if info_dict.get("ext") == "mp4":
mimetype = 'video/mp4' mimetype = "video/mp4"
else: else:
try: try:
import magic import magic
mimetype = magic.from_file(info_dict['filepath'], mime=True)
mimetype = magic.from_file(info_dict["filepath"], mime=True)
except ImportError as e: except ImportError as e:
mimetype = 'video/%s' % info_dict['ext'] mimetype = "video/%s" % info_dict["ext"]
self.logger.warning( self.logger.warning("guessing mimetype %s because %r", mimetype, e)
'guessing mimetype %s because %r', mimetype, e)
# youtube watch page postprocessor is MoveFiles # youtube watch page postprocessor is MoveFiles
if postprocessor == 'FixupM3u8' or postprocessor == 'Merger': if postprocessor == "FixupM3u8" or postprocessor == "Merger":
url = 'youtube-dl:%05d:%s' % ( url = "youtube-dl:%05d:%s" % (
info_dict.get('playlist_index') or 1, info_dict.get("playlist_index") or 1,
info_dict['webpage_url']) info_dict["webpage_url"],
)
else: else:
url = info_dict.get('url', '') url = info_dict.get("url", "")
# skip urls ending .m3u8, to avoid duplicates handled by FixupM3u8 # skip urls ending .m3u8, to avoid duplicates handled by FixupM3u8
if url.endswith('.m3u8') or url == '': if url.endswith(".m3u8") or url == "":
return return
size = os.path.getsize(info_dict['filepath']) size = os.path.getsize(info_dict["filepath"])
self.logger.info( self.logger.info(
'pushing %r video as %s (%s bytes) to ' "pushing %r video as %s (%s bytes) to " "warcprox at %s with url %s",
'warcprox at %s with url %s', info_dict['format'], info_dict["format"],
mimetype, size, worker._proxy_for(site), url) mimetype,
with open(info_dict['filepath'], 'rb') as f: size,
worker._proxy_for(site),
url,
)
with open(info_dict["filepath"], "rb") as f:
# include content-length header to avoid chunked # include content-length header to avoid chunked
# transfer, which warcprox currently rejects # transfer, which warcprox currently rejects
extra_headers = dict(site.extra_headers()) extra_headers = dict(site.extra_headers())
extra_headers['content-length'] = size extra_headers["content-length"] = size
request, response = worker._warcprox_write_record( request, response = worker._warcprox_write_record(
warcprox_address=worker._proxy_for(site), url=url, warcprox_address=worker._proxy_for(site),
warc_type='resource', content_type=mimetype, payload=f, url=url,
extra_headers=extra_headers) warc_type="resource",
content_type=mimetype,
payload=f,
extra_headers=extra_headers,
)
# consulted by _remember_videos() # consulted by _remember_videos()
ydl.pushed_videos.append({ ydl.pushed_videos.append(
'url': url, {
'response_code': response.code, "url": url,
'content-type': mimetype, "response_code": response.code,
'content-length': size, "content-type": mimetype,
}) "content-length": size,
}
)
def maybe_heartbeat_site_last_claimed(*args, **kwargs): def maybe_heartbeat_site_last_claimed(*args, **kwargs):
# in case yt-dlp takes a long time, heartbeat site.last_claimed # in case yt-dlp takes a long time, heartbeat site.last_claimed
# to prevent another brozzler-worker from claiming the site # to prevent another brozzler-worker from claiming the site
try: try:
if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=worker.SITE_SESSION_MINUTES): if (
site.rr
and doublethink.utcnow() - site.last_claimed
> datetime.timedelta(minutes=worker.SITE_SESSION_MINUTES)
):
worker.logger.debug( worker.logger.debug(
'heartbeating site.last_claimed to prevent another ' "heartbeating site.last_claimed to prevent another "
'brozzler-worker claiming this site id=%r', site.id) "brozzler-worker claiming this site id=%r",
site.id,
)
site.last_claimed = doublethink.utcnow() site.last_claimed = doublethink.utcnow()
site.save() site.save()
except: except:
worker.logger.debug( worker.logger.debug(
'problem heartbeating site.last_claimed site id=%r', "problem heartbeating site.last_claimed site id=%r",
site.id, exc_info=True) site.id,
exc_info=True,
)
def ydl_postprocess_hook(d): def ydl_postprocess_hook(d):
if d['status'] == 'finished': if d["status"] == "finished":
worker.logger.info('[ydl_postprocess_hook] Finished postprocessing') worker.logger.info("[ydl_postprocess_hook] Finished postprocessing")
worker.logger.info('[ydl_postprocess_hook] postprocessor: {}'.format(d['postprocessor'])) worker.logger.info(
"[ydl_postprocess_hook] postprocessor: {}".format(d["postprocessor"])
)
if worker._using_warcprox(site): if worker._using_warcprox(site):
_YoutubeDL._push_video_to_warcprox(_YoutubeDL, site, d['info_dict'], d['postprocessor']) _YoutubeDL._push_video_to_warcprox(
_YoutubeDL, site, d["info_dict"], d["postprocessor"]
)
# default socket_timeout is 20 -- we hit it often when cluster is busy # default socket_timeout is 20 -- we hit it often when cluster is busy
ydl_opts = { ydl_opts = {
@ -230,7 +265,6 @@ def _build_youtube_dl(worker, destdir, site, page):
"socket_timeout": 40, "socket_timeout": 40,
"progress_hooks": [maybe_heartbeat_site_last_claimed], "progress_hooks": [maybe_heartbeat_site_last_claimed],
"postprocessor_hooks": [ydl_postprocess_hook], "postprocessor_hooks": [ydl_postprocess_hook],
# https://github.com/yt-dlp/yt-dlp#format-selection # https://github.com/yt-dlp/yt-dlp#format-selection
# "By default, yt-dlp tries to download the best available quality..." # "By default, yt-dlp tries to download the best available quality..."
# pre-v.2023.07.06: "format_sort": ["ext"], # pre-v.2023.07.06: "format_sort": ["ext"],
@ -241,13 +275,10 @@ def _build_youtube_dl(worker, destdir, site, page):
"format_sort": ["res:720", "vcodec:h264", "acodec:aac"], "format_sort": ["res:720", "vcodec:h264", "acodec:aac"],
# skip live streams # skip live streams
"match_filter": match_filter_func("!is_live"), "match_filter": match_filter_func("!is_live"),
"extractor_args": {"youtube": {"skip": ["dash", "hls"]}},
"extractor_args": {'youtube': {'skip': ['dash', 'hls']}},
# --cache-dir local or.. # --cache-dir local or..
        # this looked like a problem with nfs-mounted homedir, shouldn't be a problem for brozzler on focal?         # this looked like a problem with nfs-mounted homedir, shouldn't be a problem for brozzler on focal?
"cache_dir": "/home/archiveit", "cache_dir": "/home/archiveit",
"logger": logging.getLogger("yt_dlp"), "logger": logging.getLogger("yt_dlp"),
"verbose": False, "verbose": False,
"quiet": False, "quiet": False,
@ -265,49 +296,53 @@ def _build_youtube_dl(worker, destdir, site, page):
ydl._opener.add_handler(ydl.fetch_spy) ydl._opener.add_handler(ydl.fetch_spy)
return ydl return ydl
def _remember_videos(page, fetches, pushed_videos=None): def _remember_videos(page, fetches, pushed_videos=None):
''' """
Saves info about videos captured by yt-dlp in `page.videos`. Saves info about videos captured by yt-dlp in `page.videos`.
''' """
if not 'videos' in page: if not "videos" in page:
page.videos = [] page.videos = []
for fetch in fetches or []: for fetch in fetches or []:
content_type = fetch['response_headers'].get_content_type() content_type = fetch["response_headers"].get_content_type()
if (content_type.startswith('video/') if (
content_type.startswith("video/")
# skip manifests of DASH segmented video - # skip manifests of DASH segmented video -
# see https://github.com/internetarchive/brozzler/pull/70 # see https://github.com/internetarchive/brozzler/pull/70
and content_type != 'video/vnd.mpeg.dash.mpd' and content_type != "video/vnd.mpeg.dash.mpd"
and fetch['method'] == 'GET' and fetch["method"] == "GET"
and fetch['response_code'] in (200, 206)): and fetch["response_code"] in (200, 206)
):
video = { video = {
'blame': 'youtube-dl', "blame": "youtube-dl",
'url': fetch['url'], "url": fetch["url"],
'response_code': fetch['response_code'], "response_code": fetch["response_code"],
'content-type': content_type, "content-type": content_type,
} }
if 'content-length' in fetch['response_headers']: if "content-length" in fetch["response_headers"]:
video['content-length'] = int( video["content-length"] = int(
fetch['response_headers']['content-length']) fetch["response_headers"]["content-length"]
if 'content-range' in fetch['response_headers']: )
if "content-range" in fetch["response_headers"]:
# skip chunked youtube video # skip chunked youtube video
if 'googlevideo.com/videoplayback' in fetch['url']: if "googlevideo.com/videoplayback" in fetch["url"]:
continue continue
video['content-range'] = fetch[ video["content-range"] = fetch["response_headers"]["content-range"]
'response_headers']['content-range'] logging.debug("embedded video %s", video)
logging.debug('embedded video %s', video)
page.videos.append(video) page.videos.append(video)
for pushed_video in pushed_videos or []: for pushed_video in pushed_videos or []:
if pushed_video['content-type'].startswith('video/'): if pushed_video["content-type"].startswith("video/"):
video = { video = {
'blame': 'youtube-dl', "blame": "youtube-dl",
'url': pushed_video['url'], "url": pushed_video["url"],
'response_code': pushed_video['response_code'], "response_code": pushed_video["response_code"],
'content-type': pushed_video['content-type'], "content-type": pushed_video["content-type"],
'content-length': pushed_video['content-length'], "content-length": pushed_video["content-length"],
} }
logging.debug('embedded video %s', video) logging.debug("embedded video %s", video)
page.videos.append(video) page.videos.append(video)
def _try_youtube_dl(worker, ydl, site, page): def _try_youtube_dl(worker, ydl, site, page):
try: try:
logging.info("trying yt-dlp on %s", page) logging.info("trying yt-dlp on %s", page)
@ -317,43 +352,53 @@ def _try_youtube_dl(worker, ydl, site, page):
# no host given>" resulting in ProxyError # no host given>" resulting in ProxyError
# needs automated test # needs automated test
# and yt-dlp needs sanitize_info for extract_info # and yt-dlp needs sanitize_info for extract_info
ie_result = ydl.sanitize_info(ydl.extract_info(str(urlcanon.whatwg(page.url)))) ie_result = ydl.sanitize_info(
ydl.extract_info(str(urlcanon.whatwg(page.url)))
)
_remember_videos(page, ydl.fetch_spy.fetches, ydl.pushed_videos) _remember_videos(page, ydl.fetch_spy.fetches, ydl.pushed_videos)
if worker._using_warcprox(site): if worker._using_warcprox(site):
info_json = json.dumps(ie_result, sort_keys=True, indent=4) info_json = json.dumps(ie_result, sort_keys=True, indent=4)
logging.info( logging.info(
"sending WARCPROX_WRITE_RECORD request to warcprox " "sending WARCPROX_WRITE_RECORD request to warcprox "
"with yt-dlp json for %s", page) "with yt-dlp json for %s",
page,
)
worker._warcprox_write_record( worker._warcprox_write_record(
warcprox_address=worker._proxy_for(site), warcprox_address=worker._proxy_for(site),
url="youtube-dl:%s" % str(urlcanon.semantic(page.url)), url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
warc_type="metadata", warc_type="metadata",
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
payload=info_json.encode("utf-8"), payload=info_json.encode("utf-8"),
extra_headers=site.extra_headers(page)) extra_headers=site.extra_headers(page),
)
return ie_result return ie_result
except brozzler.ShutdownRequested as e: except brozzler.ShutdownRequested as e:
raise raise
except Exception as e: except Exception as e:
if hasattr(e, "exc_info") and e.exc_info[0] == yt_dlp.utils.UnsupportedError: if hasattr(e, "exc_info") and e.exc_info[0] == yt_dlp.utils.UnsupportedError:
return None return None
elif (hasattr(e, "exc_info") elif (
hasattr(e, "exc_info")
and e.exc_info[0] == urllib.error.HTTPError and e.exc_info[0] == urllib.error.HTTPError
and hasattr(e.exc_info[1], "code") and hasattr(e.exc_info[1], "code")
and e.exc_info[1].code == 420): and e.exc_info[1].code == 420
):
raise brozzler.ReachedLimit(e.exc_info[1]) raise brozzler.ReachedLimit(e.exc_info[1])
elif (hasattr(e, 'exc_info') elif (
hasattr(e, "exc_info")
and e.exc_info[0] == urllib.error.URLError and e.exc_info[0] == urllib.error.URLError
and worker._proxy_for(site)): and worker._proxy_for(site)
):
# connection problem when using a proxy == proxy error (XXX?) # connection problem when using a proxy == proxy error (XXX?)
raise brozzler.ProxyError( raise brozzler.ProxyError(
'yt-dlp hit apparent proxy error from ' "yt-dlp hit apparent proxy error from " "%s" % page.url
'%s' % page.url) from e ) from e
else: else:
raise raise
def do_youtube_dl(worker, site, page): def do_youtube_dl(worker, site, page):
''' """
Runs yt-dlp configured for `worker` and `site` to download videos from Runs yt-dlp configured for `worker` and `site` to download videos from
`page`. `page`.
@ -372,15 +417,19 @@ def do_youtube_dl(worker, site, page):
'response_headers': ..., 'response_headers': ...,
}, ...] }, ...]
`list` of `str`: outlink urls `list` of `str`: outlink urls
''' """
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir: with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
ydl = _build_youtube_dl(worker, tempdir, site, page) ydl = _build_youtube_dl(worker, tempdir, site, page)
ie_result = _try_youtube_dl(worker, ydl, site, page) ie_result = _try_youtube_dl(worker, ydl, site, page)
outlinks = set() outlinks = set()
if ie_result and (ie_result.get('extractor') == 'youtube:playlist' or if ie_result and (
ie_result.get('extractor') == 'youtube:tab'): ie_result.get("extractor") == "youtube:playlist"
or ie_result.get("extractor") == "youtube:tab"
):
# youtube watch pages as outlinks # youtube watch pages as outlinks
outlinks = {'https://www.youtube.com/watch?v=%s' % e['id'] outlinks = {
for e in ie_result.get('entries_no_dl', [])} "https://www.youtube.com/watch?v=%s" % e["id"]
for e in ie_result.get("entries_no_dl", [])
}
# any outlinks for other cases? # any outlinks for other cases?
return ydl.fetch_spy.fetches, outlinks return ydl.fetch_spy.fetches, outlinks

setup.py
@ -1,5 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
''' """
setup.py - brozzler setup script setup.py - brozzler setup script
Copyright (C) 2014-2024 Internet Archive Copyright (C) 2014-2024 Internet Archive
@ -15,88 +15,88 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
''' """
import setuptools import setuptools
import os import os
def find_package_data(package): def find_package_data(package):
pkg_data = [] pkg_data = []
depth = len(package.split('.')) depth = len(package.split("."))
path = os.path.join(*package.split('.')) path = os.path.join(*package.split("."))
for dirpath, dirnames, filenames in os.walk(path): for dirpath, dirnames, filenames in os.walk(path):
if not os.path.exists(os.path.join(dirpath, '__init__.py')): if not os.path.exists(os.path.join(dirpath, "__init__.py")):
relpath = os.path.join(*dirpath.split(os.sep)[depth:]) relpath = os.path.join(*dirpath.split(os.sep)[depth:])
pkg_data.extend(os.path.join(relpath, f) for f in filenames) pkg_data.extend(os.path.join(relpath, f) for f in filenames)
return pkg_data return pkg_data
setuptools.setup( setuptools.setup(
name='brozzler', name="brozzler",
version='1.5.45a0', version="1.5.45a1",
description='Distributed web crawling with browsers', description="Distributed web crawling with browsers",
url='https://github.com/internetarchive/brozzler', url="https://github.com/internetarchive/brozzler",
author='Noah Levitt', author="Noah Levitt",
author_email='nlevitt@archive.org', author_email="nlevitt@archive.org",
long_description=open('README.rst', mode='rb').read().decode('UTF-8'), long_description=open("README.rst", mode="rb").read().decode("UTF-8"),
license='Apache License 2.0', license="Apache License 2.0",
packages=['brozzler', 'brozzler.dashboard'], packages=["brozzler", "brozzler.dashboard"],
package_data={ package_data={
'brozzler': [ "brozzler": ["js-templates/*.js*", "behaviors.yaml", "job_schema.yaml"],
'js-templates/*.js*', 'behaviors.yaml', 'job_schema.yaml'], "brozzler.dashboard": find_package_data("brozzler.dashboard"),
'brozzler.dashboard': find_package_data('brozzler.dashboard'),
}, },
entry_points={ entry_points={
'console_scripts': [ "console_scripts": [
'brozzle-page=brozzler.cli:brozzle_page', "brozzle-page=brozzler.cli:brozzle_page",
'brozzler-new-job=brozzler.cli:brozzler_new_job', "brozzler-new-job=brozzler.cli:brozzler_new_job",
'brozzler-new-site=brozzler.cli:brozzler_new_site', "brozzler-new-site=brozzler.cli:brozzler_new_site",
'brozzler-worker=brozzler.cli:brozzler_worker', "brozzler-worker=brozzler.cli:brozzler_worker",
'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables', "brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables",
'brozzler-list-captures=brozzler.cli:brozzler_list_captures', "brozzler-list-captures=brozzler.cli:brozzler_list_captures",
'brozzler-list-jobs=brozzler.cli:brozzler_list_jobs', "brozzler-list-jobs=brozzler.cli:brozzler_list_jobs",
'brozzler-list-sites=brozzler.cli:brozzler_list_sites', "brozzler-list-sites=brozzler.cli:brozzler_list_sites",
'brozzler-list-pages=brozzler.cli:brozzler_list_pages', "brozzler-list-pages=brozzler.cli:brozzler_list_pages",
'brozzler-stop-crawl=brozzler.cli:brozzler_stop_crawl', "brozzler-stop-crawl=brozzler.cli:brozzler_stop_crawl",
'brozzler-purge=brozzler.cli:brozzler_purge', "brozzler-purge=brozzler.cli:brozzler_purge",
'brozzler-dashboard=brozzler.dashboard:main', "brozzler-dashboard=brozzler.dashboard:main",
'brozzler-easy=brozzler.easy:main', "brozzler-easy=brozzler.easy:main",
'brozzler-wayback=brozzler.pywb:main', "brozzler-wayback=brozzler.pywb:main",
], ],
}, },
install_requires=[ install_requires=[
'PyYAML>=5.1', "PyYAML>=5.1",
'reppy==0.3.4', "yt_dlp<2023.11.16",
'requests>=2.21', "reppy==0.3.4",
'websocket-client>=0.39.0,<=0.48.0', "requests>=2.21",
'pillow>=5.2.0', "websocket-client>=0.39.0,<=0.48.0",
'urlcanon>=0.1.dev23', "pillow>=5.2.0",
'doublethink @ git+https://github.com/internetarchive/doublethink.git@Py311', "urlcanon>=0.1.dev23",
'rethinkdb==2.4.9', "doublethink @ git+https://github.com/internetarchive/doublethink.git@Py311",
'cerberus>=1.0.1', "rethinkdb<2.4.10",
'jinja2>=2.10', "cerberus>=1.0.1",
'cryptography>=2.3', "jinja2>=2.10",
'python-magic>=0.4.15', "cryptography>=2.3",
"python-magic>=0.4.15",
], ],
extras_require={ extras_require={
'dashboard': [ "dashboard": ["flask>=1.0", "gunicorn>=19.8.1"],
'flask>=1.0', "easy": [
'gunicorn>=19.8.1' "warcprox>=2.4.31",
], "pywb>=0.33.2,<2",
'easy': [ "flask>=1.0",
'warcprox>=2.4.31', "gunicorn>=19.8.1",
'pywb>=0.33.2,<2',
'flask>=1.0',
'gunicorn>=19.8.1'
], ],
}, },
zip_safe=False, zip_safe=False,
classifiers=[ classifiers=[
'Development Status :: 5 - Production/Stable', "Development Status :: 5 - Production/Stable",
'Environment :: Console', "Environment :: Console",
'License :: OSI Approved :: Apache Software License', "License :: OSI Approved :: Apache Software License",
'Programming Language :: Python :: 3.8', "Programming Language :: Python :: 3.5",
'Programming Language :: Python :: 3.9', "Programming Language :: Python :: 3.6",
'Programming Language :: Python :: 3.10', "Programming Language :: Python :: 3.7",
'Topic :: Internet :: WWW/HTTP', "Topic :: Internet :: WWW/HTTP",
'Topic :: System :: Archiving', "Topic :: System :: Archiving",
]) ],
)
test_brozzling.py
@ -1,5 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
''' """
test_brozzling.py - XXX explain test_brozzling.py - XXX explain
Copyright (C) 2016-2018 Internet Archive Copyright (C) 2016-2018 Internet Archive
@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
''' """
import pytest import pytest
import brozzler import brozzler
@ -34,79 +34,81 @@ args.log_level = logging.INFO
brozzler.cli.configure_logging(args) brozzler.cli.configure_logging(args)
WARCPROX_META_420 = { WARCPROX_META_420 = {
'stats': { "stats": {
'test_limits_bucket': { "test_limits_bucket": {
'total': {'urls': 0, 'wire_bytes': 0}, "total": {"urls": 0, "wire_bytes": 0},
'new': {'urls': 0, 'wire_bytes': 0}, "new": {"urls": 0, "wire_bytes": 0},
'revisit': {'urls': 0, 'wire_bytes': 0}, "revisit": {"urls": 0, "wire_bytes": 0},
'bucket': 'test_limits_bucket' "bucket": "test_limits_bucket",
} }
}, },
'reached-limit': {'test_limits_bucket/total/urls': 0} "reached-limit": {"test_limits_bucket/total/urls": 0},
} }
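
WARCPROX_META_420 above is the JSON blob the test server attaches to a synthetic HTTP 420 response via the Warcprox-Meta header. A minimal sketch of reading that header back on the client side, assuming a hypothetical port (the real tests take the port from the httpd fixture below):

import json
import urllib.error
import urllib.request

url = "http://localhost:8000/420"  # hypothetical port, for illustration only
try:
    urllib.request.urlopen(url)
except urllib.error.HTTPError as e:
    # urllib raises HTTPError for any 4xx status, including 420
    assert e.code == 420
    meta = json.loads(e.headers["Warcprox-Meta"])
    print(meta["reached-limit"])  # e.g. {'test_limits_bucket/total/urls': 0}
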
@pytest.fixture(scope='module')
@pytest.fixture(scope="module")
def httpd(request): def httpd(request):
class RequestHandler(http.server.SimpleHTTPRequestHandler): class RequestHandler(http.server.SimpleHTTPRequestHandler):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
self.extensions_map['.mpd'] = 'video/vnd.mpeg.dash.mpd' self.extensions_map[".mpd"] = "video/vnd.mpeg.dash.mpd"
http.server.SimpleHTTPRequestHandler.__init__(self, *args, **kwargs) http.server.SimpleHTTPRequestHandler.__init__(self, *args, **kwargs)
def do_GET(self): def do_GET(self):
if self.path == '/420': if self.path == "/420":
self.send_response(420, 'Reached limit') self.send_response(420, "Reached limit")
self.send_header('Connection', 'close') self.send_header("Connection", "close")
self.send_header('Warcprox-Meta', json.dumps(WARCPROX_META_420)) self.send_header("Warcprox-Meta", json.dumps(WARCPROX_META_420))
payload = b'request rejected by warcprox: reached limit test_limits_bucket/total/urls=0\n' payload = b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=0\n"
self.send_header('Content-Type', 'text/plain;charset=utf-8') self.send_header("Content-Type", "text/plain;charset=utf-8")
self.send_header('Content-Length', len(payload)) self.send_header("Content-Length", len(payload))
self.end_headers() self.end_headers()
self.wfile.write(payload) self.wfile.write(payload)
elif self.path == '/401': elif self.path == "/401":
self.send_response(401) self.send_response(401)
self.send_header('WWW-Authenticate', 'Basic realm=\"Test\"') self.send_header("WWW-Authenticate", 'Basic realm="Test"')
self.send_header('Content-type', 'text/html') self.send_header("Content-type", "text/html")
self.end_headers() self.end_headers()
self.wfile.write(self.headers.get('Authorization', b'')) self.wfile.write(self.headers.get("Authorization", b""))
self.wfile.write(b'not authenticated') self.wfile.write(b"not authenticated")
else: else:
super().do_GET() super().do_GET()
def do_POST(self): def do_POST(self):
if self.path == '/login-action': if self.path == "/login-action":
self.send_response(200) self.send_response(200)
payload = b'login successful\n' payload = b"login successful\n"
self.send_header('Content-Type', 'text/plain;charset=utf-8') self.send_header("Content-Type", "text/plain;charset=utf-8")
self.send_header('Content-Length', len(payload)) self.send_header("Content-Length", len(payload))
self.end_headers() self.end_headers()
self.wfile.write(payload) self.wfile.write(payload)
else: else:
super().do_POST() super().do_POST()
# SimpleHTTPRequestHandler always uses CWD so we have to chdir # SimpleHTTPRequestHandler always uses CWD so we have to chdir
os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs')) os.chdir(os.path.join(os.path.dirname(__file__), "htdocs"))
httpd = http.server.HTTPServer(('localhost', 0), RequestHandler) httpd = http.server.HTTPServer(("localhost", 0), RequestHandler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever) httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
httpd_thread.start() httpd_thread.start()
def fin(): def fin():
httpd.shutdown() httpd.shutdown()
httpd.server_close() httpd.server_close()
httpd_thread.join() httpd_thread.join()
request.addfinalizer(fin) request.addfinalizer(fin)
return httpd return httpd
def test_httpd(httpd): def test_httpd(httpd):
''' """
Tests that our http server is working as expected, and that two fetches Tests that our http server is working as expected, and that two fetches
of the same url return the same payload, proving it can be used to test of the same url return the same payload, proving it can be used to test
deduplication. deduplication.
''' """
payload1 = content2 = None payload1 = content2 = None
url = 'http://localhost:%s/site1/file1.txt' % httpd.server_port url = "http://localhost:%s/site1/file1.txt" % httpd.server_port
with urllib.request.urlopen(url) as response: with urllib.request.urlopen(url) as response:
assert response.status == 200 assert response.status == 200
payload1 = response.read() payload1 = response.read()
@ -119,123 +121,136 @@ def test_httpd(httpd):
assert payload1 == payload2 assert payload1 == payload2
url = 'http://localhost:%s/420' % httpd.server_port url = "http://localhost:%s/420" % httpd.server_port
with pytest.raises(urllib.error.HTTPError) as excinfo: with pytest.raises(urllib.error.HTTPError) as excinfo:
urllib.request.urlopen(url) urllib.request.urlopen(url)
assert excinfo.value.getcode() == 420 assert excinfo.value.getcode() == 420
def test_aw_snap_hes_dead_jim(): def test_aw_snap_hes_dead_jim():
chrome_exe = brozzler.suggest_default_chrome_exe() chrome_exe = brozzler.suggest_default_chrome_exe()
with brozzler.Browser(chrome_exe=chrome_exe) as browser: with brozzler.Browser(chrome_exe=chrome_exe) as browser:
with pytest.raises(brozzler.BrowsingException): with pytest.raises(brozzler.BrowsingException):
browser.browse_page('chrome://crash') browser.browse_page("chrome://crash")
# chromium's 401 handling changed??? # chromium's 401 handling changed???
@pytest.mark.xfail @pytest.mark.xfail
def test_page_interstitial_exception(httpd): def test_page_interstitial_exception(httpd):
chrome_exe = brozzler.suggest_default_chrome_exe() chrome_exe = brozzler.suggest_default_chrome_exe()
url = 'http://localhost:%s/401' % httpd.server_port url = "http://localhost:%s/401" % httpd.server_port
with brozzler.Browser(chrome_exe=chrome_exe) as browser: with brozzler.Browser(chrome_exe=chrome_exe) as browser:
with pytest.raises(brozzler.PageInterstitialShown): with pytest.raises(brozzler.PageInterstitialShown):
browser.browse_page(url) browser.browse_page(url)
def test_on_response(httpd): def test_on_response(httpd):
response_urls = [] response_urls = []
def on_response(msg): def on_response(msg):
response_urls.append(msg['params']['response']['url']) response_urls.append(msg["params"]["response"]["url"])
chrome_exe = brozzler.suggest_default_chrome_exe() chrome_exe = brozzler.suggest_default_chrome_exe()
url = 'http://localhost:%s/site3/page.html' % httpd.server_port url = "http://localhost:%s/site3/page.html" % httpd.server_port
with brozzler.Browser(chrome_exe=chrome_exe) as browser: with brozzler.Browser(chrome_exe=chrome_exe) as browser:
browser.browse_page(url, on_response=on_response) browser.browse_page(url, on_response=on_response)
assert response_urls[0] == 'http://localhost:%s/site3/page.html' % httpd.server_port assert response_urls[0] == "http://localhost:%s/site3/page.html" % httpd.server_port
assert response_urls[1] == 'http://localhost:%s/site3/brozzler.svg' % httpd.server_port assert (
assert response_urls[2] == 'http://localhost:%s/favicon.ico' % httpd.server_port response_urls[1] == "http://localhost:%s/site3/brozzler.svg" % httpd.server_port
)
assert response_urls[2] == "http://localhost:%s/favicon.ico" % httpd.server_port
def test_420(httpd): def test_420(httpd):
chrome_exe = brozzler.suggest_default_chrome_exe() chrome_exe = brozzler.suggest_default_chrome_exe()
url = 'http://localhost:%s/420' % httpd.server_port url = "http://localhost:%s/420" % httpd.server_port
with brozzler.Browser(chrome_exe=chrome_exe) as browser: with brozzler.Browser(chrome_exe=chrome_exe) as browser:
with pytest.raises(brozzler.ReachedLimit) as excinfo: with pytest.raises(brozzler.ReachedLimit) as excinfo:
browser.browse_page(url) browser.browse_page(url)
assert excinfo.value.warcprox_meta == WARCPROX_META_420 assert excinfo.value.warcprox_meta == WARCPROX_META_420
def test_js_dialogs(httpd): def test_js_dialogs(httpd):
chrome_exe = brozzler.suggest_default_chrome_exe() chrome_exe = brozzler.suggest_default_chrome_exe()
url = 'http://localhost:%s/site4/alert.html' % httpd.server_port url = "http://localhost:%s/site4/alert.html" % httpd.server_port
with brozzler.Browser(chrome_exe=chrome_exe) as browser: with brozzler.Browser(chrome_exe=chrome_exe) as browser:
# before commit d2ed6b97a24 these would hang and eventually raise # before commit d2ed6b97a24 these would hang and eventually raise
# brozzler.browser.BrowsingTimeout, which would cause this test to fail # brozzler.browser.BrowsingTimeout, which would cause this test to fail
browser.browse_page("http://localhost:%s/site4/alert.html" % httpd.server_port)
browser.browse_page( browser.browse_page(
'http://localhost:%s/site4/alert.html' % httpd.server_port) "http://localhost:%s/site4/confirm.html" % httpd.server_port
browser.browse_page( )
'http://localhost:%s/site4/confirm.html' % httpd.server_port) browser.browse_page("http://localhost:%s/site4/prompt.html" % httpd.server_port)
browser.browse_page(
'http://localhost:%s/site4/prompt.html' % httpd.server_port)
# XXX print dialog unresolved # XXX print dialog unresolved
# browser.browse_page( # browser.browse_page(
# 'http://localhost:%s/site4/print.html' % httpd.server_port) # 'http://localhost:%s/site4/print.html' % httpd.server_port)
def test_page_videos(httpd): def test_page_videos(httpd):
# test depends on behavior of youtube-dl and chromium, could fail and need # test depends on behavior of youtube-dl and chromium, could fail and need
# to be adjusted on youtube-dl or chromium updates # to be adjusted on youtube-dl or chromium updates
chrome_exe = brozzler.suggest_default_chrome_exe() chrome_exe = brozzler.suggest_default_chrome_exe()
worker = brozzler.BrozzlerWorker(None) worker = brozzler.BrozzlerWorker(None)
site = brozzler.Site(None, {}) site = brozzler.Site(None, {})
page = brozzler.Page(None, { page = brozzler.Page(
'url':'http://localhost:%s/site6/' % httpd.server_port}) None, {"url": "http://localhost:%s/site6/" % httpd.server_port}
)
with brozzler.Browser(chrome_exe=chrome_exe) as browser: with brozzler.Browser(chrome_exe=chrome_exe) as browser:
worker.brozzle_page(browser, site, page) worker.brozzle_page(browser, site, page)
assert page.videos assert page.videos
assert len(page.videos) == 4 assert len(page.videos) == 4
assert page.videos[0] == { assert page.videos[0] == {
'blame': 'youtube-dl', "blame": "youtube-dl",
'response_code': 200, "response_code": 200,
'content-length': 383631, "content-length": 383631,
'content-type': 'video/mp4', "content-type": "video/mp4",
'url': 'http://localhost:%s/site6/small.mp4' % httpd.server_port, "url": "http://localhost:%s/site6/small.mp4" % httpd.server_port,
} }
assert page.videos[1] == { assert page.videos[1] == {
'blame': 'youtube-dl', "blame": "youtube-dl",
'content-length': 92728, "content-length": 92728,
'content-type': 'video/webm', "content-type": "video/webm",
'response_code': 200, "response_code": 200,
'url': 'http://localhost:%s/site6/small-video_280x160_100k.webm' % httpd.server_port "url": "http://localhost:%s/site6/small-video_280x160_100k.webm"
% httpd.server_port,
} }
assert page.videos[2] == { assert page.videos[2] == {
'blame': 'youtube-dl', "blame": "youtube-dl",
'content-length': 101114, "content-length": 101114,
'content-type': 'video/webm', "content-type": "video/webm",
'response_code': 200, "response_code": 200,
'url': 'http://localhost:%s/site6/small-audio.webm' % httpd.server_port "url": "http://localhost:%s/site6/small-audio.webm" % httpd.server_port,
} }
assert page.videos[3] == { assert page.videos[3] == {
'blame': 'browser', "blame": "browser",
# 'response_code': 206, # 'response_code': 206,
# 'content-range': 'bytes 0-229454/229455', # 'content-range': 'bytes 0-229454/229455',
'response_code': 200, "response_code": 200,
'content-length': 229455, "content-length": 229455,
'content-type': 'video/webm', "content-type": "video/webm",
'url': 'http://localhost:%s/site6/small.webm' % httpd.server_port, "url": "http://localhost:%s/site6/small.webm" % httpd.server_port,
} }
def test_extract_outlinks(httpd): def test_extract_outlinks(httpd):
chrome_exe = brozzler.suggest_default_chrome_exe() chrome_exe = brozzler.suggest_default_chrome_exe()
worker = brozzler.BrozzlerWorker(None) worker = brozzler.BrozzlerWorker(None)
site = brozzler.Site(None, {}) site = brozzler.Site(None, {})
page = brozzler.Page(None, { page = brozzler.Page(
'url':'http://localhost:%s/site8/' % httpd.server_port}) None, {"url": "http://localhost:%s/site8/" % httpd.server_port}
)
with brozzler.Browser(chrome_exe=chrome_exe) as browser: with brozzler.Browser(chrome_exe=chrome_exe) as browser:
outlinks = worker.brozzle_page(browser, site, page) outlinks = worker.brozzle_page(browser, site, page)
assert outlinks == { assert outlinks == {
'http://example.com/offsite', "http://example.com/offsite",
'http://localhost:%s/site8/baz/zuh' % httpd.server_port, "http://localhost:%s/site8/baz/zuh" % httpd.server_port,
'http://localhost:%s/site8/fdjisapofdjisap#1' % httpd.server_port, "http://localhost:%s/site8/fdjisapofdjisap#1" % httpd.server_port,
'http://localhost:%s/site8/fdjisapofdjisap#2' % httpd.server_port "http://localhost:%s/site8/fdjisapofdjisap#2" % httpd.server_port,
} }
def test_proxy_down(): def test_proxy_down():
''' """
Test that browsing raises `brozzler.ProxyError` when proxy is down. Test that browsing raises `brozzler.ProxyError` when proxy is down.
See also `test_proxy_down` in test_units.py. See also `test_proxy_down` in test_units.py.
@ -243,40 +258,41 @@ def test_proxy_down():
Tests two different kinds of connection error: Tests two different kinds of connection error:
- nothing listening on the port (nobody listens on port 4 :)) - nothing listening on the port (nobody listens on port 4 :))
- port bound but not accepting connections - port bound but not accepting connections
''' """
sock = socket.socket() sock = socket.socket()
sock.bind(('127.0.0.1', 0)) sock.bind(("127.0.0.1", 0))
for not_listening_proxy in ( for not_listening_proxy in ("127.0.0.1:4", "127.0.0.1:%s" % sock.getsockname()[1]):
'127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]): site = brozzler.Site(None, {"seed": "http://example.com/"})
site = brozzler.Site(None, {'seed':'http://example.com/'}) page = brozzler.Page(None, {"url": "http://example.com/"})
page = brozzler.Page(None, {'url': 'http://example.com/'})
worker = brozzler.BrozzlerWorker( worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
frontier=None, proxy=not_listening_proxy)
chrome_exe = brozzler.suggest_default_chrome_exe() chrome_exe = brozzler.suggest_default_chrome_exe()
with brozzler.Browser(chrome_exe=chrome_exe) as browser: with brozzler.Browser(chrome_exe=chrome_exe) as browser:
with pytest.raises(brozzler.ProxyError): with pytest.raises(brozzler.ProxyError):
worker.brozzle_page(browser, site, page) worker.brozzle_page(browser, site, page)
def test_try_login(httpd): def test_try_login(httpd):
"""Test try_login behavior. """Test try_login behavior."""
"""
response_urls = [] response_urls = []
def on_response(msg): def on_response(msg):
response_urls.append(msg['params']['response']['url']) response_urls.append(msg["params"]["response"]["url"])
chrome_exe = brozzler.suggest_default_chrome_exe() chrome_exe = brozzler.suggest_default_chrome_exe()
form_url = 'http://localhost:%s/site11/form1.html' % httpd.server_port form_url = "http://localhost:%s/site11/form1.html" % httpd.server_port
form_url_other = 'http://localhost:%s/site11/form2.html' % httpd.server_port form_url_other = "http://localhost:%s/site11/form2.html" % httpd.server_port
favicon_url = 'http://localhost:%s/favicon.ico' % httpd.server_port favicon_url = "http://localhost:%s/favicon.ico" % httpd.server_port
login_url = 'http://localhost:%s/login-action' % httpd.server_port login_url = "http://localhost:%s/login-action" % httpd.server_port
# When username and password are defined and initial page has login form, # When username and password are defined and initial page has login form,
# detect login form, submit login, and then return to the initial page. # detect login form, submit login, and then return to the initial page.
username = 'user1' username = "user1"
password = 'pass1' password = "pass1"
with brozzler.Browser(chrome_exe=chrome_exe) as browser: with brozzler.Browser(chrome_exe=chrome_exe) as browser:
browser.browse_page(form_url, username=username, password=password, browser.browse_page(
on_response=on_response) form_url, username=username, password=password, on_response=on_response
)
assert len(response_urls) == 4 assert len(response_urls) == 4
assert response_urls[0] == form_url assert response_urls[0] == form_url
assert response_urls[1] == favicon_url assert response_urls[1] == favicon_url
@ -285,11 +301,15 @@ def test_try_login(httpd):
# We are now supporting a different type of form, we'll test that here. # We are now supporting a different type of form, we'll test that here.
response_urls = [] response_urls = []
username = 'user1' username = "user1"
password = 'pass1' password = "pass1"
with brozzler.Browser(chrome_exe=chrome_exe) as browser: with brozzler.Browser(chrome_exe=chrome_exe) as browser:
browser.browse_page(form_url_other, username=username, password=password, browser.browse_page(
on_response=on_response) form_url_other,
username=username,
password=password,
on_response=on_response,
)
assert len(response_urls) == 4 assert len(response_urls) == 4
assert response_urls[0] == form_url_other assert response_urls[0] == form_url_other
assert response_urls[1] == favicon_url assert response_urls[1] == favicon_url
@ -306,10 +326,16 @@ def test_try_login(httpd):
# when the page doesn't have a form with username/password, don't submit it # when the page doesn't have a form with username/password, don't submit it
response_urls = [] response_urls = []
form_without_login_url = 'http://localhost:%s/site11/form-no-login.html' % httpd.server_port form_without_login_url = (
"http://localhost:%s/site11/form-no-login.html" % httpd.server_port
)
with brozzler.Browser(chrome_exe=chrome_exe) as browser: with brozzler.Browser(chrome_exe=chrome_exe) as browser:
browser.browse_page(form_without_login_url, username=username, browser.browse_page(
password=password, on_response=on_response) form_without_login_url,
username=username,
password=password,
on_response=on_response,
)
assert len(response_urls) == 2 assert len(response_urls) == 2
assert response_urls[0] == form_without_login_url assert response_urls[0] == form_without_login_url
assert response_urls[1] == favicon_url assert response_urls[1] == favicon_url


@ -1,5 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
''' """
test_cli.py - test brozzler commands test_cli.py - test brozzler commands
Copyright (C) 2017 Internet Archive Copyright (C) 2017 Internet Archive
@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
''' """
import brozzler.cli import brozzler.cli
import pkg_resources import pkg_resources
@ -23,59 +23,62 @@ import pytest
import subprocess import subprocess
import doublethink import doublethink
def cli_commands(): def cli_commands():
commands = set(pkg_resources.get_entry_map( commands = set(pkg_resources.get_entry_map("brozzler")["console_scripts"].keys())
'brozzler')['console_scripts'].keys()) commands.remove("brozzler-wayback")
commands.remove('brozzler-wayback')
try: try:
import gunicorn import gunicorn
except ImportError: except ImportError:
commands.remove('brozzler-dashboard') commands.remove("brozzler-dashboard")
try: try:
import pywb import pywb
except ImportError: except ImportError:
commands.remove('brozzler-easy') commands.remove("brozzler-easy")
return commands return commands
@pytest.mark.parametrize('cmd', cli_commands())
@pytest.mark.parametrize("cmd", cli_commands())
def test_call_entrypoint(capsys, cmd): def test_call_entrypoint(capsys, cmd):
entrypoint = pkg_resources.get_entry_map( entrypoint = pkg_resources.get_entry_map("brozzler")["console_scripts"][cmd]
'brozzler')['console_scripts'][cmd]
callable = entrypoint.resolve() callable = entrypoint.resolve()
with pytest.raises(SystemExit): with pytest.raises(SystemExit):
callable(['/whatever/bin/%s' % cmd, '--version']) callable(["/whatever/bin/%s" % cmd, "--version"])
out, err = capsys.readouterr() out, err = capsys.readouterr()
assert out == 'brozzler %s - %s\n' % (brozzler.__version__, cmd) assert out == "brozzler %s - %s\n" % (brozzler.__version__, cmd)
assert err == '' assert err == ""
@pytest.mark.parametrize('cmd', cli_commands())
@pytest.mark.parametrize("cmd", cli_commands())
def test_run_command(capsys, cmd): def test_run_command(capsys, cmd):
proc = subprocess.Popen( proc = subprocess.Popen(
[cmd, '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) [cmd, "--version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
out, err = proc.communicate() out, err = proc.communicate()
assert err == b'' assert err == b""
assert out == ('brozzler %s - %s\n' % ( assert out == ("brozzler %s - %s\n" % (brozzler.__version__, cmd)).encode("ascii")
brozzler.__version__, cmd)).encode('ascii')
def test_rethinkdb_up(): def test_rethinkdb_up():
'''Check that rethinkdb is up and running.''' """Check that rethinkdb is up and running."""
# check that rethinkdb is listening and looks sane # check that rethinkdb is listening and looks sane
rr = doublethink.Rethinker(db='rethinkdb') # built-in db rr = doublethink.Rethinker(db="rethinkdb") # built-in db
tbls = rr.table_list().run() tbls = rr.table_list().run()
assert len(tbls) > 10 assert len(tbls) > 10
# XXX don't know why this test is failing in travis-ci and vagrant while # XXX don't know why this test is failing in travis-ci and vagrant while
# test_call_entrypoint tests pass :( (also fails with capfd) # test_call_entrypoint tests pass :( (also fails with capfd)
@pytest.mark.xfail @pytest.mark.xfail
def test_stop_nonexistent_crawl(capsys): def test_stop_nonexistent_crawl(capsys):
with pytest.raises(SystemExit): with pytest.raises(SystemExit):
brozzler.cli.brozzler_stop_crawl(['brozzler-stop-crawl', '--site=123']) brozzler.cli.brozzler_stop_crawl(["brozzler-stop-crawl", "--site=123"])
out, err = capsys.readouterr() out, err = capsys.readouterr()
assert err.endswith('site not found with id=123\n') assert err.endswith("site not found with id=123\n")
assert out == '' assert out == ""
with pytest.raises(SystemExit): with pytest.raises(SystemExit):
brozzler.cli.brozzler_stop_crawl(['brozzler-stop-crawl', '--job=abc']) brozzler.cli.brozzler_stop_crawl(["brozzler-stop-crawl", "--job=abc"])
out, err = capsys.readouterr() out, err = capsys.readouterr()
assert err.endswith('''job not found with id='abc'\n''') assert err.endswith("""job not found with id='abc'\n""")
assert out == '' assert out == ""

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -1,5 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
''' """
test_units.py - some unit tests for parts of brozzler amenable to that test_units.py - some unit tests for parts of brozzler amenable to that
Copyright (C) 2016-2017 Internet Archive Copyright (C) 2016-2017 Internet Archive
@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
''' """
import pytest import pytest
import http.server import http.server
@ -37,99 +37,131 @@ import threading
from unittest import mock from unittest import mock
logging.basicConfig( logging.basicConfig(
stream=sys.stderr, level=logging.INFO, format=( stream=sys.stderr,
'%(asctime)s %(process)d %(levelname)s %(threadName)s ' level=logging.INFO,
'%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')) format=(
"%(asctime)s %(process)d %(levelname)s %(threadName)s "
"%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s"
),
)
@pytest.fixture(scope='module')
@pytest.fixture(scope="module")
def httpd(request): def httpd(request):
# SimpleHTTPRequestHandler always uses CWD so we have to chdir # SimpleHTTPRequestHandler always uses CWD so we have to chdir
os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs')) os.chdir(os.path.join(os.path.dirname(__file__), "htdocs"))
httpd = http.server.HTTPServer( httpd = http.server.HTTPServer(
('localhost', 0), http.server.SimpleHTTPRequestHandler) ("localhost", 0), http.server.SimpleHTTPRequestHandler
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever) )
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
httpd_thread.start() httpd_thread.start()
def fin(): def fin():
httpd.shutdown() httpd.shutdown()
httpd.server_close() httpd.server_close()
httpd_thread.join() httpd_thread.join()
request.addfinalizer(fin) request.addfinalizer(fin)
return httpd return httpd
def test_robots(httpd): def test_robots(httpd):
''' """
Basic test of robots.txt user-agent substring matching. Basic test of robots.txt user-agent substring matching.
''' """
url = 'http://localhost:%s/' % httpd.server_port url = "http://localhost:%s/" % httpd.server_port
site = brozzler.Site(None, {'seed':url,'user_agent':'im/a/GoOdbot/yep'}) site = brozzler.Site(None, {"seed": url, "user_agent": "im/a/GoOdbot/yep"})
assert brozzler.is_permitted_by_robots(site, url) assert brozzler.is_permitted_by_robots(site, url)
site = brozzler.Site(None, {'seed':url,'user_agent':'im/a bAdBOt/uh huh'}) site = brozzler.Site(None, {"seed": url, "user_agent": "im/a bAdBOt/uh huh"})
assert not brozzler.is_permitted_by_robots(site, url) assert not brozzler.is_permitted_by_robots(site, url)
def test_robots_http_statuses(): def test_robots_http_statuses():
for status in ( for status in (
200, 204, 400, 401, 402, 403, 404, 405, 200,
500, 501, 502, 503, 504, 505): 204,
400,
401,
402,
403,
404,
405,
500,
501,
502,
503,
504,
505,
):
class Handler(http.server.BaseHTTPRequestHandler): class Handler(http.server.BaseHTTPRequestHandler):
def do_GET(self): def do_GET(self):
response = (('HTTP/1.1 %s Meaningless message\r\n' response = (
+ 'Content-length: 0\r\n' (
+ '\r\n') % status).encode('utf-8') "HTTP/1.1 %s Meaningless message\r\n"
+ "Content-length: 0\r\n"
+ "\r\n"
)
% status
).encode("utf-8")
self.connection.sendall(response) self.connection.sendall(response)
# self.send_response(status) # self.send_response(status)
# self.end_headers() # self.end_headers()
httpd = http.server.HTTPServer(('localhost', 0), Handler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever) httpd = http.server.HTTPServer(("localhost", 0), Handler)
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
httpd_thread.start() httpd_thread.start()
try: try:
url = 'http://localhost:%s/' % httpd.server_port url = "http://localhost:%s/" % httpd.server_port
site = brozzler.Site(None, {'seed': url}) site = brozzler.Site(None, {"seed": url})
assert brozzler.is_permitted_by_robots(site, url) assert brozzler.is_permitted_by_robots(site, url)
finally: finally:
httpd.shutdown() httpd.shutdown()
httpd.server_close() httpd.server_close()
httpd_thread.join() httpd_thread.join()
def test_robots_empty_response(): def test_robots_empty_response():
class Handler(http.server.BaseHTTPRequestHandler): class Handler(http.server.BaseHTTPRequestHandler):
def do_GET(self): def do_GET(self):
self.connection.shutdown(socket.SHUT_RDWR) self.connection.shutdown(socket.SHUT_RDWR)
self.connection.close() self.connection.close()
httpd = http.server.HTTPServer(('localhost', 0), Handler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever) httpd = http.server.HTTPServer(("localhost", 0), Handler)
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
httpd_thread.start() httpd_thread.start()
try: try:
url = 'http://localhost:%s/' % httpd.server_port url = "http://localhost:%s/" % httpd.server_port
site = brozzler.Site(None, {'seed': url}) site = brozzler.Site(None, {"seed": url})
assert brozzler.is_permitted_by_robots(site, url) assert brozzler.is_permitted_by_robots(site, url)
finally: finally:
httpd.shutdown() httpd.shutdown()
httpd.server_close() httpd.server_close()
httpd_thread.join() httpd_thread.join()
def test_robots_socket_timeout(): def test_robots_socket_timeout():
stop_hanging = threading.Event() stop_hanging = threading.Event()
class Handler(http.server.BaseHTTPRequestHandler): class Handler(http.server.BaseHTTPRequestHandler):
def do_GET(self): def do_GET(self):
stop_hanging.wait(60) stop_hanging.wait(60)
self.connection.sendall( self.connection.sendall(b"HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n")
b'HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n')
orig_timeout = brozzler.robots._SessionRaiseOn420.timeout orig_timeout = brozzler.robots._SessionRaiseOn420.timeout
httpd = http.server.HTTPServer(('localhost', 0), Handler) httpd = http.server.HTTPServer(("localhost", 0), Handler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever) httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
httpd_thread.start() httpd_thread.start()
try: try:
url = 'http://localhost:%s/' % httpd.server_port url = "http://localhost:%s/" % httpd.server_port
site = brozzler.Site(None, {'seed': url}) site = brozzler.Site(None, {"seed": url})
brozzler.robots._SessionRaiseOn420.timeout = 2 brozzler.robots._SessionRaiseOn420.timeout = 2
assert brozzler.is_permitted_by_robots(site, url) assert brozzler.is_permitted_by_robots(site, url)
finally: finally:
@ -139,20 +171,24 @@ def test_robots_socket_timeout():
httpd.server_close() httpd.server_close()
httpd_thread.join() httpd_thread.join()
def test_robots_dns_failure(): def test_robots_dns_failure():
# .invalid. is guaranteed nonexistent per rfc 6761 # .invalid. is guaranteed nonexistent per rfc 6761
url = 'http://whatever.invalid./' url = "http://whatever.invalid./"
site = brozzler.Site(None, {'seed': url}) site = brozzler.Site(None, {"seed": url})
assert brozzler.is_permitted_by_robots(site, url) assert brozzler.is_permitted_by_robots(site, url)
def test_robots_connection_failure(): def test_robots_connection_failure():
# .invalid. is guaranteed nonexistent per rfc 6761 # .invalid. is guaranteed nonexistent per rfc 6761
url = 'http://localhost:4/' # nobody listens on port 4 url = "http://localhost:4/" # nobody listens on port 4
site = brozzler.Site(None, {'seed': url}) site = brozzler.Site(None, {"seed": url})
assert brozzler.is_permitted_by_robots(site, url) assert brozzler.is_permitted_by_robots(site, url)
def test_scoping(): def test_scoping():
test_scope = yaml.safe_load(''' test_scope = yaml.safe_load(
"""
max_hops: 100 max_hops: 100
accepts: accepts:
- url_match: REGEX_MATCH - url_match: REGEX_MATCH
@ -169,40 +205,73 @@ blocks:
- domain: twitter.com - domain: twitter.com
url_match: REGEX_MATCH url_match: REGEX_MATCH
value: ^.*lang=(?!en).*$ value: ^.*lang=(?!en).*$
''') """
)
site = brozzler.Site(None, { site = brozzler.Site(
'id': 1, 'seed': 'http://example.com/foo/bar?baz=quux#monkey', None,
'scope': test_scope}) {
page = brozzler.Page(None, { "id": 1,
'url': 'http://example.com/foo/bar?baz=quux#monkey', "seed": "http://example.com/foo/bar?baz=quux#monkey",
'site_id': site.id}) "scope": test_scope,
},
)
page = brozzler.Page(
None, {"url": "http://example.com/foo/bar?baz=quux#monkey", "site_id": site.id}
)
assert site.accept_reject_or_neither('http://example.com/foo/bar', page) is True assert site.accept_reject_or_neither("http://example.com/foo/bar", page) is True
assert site.accept_reject_or_neither('http://example.com/foo/baz', page) is None assert site.accept_reject_or_neither("http://example.com/foo/baz", page) is None
assert site.accept_reject_or_neither('http://foo.com/some.mp3', page) is None assert site.accept_reject_or_neither("http://foo.com/some.mp3", page) is None
assert site.accept_reject_or_neither('http://foo.com/blah/audio_file/some.mp3', page) is True assert (
site.accept_reject_or_neither("http://foo.com/blah/audio_file/some.mp3", page)
is True
)
assert site.accept_reject_or_neither('http://a.b.vimeocdn.com/blahblah', page) is True assert (
assert site.accept_reject_or_neither('https://a.b.vimeocdn.com/blahblah', page) is None site.accept_reject_or_neither("http://a.b.vimeocdn.com/blahblah", page) is True
)
assert (
site.accept_reject_or_neither("https://a.b.vimeocdn.com/blahblah", page) is None
)
assert site.accept_reject_or_neither('https://twitter.com/twit', page) is True assert site.accept_reject_or_neither("https://twitter.com/twit", page) is True
assert site.accept_reject_or_neither('https://twitter.com/twit?lang=en', page) is True assert (
assert site.accept_reject_or_neither('https://twitter.com/twit?lang=es', page) is False site.accept_reject_or_neither("https://twitter.com/twit?lang=en", page) is True
)
assert (
site.accept_reject_or_neither("https://twitter.com/twit?lang=es", page) is False
)
assert site.accept_reject_or_neither('https://www.facebook.com/whatevz', page) is True assert (
site.accept_reject_or_neither("https://www.facebook.com/whatevz", page) is True
)
assert (
site.accept_reject_or_neither(
"https://www.youtube.com/watch?v=dUIn5OAPS5s", page
)
is None
)
yt_user_page = brozzler.Page(
None,
{
"url": "https://www.youtube.com/user/SonoraSantaneraVEVO",
"site_id": site.id,
"hops_from_seed": 10,
},
)
assert (
site.accept_reject_or_neither(
"https://www.youtube.com/watch?v=dUIn5OAPS5s", yt_user_page
)
is True
)
assert site.accept_reject_or_neither(
'https://www.youtube.com/watch?v=dUIn5OAPS5s', page) is None
yt_user_page = brozzler.Page(None, {
'url': 'https://www.youtube.com/user/SonoraSantaneraVEVO',
'site_id': site.id, 'hops_from_seed': 10})
assert site.accept_reject_or_neither(
'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page) is True
def test_proxy_down(): def test_proxy_down():
''' """
Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down. Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.
This test needs to cover every possible fetch through the proxy other than This test needs to cover every possible fetch through the proxy other than
@ -211,24 +280,24 @@ def test_proxy_down():
Tests two different kinds of connection error: Tests two different kinds of connection error:
- nothing listening on the port (nobody listens on port 4 :)) - nothing listening on the port (nobody listens on port 4 :))
- port bound but not accepting connections - port bound but not accepting connections
''' """
sock = socket.socket() sock = socket.socket()
sock.bind(('127.0.0.1', 0)) sock.bind(("127.0.0.1", 0))
for not_listening_proxy in ( for not_listening_proxy in ("127.0.0.1:4", "127.0.0.1:%s" % sock.getsockname()[1]):
'127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]): worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
worker = brozzler.BrozzlerWorker( site = brozzler.Site(
frontier=None, proxy=not_listening_proxy) None, {"id": str(uuid.uuid4()), "seed": "http://example.com/"}
site = brozzler.Site(None, { )
'id': str(uuid.uuid4()), 'seed': 'http://example.com/'}) page = brozzler.Page(None, {"url": "http://example.com/"})
page = brozzler.Page(None, {'url': 'http://example.com/'})
# robots.txt fetch # robots.txt fetch
with pytest.raises(brozzler.ProxyError): with pytest.raises(brozzler.ProxyError):
brozzler.is_permitted_by_robots( brozzler.is_permitted_by_robots(
site, 'http://example.com/', proxy=not_listening_proxy) site, "http://example.com/", proxy=not_listening_proxy
)
# youtube-dl fetch # youtube-dl fetch
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir: with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
with pytest.raises(brozzler.ProxyError): with pytest.raises(brozzler.ProxyError):
brozzler.ydl.do_youtube_dl(worker, site, page) brozzler.ydl.do_youtube_dl(worker, site, page)
@ -240,46 +309,57 @@ def test_proxy_down():
with pytest.raises(brozzler.ProxyError): with pytest.raises(brozzler.ProxyError):
worker._warcprox_write_record( worker._warcprox_write_record(
warcprox_address=not_listening_proxy, warcprox_address=not_listening_proxy,
url='test://proxy_down/warcprox_write_record', url="test://proxy_down/warcprox_write_record",
warc_type='metadata', warc_type="metadata",
content_type='text/plain', content_type="text/plain",
payload=b'''payload doesn't matter here''') payload=b"""payload doesn't matter here""",
)
def test_start_stop_backwards_compat(): def test_start_stop_backwards_compat():
site = brozzler.Site(None, {'seed': 'http://example.com/'}) site = brozzler.Site(None, {"seed": "http://example.com/"})
assert len(site.starts_and_stops) == 1 assert len(site.starts_and_stops) == 1
assert site.starts_and_stops[0]['start'] assert site.starts_and_stops[0]["start"]
assert site.starts_and_stops[0]['stop'] is None assert site.starts_and_stops[0]["stop"] is None
assert not 'start_time' in site assert not "start_time" in site
site = brozzler.Site(None, { site = brozzler.Site(
'seed': 'http://example.com/', None,
'start_time': datetime.datetime(2017,1,1)}) {"seed": "http://example.com/", "start_time": datetime.datetime(2017, 1, 1)},
)
assert len(site.starts_and_stops) == 1 assert len(site.starts_and_stops) == 1
assert site.starts_and_stops[0]['start'] == datetime.datetime(2017, 1, 1) assert site.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
assert site.starts_and_stops[0]['stop'] is None assert site.starts_and_stops[0]["stop"] is None
assert not 'start_time' in site assert not "start_time" in site
job = brozzler.Job(None, {'seeds': [{'url':'https://example.com/'}]}) job = brozzler.Job(None, {"seeds": [{"url": "https://example.com/"}]})
assert job.starts_and_stops[0]['start'] assert job.starts_and_stops[0]["start"]
assert job.starts_and_stops[0]['stop'] is None assert job.starts_and_stops[0]["stop"] is None
assert not 'started' in job assert not "started" in job
assert not 'finished' in job assert not "finished" in job
job = brozzler.Job(
None,
{
"seeds": [{"url": "https://example.com/"}],
"started": datetime.datetime(2017, 1, 1),
"finished": datetime.datetime(2017, 1, 2),
},
)
assert job.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
assert job.starts_and_stops[0]["stop"] == datetime.datetime(2017, 1, 2)
assert not "started" in job
assert not "finished" in job
job = brozzler.Job(None, {
'seeds': [{'url':'https://example.com/'}],
'started': datetime.datetime(2017, 1, 1),
'finished': datetime.datetime(2017, 1, 2)})
assert job.starts_and_stops[0]['start'] == datetime.datetime(2017, 1, 1)
assert job.starts_and_stops[0]['stop'] == datetime.datetime(2017, 1, 2)
assert not 'started' in job
assert not 'finished' in job
class Exception1(Exception): class Exception1(Exception):
pass pass
class Exception2(Exception): class Exception2(Exception):
pass pass
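
The tests below drive brozzler's cross-thread exception mechanism, but the hunk boundaries cut out most of the helper-thread bodies. A minimal usage sketch, assuming the brozzler.thread_accept_exceptions() context manager and brozzler.thread_raise(thread, exc_type) helper behave as these tests exercise them, and that brozzler.sleep() is the interruptible sleep they use:

import threading
import time
import brozzler

caught = []

def worker():
    try:
        # exceptions injected via thread_raise are only delivered inside this block
        with brozzler.thread_accept_exceptions():
            brozzler.sleep(30)
    except Exception as e:
        caught.append(e)

th = threading.Thread(target=worker)
th.start()
time.sleep(0.5)  # crude way to let the worker enter the accepting block
brozzler.thread_raise(th, Exception1)  # Exception1 as defined just above
th.join()
assert caught and isinstance(caught[0], Exception1)
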
def test_thread_raise_not_accept(): def test_thread_raise_not_accept():
def never_accept(): def never_accept():
try: try:
@ -297,6 +377,7 @@ def test_thread_raise_not_accept():
th.join() th.join()
assert thread_caught_exception is None assert thread_caught_exception is None
def test_thread_raise_immediate(): def test_thread_raise_immediate():
def accept_immediately(): def accept_immediately():
try: try:
@ -317,13 +398,17 @@ def test_thread_raise_immediate():
assert isinstance(thread_caught_exception, Exception1) assert isinstance(thread_caught_exception, Exception1)
assert time.time() - start < 1.0 assert time.time() - start < 1.0
def test_thread_raise_safe_exit(): def test_thread_raise_safe_exit():
def delay_context_exit(): def delay_context_exit():
gate = brozzler.thread_accept_exceptions() gate = brozzler.thread_accept_exceptions()
orig_exit = type(gate).__exit__ orig_exit = type(gate).__exit__
try: try:
type(gate).__exit__ = lambda self, et, ev, t: ( type(gate).__exit__ = lambda self, et, ev, t: (
brozzler.sleep(2), orig_exit(self, et, ev, t), False)[-1] brozzler.sleep(2),
orig_exit(self, et, ev, t),
False,
)[-1]
with brozzler.thread_accept_exceptions() as gate: with brozzler.thread_accept_exceptions() as gate:
brozzler.sleep(2) brozzler.sleep(2)
except Exception as e: except Exception as e:
@ -345,6 +430,7 @@ def test_thread_raise_safe_exit():
assert thread_caught_exception assert thread_caught_exception
assert isinstance(thread_caught_exception, Exception1) assert isinstance(thread_caught_exception, Exception1)
def test_thread_raise_pending_exception(): def test_thread_raise_pending_exception():
def accept_eventually(): def accept_eventually():
try: try:
@ -365,6 +451,7 @@ def test_thread_raise_pending_exception():
assert isinstance(thread_caught_exception, Exception1) assert isinstance(thread_caught_exception, Exception1)
assert time.time() - start > 1.0 assert time.time() - start > 1.0
def test_thread_raise_second_with_block(): def test_thread_raise_second_with_block():
def two_with_blocks(): def two_with_blocks():
try: try:
@ -393,52 +480,79 @@ def test_thread_raise_second_with_block():
th.join() th.join()
assert isinstance(thread_caught_exception, Exception2) assert isinstance(thread_caught_exception, Exception2)
def test_needs_browsing(): def test_needs_browsing():
# only one test case here right now, which exposed a bug # only one test case here right now, which exposed a bug
class ConvenientHeaders(http.client.HTTPMessage): class ConvenientHeaders(http.client.HTTPMessage):
def __init__(self, headers): def __init__(self, headers):
http.client.HTTPMessage.__init__(self) http.client.HTTPMessage.__init__(self)
for (k, v) in headers.items(): for k, v in headers.items():
self.add_header(k, v) self.add_header(k, v)
page = brozzler.Page(None, { page = brozzler.Page(None, {"url": "http://example.com/a"})
'url':'http://example.com/a'})
spy = brozzler.ydl.YoutubeDLSpy() spy = brozzler.ydl.YoutubeDLSpy()
spy.fetches.append({ spy.fetches.append(
'url': 'http://example.com/a', {
'method': 'HEAD', "url": "http://example.com/a",
'response_code': 301, "method": "HEAD",
'response_headers': ConvenientHeaders({'Location': '/b'})}) "response_code": 301,
spy.fetches.append({ "response_headers": ConvenientHeaders({"Location": "/b"}),
'url': 'http://example.com/b', }
'method': 'GET', )
'response_code': 200, spy.fetches.append(
'response_headers': ConvenientHeaders({ {
'Content-Type': 'application/pdf'})}) "url": "http://example.com/b",
"method": "GET",
"response_code": 200,
"response_headers": ConvenientHeaders({"Content-Type": "application/pdf"}),
}
)
assert not brozzler.worker.BrozzlerWorker._needs_browsing(None, page, spy.fetches)
assert not brozzler.worker.BrozzlerWorker._needs_browsing(
None, page, spy.fetches)
def test_seed_redirect(): def test_seed_redirect():
site = brozzler.Site(None, {'seed': 'http://foo.com/'}) site = brozzler.Site(None, {"seed": "http://foo.com/"})
site.note_seed_redirect('https://foo.com/a/b/c') site.note_seed_redirect("https://foo.com/a/b/c")
assert site.scope == {'accepts': [ assert site.scope == {
{'ssurt': 'com,foo,//http:/',}, "accepts": [
{'ssurt': 'com,foo,//https:/',}]} {
"ssurt": "com,foo,//http:/",
},
{
"ssurt": "com,foo,//https:/",
},
]
}
site = brozzler.Site(None, {'seed': 'https://foo.com/'}) site = brozzler.Site(None, {"seed": "https://foo.com/"})
site.note_seed_redirect('http://foo.com/a/b/c') site.note_seed_redirect("http://foo.com/a/b/c")
assert site.scope == {'accepts': [ assert site.scope == {
{'ssurt': 'com,foo,//https:/',}, "accepts": [
{'ssurt': 'com,foo,//http:/',}]} {
"ssurt": "com,foo,//https:/",
},
{
"ssurt": "com,foo,//http:/",
},
]
}
site = brozzler.Site(None, {"seed": "http://foo.com/"})
site.note_seed_redirect("https://bar.com/a/b/c")
assert site.scope == {
"accepts": [
{
"ssurt": "com,foo,//http:/",
},
{
"ssurt": "com,bar,//https:/a/b/c",
},
]
}
site = brozzler.Site(None, {'seed': 'http://foo.com/'})
site.note_seed_redirect('https://bar.com/a/b/c')
assert site.scope == {'accepts': [
{'ssurt': 'com,foo,//http:/',},
{'ssurt': 'com,bar,//https:/a/b/c',}]}
def test_limit_failures(): def test_limit_failures():
page = mock.Mock() page = mock.Mock()
@ -446,9 +560,9 @@ def test_limit_failures():
page.brozzle_count = 0 page.brozzle_count = 0
site = mock.Mock() site = mock.Mock()
site.status = 'ACTIVE' site.status = "ACTIVE"
site.active_brozzling_time = 0 site.active_brozzling_time = 0
site.starts_and_stops = [{'start':datetime.datetime.utcnow()}] site.starts_and_stops = [{"start": datetime.datetime.utcnow()}]
rr = mock.Mock() rr = mock.Mock()
rr.servers = [mock.Mock()] rr.servers = [mock.Mock()]
@ -458,9 +572,10 @@ def test_limit_failures():
rr.table = mock.Mock( rr.table = mock.Mock(
return_value=mock.Mock( return_value=mock.Mock(
between=mock.Mock( between=mock.Mock(
return_value=mock.Mock( return_value=mock.Mock(limit=mock.Mock(return_value=rethink_query))
limit=mock.Mock( )
return_value=rethink_query))))) )
)
assert rr.table().between().limit().run() == [] assert rr.table().between().limit().run() == []
frontier = brozzler.RethinkDbFrontier(rr) frontier = brozzler.RethinkDbFrontier(rr)
frontier.enforce_time_limit = mock.Mock() frontier.enforce_time_limit = mock.Mock()
@ -475,20 +590,19 @@ def test_limit_failures():
assert page.failed_attempts is None assert page.failed_attempts is None
assert page.brozzle_count == 0 assert page.brozzle_count == 0
assert site.status == 'ACTIVE' assert site.status == "ACTIVE"
worker.brozzle_site(browser, site) worker.brozzle_site(browser, site)
assert page.failed_attempts == 1 assert page.failed_attempts == 1
assert page.brozzle_count == 0 assert page.brozzle_count == 0
assert site.status == 'ACTIVE' assert site.status == "ACTIVE"
worker.brozzle_site(browser, site) worker.brozzle_site(browser, site)
assert page.failed_attempts == 2 assert page.failed_attempts == 2
assert page.brozzle_count == 0 assert page.brozzle_count == 0
assert site.status == 'ACTIVE' assert site.status == "ACTIVE"
worker.brozzle_site(browser, site) worker.brozzle_site(browser, site)
assert page.failed_attempts == 3 assert page.failed_attempts == 3
assert page.brozzle_count == 1 assert page.brozzle_count == 1
assert site.status == 'FINISHED' assert site.status == "FINISHED"


@ -1,5 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
''' """
vagrant-brozzler-new-job.py - runs brozzler-new-job inside the vagrant vm to vagrant-brozzler-new-job.py - runs brozzler-new-job inside the vagrant vm to
queue a job for your vagrant brozzler deployment. queue a job for your vagrant brozzler deployment.
@ -20,30 +20,39 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
''' """
import sys import sys
import os import os
import argparse import argparse
import subprocess import subprocess
def main(argv=[]): def main(argv=[]):
arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0])) arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0]))
arg_parser.add_argument( arg_parser.add_argument(
'job_conf_file', metavar='JOB_CONF_FILE', "job_conf_file",
help='brozzler job configuration file in yaml') metavar="JOB_CONF_FILE",
help="brozzler job configuration file in yaml",
)
args = arg_parser.parse_args(args=argv[1:]) args = arg_parser.parse_args(args=argv[1:])
# cd to path with Vagrantfile so "vagrant ssh" knows what to do # cd to path with Vagrantfile so "vagrant ssh" knows what to do
os.chdir(os.path.dirname(__file__)) os.chdir(os.path.dirname(__file__))
with open(args.job_conf_file, 'rb') as f: with open(args.job_conf_file, "rb") as f:
subprocess.call([ subprocess.call(
'vagrant', 'ssh', '--', [
'f=`mktemp` && cat > $f && ' "vagrant",
'/home/vagrant/brozzler-ve3/bin/python ' "ssh",
'/home/vagrant/brozzler-ve3/bin/brozzler-new-job $f'], "--",
stdin=f) "f=`mktemp` && cat > $f && "
"/home/vagrant/brozzler-ve3/bin/python "
"/home/vagrant/brozzler-ve3/bin/brozzler-new-job $f",
],
stdin=f,
)
if __name__ == '__main__':
if __name__ == "__main__":
main(sys.argv) main(sys.argv)


@ -1,5 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
''' """
vagrant-brozzler-new-site.py - runs brozzler-new-site inside the vagrant vm to vagrant-brozzler-new-site.py - runs brozzler-new-site inside the vagrant vm to
queue a site for your vagrant brozzler deployment. queue a site for your vagrant brozzler deployment.
@ -23,61 +23,69 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
''' """
import sys import sys
import os import os
import argparse import argparse
import subprocess import subprocess
try: try:
from shlex import quote from shlex import quote
except: except:
from pipes import quote from pipes import quote
def main(argv=[]): def main(argv=[]):
arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0])) arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0]))
arg_parser.add_argument('seed', metavar='SEED', help='seed url') arg_parser.add_argument("seed", metavar="SEED", help="seed url")
arg_parser.add_argument( arg_parser.add_argument(
'--time-limit', dest='time_limit', default=None, "--time-limit",
help='time limit in seconds for this site') dest="time_limit",
default=None,
help="time limit in seconds for this site",
)
arg_parser.add_argument( arg_parser.add_argument(
'--ignore-robots', dest='ignore_robots', action='store_true', "--ignore-robots",
help='ignore robots.txt for this site') dest="ignore_robots",
action="store_true",
help="ignore robots.txt for this site",
)
arg_parser.add_argument( arg_parser.add_argument(
'--warcprox-meta', dest='warcprox_meta', "--warcprox-meta",
dest="warcprox_meta",
help=( help=(
'Warcprox-Meta http request header to send with each request; ' "Warcprox-Meta http request header to send with each request; "
'must be a json blob, ignored unless warcprox features are ' "must be a json blob, ignored unless warcprox features are "
'enabled')) "enabled"
arg_parser.add_argument( ),
'-q', '--quiet', dest='quiet', action='store_true') )
arg_parser.add_argument( arg_parser.add_argument("-q", "--quiet", dest="quiet", action="store_true")
'-v', '--verbose', dest='verbose', action='store_true') arg_parser.add_argument("-v", "--verbose", dest="verbose", action="store_true")
args = arg_parser.parse_args(args=argv[1:]) args = arg_parser.parse_args(args=argv[1:])
options = [] options = []
if args.time_limit: if args.time_limit:
options.append('--time-limit=%s' % args.time_limit) options.append("--time-limit=%s" % args.time_limit)
if args.ignore_robots: if args.ignore_robots:
options.append('--ignore-robots') options.append("--ignore-robots")
if args.warcprox_meta: if args.warcprox_meta:
# I think this shell escaping is correct? # I think this shell escaping is correct?
options.append( options.append("--warcprox-meta=%s" % quote(args.warcprox_meta))
'--warcprox-meta=%s' % quote(args.warcprox_meta))
if args.quiet: if args.quiet:
options.append('--quiet') options.append("--quiet")
if args.verbose: if args.verbose:
options.append('--verbose') options.append("--verbose")
# cd to path with Vagrantfile so "vagrant ssh" knows what to do # cd to path with Vagrantfile so "vagrant ssh" knows what to do
os.chdir(os.path.dirname(__file__)) os.chdir(os.path.dirname(__file__))
cmd = ( cmd = (
'/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site ' "/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site " "%s %s"
'%s %s') % (' '.join(options), args.seed) ) % (" ".join(options), args.seed)
subprocess.call(['vagrant', 'ssh', '--', cmd]) subprocess.call(["vagrant", "ssh", "--", cmd])
if __name__ == '__main__':
if __name__ == "__main__":
main(sys.argv) main(sys.argv)