mirror of https://github.com/internetarchive/brozzler.git
synced 2025-04-19 07:15:52 -04:00

Use black, enforce with GitHub Actions
parent c4620c3018  commit 8b23430a87

31  .github/workflows/python-formatting.yml  (vendored, normal file)
@@ -0,0 +1,31 @@
name: Python Formatting Check

on:
  push:
    branches:
      - main
      - master
  pull_request:
    branches:
      - main
      - master

jobs:
  formatting:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python 3.8
        uses: actions/setup-python@v5
        with:
          python-version: '3.8'
      - name: Create virtual environment
        run: python -m venv venv

      - name: Install black
        run: |
          ./venv/bin/pip install --upgrade pip
          ./venv/bin/pip install black

      - name: Run formatting check
        run: make ck-format
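The workflow sets up Python 3.8 in a fresh virtual environment, installs black into it, and then runs make ck-format, so the job fails whenever any tracked Python file is not black-formatted. A rough local equivalent is sketched below; it assumes black has already been installed into ./venv, as the Makefile in this commit expects.

import subprocess

# Sketch of what the "Run formatting check" step amounts to: black's check
# mode exits nonzero if any file would be reformatted, which fails the job.
result = subprocess.run(["./venv/bin/black", "--check", "."])
raise SystemExit(result.returncode)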
2  .gitignore  (vendored)
@@ -2,3 +2,5 @@
*.diff
.*.sw*
/brozzler.egg-info/
venv
.idea
7  Makefile  (normal file)
@@ -0,0 +1,7 @@
.PHONY: format
format:
	venv/bin/black -t py35 -t py36 -t py37 -t py38 -t py39 -t py310 -t py311 -t py312 .

.PHONY: ck-format
ck-format:
	venv/bin/black --check .
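Taken together, make format rewrites the tree in place (targeting Python 3.5 through 3.12 syntax) and make ck-format is the read-only check that the workflow above invokes. Below is a minimal illustration, using lines that appear in the first Python hunk that follows (which appears to be brozzler's package __init__), of the kind of rewrite black performs: single-quoted string literals become double-quoted, and long call sites are split one argument per line with a trailing comma.

import logging

# before make format (old brozzler style):
#     logging.addLevelName(logging.TRACE, 'TRACE')
# after make format (black):
logging.TRACE = (logging.NOTSET + logging.DEBUG) // 2
logging.addLevelName(logging.TRACE, "TRACE")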
@@ -19,33 +19,41 @@ limitations under the License.

import logging
from pkg_resources import get_distribution as _get_distribution
__version__ = _get_distribution('brozzler').version

__version__ = _get_distribution("brozzler").version


class ShutdownRequested(Exception):
    pass


class NothingToClaim(Exception):
    pass


class CrawlStopped(Exception):
    pass


class PageInterstitialShown(Exception):
    pass


class ProxyError(Exception):
    pass


class ReachedTimeLimit(Exception):
    pass


class ReachedLimit(Exception):
    def __init__(self, http_error=None, warcprox_meta=None, http_payload=None):
        import json

        if http_error:
            if "warcprox-meta" in http_error.headers:
                self.warcprox_meta = json.loads(
                    http_error.headers["warcprox-meta"])
                self.warcprox_meta = json.loads(http_error.headers["warcprox-meta"])
            else:
                self.warcprox_meta = None
            self.http_payload = http_error.read()
@@ -55,28 +63,39 @@ class ReachedLimit(Exception):

    def __repr__(self):
        return "ReachedLimit(warcprox_meta=%r,http_payload=%r)" % (
            self.warcprox_meta if hasattr(self, 'warcprox_meta') else None,
            self.http_payload if hasattr(self, 'http_payload') else None)
            self.warcprox_meta if hasattr(self, "warcprox_meta") else None,
            self.http_payload if hasattr(self, "http_payload") else None,
        )

    def __str__(self):
        return self.__repr__()


# monkey-patch log levels TRACE and NOTICE
logging.TRACE = (logging.NOTSET + logging.DEBUG) // 2


def _logger_trace(self, msg, *args, **kwargs):
    if self.isEnabledFor(logging.TRACE):
        self._log(logging.TRACE, msg, args, **kwargs)


logging.Logger.trace = _logger_trace
logging.trace = logging.root.trace
logging.addLevelName(logging.TRACE, 'TRACE')
logging.addLevelName(logging.TRACE, "TRACE")

logging.NOTICE = (logging.INFO + logging.WARN) // 2


def _logger_notice(self, msg, *args, **kwargs):
    if self.isEnabledFor(logging.NOTICE):
        self._log(logging.NOTICE, msg, args, **kwargs)


logging.Logger.notice = _logger_notice
logging.notice = logging.root.notice
logging.addLevelName(logging.NOTICE, 'NOTICE')
logging.addLevelName(logging.NOTICE, "NOTICE")


# see https://github.com/internetarchive/brozzler/issues/91
def _logging_handler_handle(self, record):
@@ -91,9 +110,13 @@ def _logging_handler_handle(self, record):
            except:
                pass
    return rv


logging.Handler.handle = _logging_handler_handle

_behaviors = None


def behaviors(behaviors_dir=None):
    """Return list of JS behaviors loaded from YAML file.

@@ -101,35 +124,43 @@ def behaviors(behaviors_dir=None):
    `js-templates/`. Defaults to brozzler dir.
    """
    import os, yaml, string

    global _behaviors
    if _behaviors is None:
        d = behaviors_dir or os.path.dirname(__file__)
        behaviors_yaml = os.path.join(d, 'behaviors.yaml')
        behaviors_yaml = os.path.join(d, "behaviors.yaml")
        with open(behaviors_yaml) as fin:
            _behaviors = yaml.safe_load(fin)
    return _behaviors


def behavior_script(url, template_parameters=None, behaviors_dir=None):
    '''
    """
    Returns the javascript behavior string populated with template_parameters.
    '''
    """
    import re, logging, json

    for behavior in behaviors(behaviors_dir=behaviors_dir):
        if re.match(behavior['url_regex'], url):
        if re.match(behavior["url_regex"], url):
            parameters = dict()
            if 'default_parameters' in behavior:
                parameters.update(behavior['default_parameters'])
            if "default_parameters" in behavior:
                parameters.update(behavior["default_parameters"])
            if template_parameters:
                parameters.update(template_parameters)
            template = jinja2_environment(behaviors_dir).get_template(
                behavior['behavior_js_template'])
                behavior["behavior_js_template"]
            )
            script = template.render(parameters)
            logging.info(
                'using template=%r populated with parameters=%r for %r',
                behavior['behavior_js_template'], json.dumps(parameters), url)
                "using template=%r populated with parameters=%r for %r",
                behavior["behavior_js_template"],
                json.dumps(parameters),
                url,
            )
            return script
    return None


class ThreadExceptionGate:
    logger = logging.getLogger(__module__ + "." + __qualname__)

@@ -142,8 +173,7 @@ class ThreadExceptionGate:
    def __enter__(self):
        assert self.thread == threading.current_thread()
        if self.pending_exception:
            self.logger.info(
                'raising pending exception %s', self.pending_exception)
            self.logger.info("raising pending exception %s", self.pending_exception)
            tmp = self.pending_exception
            self.pending_exception = None
            raise tmp
@@ -154,25 +184,32 @@ class ThreadExceptionGate:
    def __exit__(self, exc_type, exc_value, traceback):
        assert self.thread == threading.current_thread()
        self.ok_to_raise.clear()
        return False # don't swallow exception
        return False  # don't swallow exception

    def queue_exception(self, e):
        with self.lock:
            if self.pending_exception:
                self.logger.warning(
                    '%r already pending for thread %r, discarding %r',
                    self.pending_exception, self.thread, e)
                    "%r already pending for thread %r, discarding %r",
                    self.pending_exception,
                    self.thread,
                    e,
                )
            else:
                self.pending_exception = e

    def __repr__(self):
        return '<ThreadExceptionGate(%s)>' % self.thread
        return "<ThreadExceptionGate(%s)>" % self.thread


import threading

_thread_exception_gates = {}
_thread_exception_gates_lock = threading.Lock()


def thread_exception_gate(thread=None):
    '''
    """
    Returns a `ThreadExceptionGate` for `thread` (current thread by default).

    `ThreadExceptionGate` is a context manager which allows exceptions to be
@@ -191,7 +228,7 @@ def thread_exception_gate(thread=None):
    is queued, and raised immediately if and when the thread enters the
    context. Only one exception will be queued this way at a time, others are
    discarded.
    '''
    """
    if not thread:
        thread = threading.current_thread()

@@ -201,10 +238,12 @@ def thread_exception_gate(thread=None):

    return _thread_exception_gates[thread]


thread_accept_exceptions = thread_exception_gate


def thread_raise(thread, exctype):
    '''
    """
    Raises or queues the exception `exctype` for the thread `thread`.

    See the documentation on the function `thread_exception_gate()` for more
@@ -218,40 +257,43 @@ def thread_raise(thread, exctype):
    Raises:
        TypeError if `exctype` is not a class
        ValueError, SystemError in case of unexpected problems
    '''
    """
    import ctypes, inspect, threading, logging

    if not inspect.isclass(exctype):
        raise TypeError(
            'cannot raise %s, only exception types can be raised (not '
            'instances)' % exctype)
            "cannot raise %s, only exception types can be raised (not "
            "instances)" % exctype
        )

    gate = thread_exception_gate(thread)
    with gate.lock:
        if gate.ok_to_raise.is_set() and thread.is_alive():
            gate.ok_to_raise.clear()
            logging.info('raising %s in thread %s', exctype, thread)
            logging.info("raising %s in thread %s", exctype, thread)
            res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
                ctypes.c_long(thread.ident), ctypes.py_object(exctype))
                ctypes.c_long(thread.ident), ctypes.py_object(exctype)
            )
            if res == 0:
                raise ValueError(
                    'invalid thread id? thread.ident=%s' % thread.ident)
                raise ValueError("invalid thread id? thread.ident=%s" % thread.ident)
            elif res != 1:
                # if it returns a number greater than one, you're in trouble,
                # and you should call it again with exc=NULL to revert the effect
                ctypes.pythonapi.PyThreadState_SetAsyncExc(thread.ident, 0)
                raise SystemError('PyThreadState_SetAsyncExc failed')
                raise SystemError("PyThreadState_SetAsyncExc failed")
        else:
            logging.info('queueing %s for thread %s', exctype, thread)
            logging.info("queueing %s for thread %s", exctype, thread)
            gate.queue_exception(exctype)


def sleep(duration):
    '''
    """
    Sleeps for duration seconds in increments of 0.5 seconds.

    Use this so that the sleep can be interrupted by thread_raise().
    '''
    """
    import time

    start = time.time()
    while True:
        elapsed = time.time() - start
@@ -259,32 +301,41 @@ def sleep(duration):
            break
        time.sleep(min(duration - elapsed, 0.5))


_jinja2_env = None


def jinja2_environment(behaviors_dir=None):
    global _jinja2_env
    if not _jinja2_env:
        import os, jinja2, json

        if behaviors_dir:
            _loader = jinja2.FileSystemLoader(os.path.join(behaviors_dir,
                'js-templates'))
            _loader = jinja2.FileSystemLoader(
                os.path.join(behaviors_dir, "js-templates")
            )
        else:
            _loader=jinja2.PackageLoader('brozzler', 'js-templates')
            _loader = jinja2.PackageLoader("brozzler", "js-templates")
        _jinja2_env = jinja2.Environment(loader=_loader, auto_reload=False)
        _jinja2_env.filters['json'] = json.dumps
        _jinja2_env.filters["json"] = json.dumps
    return _jinja2_env


import urlcanon


def _remove_query(url):
    url.question_mark = b''
    url.query = b''
    url.question_mark = b""
    url.query = b""


# XXX chop off path after last slash??
site_surt_canon = urlcanon.Canonicalizer(
    urlcanon.semantic.steps + [_remove_query])
site_surt_canon = urlcanon.Canonicalizer(urlcanon.semantic.steps + [_remove_query])

import doublethink
import datetime
EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(
    tzinfo=doublethink.UTC)

EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=doublethink.UTC)

# we could make this configurable if there's a good reason
MAX_PAGE_FAILURES = 3
@@ -294,10 +345,31 @@ from brozzler.robots import is_permitted_by_robots
from brozzler.frontier import RethinkDbFrontier
from brozzler.browser import Browser, BrowserPool, BrowsingException
from brozzler.model import (
    new_job, new_job_file, new_site, Job, Page, Site, InvalidJobConf)
    new_job,
    new_job_file,
    new_site,
    Job,
    Page,
    Site,
    InvalidJobConf,
)
from brozzler.cli import suggest_default_chrome_exe

__all__ = ['Page', 'Site', 'BrozzlerWorker', 'is_permitted_by_robots',
        'RethinkDbFrontier', 'Browser', 'BrowserPool', 'BrowsingException',
        'new_job', 'new_site', 'Job', 'new_job_file', 'InvalidJobConf',
        'sleep', 'thread_accept_exceptions', 'thread_raise']
__all__ = [
    "Page",
    "Site",
    "BrozzlerWorker",
    "is_permitted_by_robots",
    "RethinkDbFrontier",
    "Browser",
    "BrowserPool",
    "BrowsingException",
    "new_job",
    "new_site",
    "Job",
    "new_job_file",
    "InvalidJobConf",
    "sleep",
    "thread_accept_exceptions",
    "thread_raise",
]
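The hunks above are pure reformatting, but they pass through brozzler's thread-interruption machinery (thread_exception_gate, thread_raise, and the interruptible sleep). As a usage sketch, not taken from this commit: a worker thread opens the gate around interruptible work, and another thread injects an exception, which is either raised asynchronously or queued until the gate is next entered.

import threading
import brozzler

def work():
    # entering the gate marks this thread as willing to accept async exceptions
    with brozzler.thread_accept_exceptions():
        brozzler.sleep(30)  # the interruptible sleep defined above

t = threading.Thread(target=work)
t.start()
# raised in `t` immediately if it is inside the gate, otherwise queued
brozzler.thread_raise(t, brozzler.ShutdownRequested)
t.join()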
File diff suppressed because it is too large
@ -1,4 +1,4 @@
|
||||
'''
|
||||
"""
|
||||
brozzler/chrome.py - manages the chrome/chromium browser for brozzler
|
||||
|
||||
Copyright (C) 2014-2023 Internet Archive
|
||||
@ -14,7 +14,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
'''
|
||||
"""
|
||||
|
||||
import logging
|
||||
import urllib.request
|
||||
@ -31,39 +31,43 @@ import json
|
||||
import tempfile
|
||||
import sys
|
||||
|
||||
|
||||
def check_version(chrome_exe):
|
||||
'''
|
||||
"""
|
||||
Raises SystemExit if `chrome_exe` is not a supported browser version.
|
||||
|
||||
Must run in the main thread to have the desired effect.
|
||||
'''
|
||||
"""
|
||||
# mac$ /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --version
|
||||
# Google Chrome 64.0.3282.140
|
||||
# Google Chrome 64.0.3282.140
|
||||
# mac$ /Applications/Google\ Chrome\ Canary.app/Contents/MacOS/Google\ Chrome\ Canary --version
|
||||
# Google Chrome 66.0.3341.0 canary
|
||||
# linux$ chromium-browser --version
|
||||
# Using PPAPI flash.
|
||||
# --ppapi-flash-path=/usr/lib/adobe-flashplugin/libpepflashplayer.so --ppapi-flash-version=
|
||||
# Chromium 61.0.3163.100 Built on Ubuntu , running on Ubuntu 16.04
|
||||
cmd = [chrome_exe, '--version']
|
||||
cmd = [chrome_exe, "--version"]
|
||||
out = subprocess.check_output(cmd, timeout=60)
|
||||
m = re.search(br'(Chromium|Google Chrome) ([\d.]+)', out)
|
||||
m = re.search(rb"(Chromium|Google Chrome) ([\d.]+)", out)
|
||||
if not m:
|
||||
sys.exit(
|
||||
'unable to parse browser version from output of '
|
||||
'%r: %r' % (subprocess.list2cmdline(cmd), out))
|
||||
"unable to parse browser version from output of "
|
||||
"%r: %r" % (subprocess.list2cmdline(cmd), out)
|
||||
)
|
||||
version_str = m.group(2).decode()
|
||||
major_version = int(version_str.split('.')[0])
|
||||
major_version = int(version_str.split(".")[0])
|
||||
if major_version < 64:
|
||||
sys.exit('brozzler requires chrome/chromium version 64 or '
|
||||
'later but %s reports version %s' % (
|
||||
chrome_exe, version_str))
|
||||
sys.exit(
|
||||
"brozzler requires chrome/chromium version 64 or "
|
||||
"later but %s reports version %s" % (chrome_exe, version_str)
|
||||
)
|
||||
|
||||
|
||||
class Chrome:
|
||||
logger = logging.getLogger(__module__ + '.' + __qualname__)
|
||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||
|
||||
def __init__(self, chrome_exe, port=9222, ignore_cert_errors=False):
|
||||
'''
|
||||
"""
|
||||
Initializes instance of this class.
|
||||
|
||||
Doesn't start the browser, start() does that.
|
||||
@ -73,7 +77,7 @@ class Chrome:
|
||||
port: chrome debugging protocol port (default 9222)
|
||||
ignore_cert_errors: configure chrome to accept all certs (default
|
||||
False)
|
||||
'''
|
||||
"""
|
||||
self.port = port
|
||||
self.chrome_exe = chrome_exe
|
||||
self.ignore_cert_errors = ignore_cert_errors
|
||||
@ -81,63 +85,72 @@ class Chrome:
|
||||
self.chrome_process = None
|
||||
|
||||
def __enter__(self):
|
||||
'''
|
||||
"""
|
||||
Returns websocket url to chrome window with about:blank loaded.
|
||||
'''
|
||||
"""
|
||||
return self.start()
|
||||
|
||||
def __exit__(self, *args):
|
||||
self.stop()
|
||||
|
||||
def _init_cookie_db(self, cookie_db):
|
||||
cookie_dir = os.path.join(self._chrome_user_data_dir, 'Default')
|
||||
cookie_location = os.path.join(cookie_dir, 'Cookies')
|
||||
self.logger.debug('cookie DB provided, writing to %s', cookie_location)
|
||||
cookie_dir = os.path.join(self._chrome_user_data_dir, "Default")
|
||||
cookie_location = os.path.join(cookie_dir, "Cookies")
|
||||
self.logger.debug("cookie DB provided, writing to %s", cookie_location)
|
||||
os.makedirs(cookie_dir, exist_ok=True)
|
||||
|
||||
try:
|
||||
with open(cookie_location, 'wb') as cookie_file:
|
||||
with open(cookie_location, "wb") as cookie_file:
|
||||
cookie_file.write(cookie_db)
|
||||
except OSError:
|
||||
self.logger.error(
|
||||
'exception writing cookie file at %s',
|
||||
cookie_location, exc_info=True)
|
||||
"exception writing cookie file at %s", cookie_location, exc_info=True
|
||||
)
|
||||
|
||||
def persist_and_read_cookie_db(self):
|
||||
cookie_location = os.path.join(
|
||||
self._chrome_user_data_dir, 'Default', 'Cookies')
|
||||
cookie_location = os.path.join(self._chrome_user_data_dir, "Default", "Cookies")
|
||||
self.logger.debug(
|
||||
'marking cookies persistent then reading file into memory: %s',
|
||||
cookie_location)
|
||||
"marking cookies persistent then reading file into memory: %s",
|
||||
cookie_location,
|
||||
)
|
||||
try:
|
||||
with sqlite3.connect(cookie_location) as conn:
|
||||
cur = conn.cursor()
|
||||
cur.execute('UPDATE cookies SET is_persistent = 1')
|
||||
cur.execute("UPDATE cookies SET is_persistent = 1")
|
||||
except sqlite3.Error:
|
||||
try:
|
||||
# db schema changed around version 66, this is the old schema
|
||||
with sqlite3.connect(cookie_location) as conn:
|
||||
cur = conn.cursor()
|
||||
cur.execute('UPDATE cookies SET persistent = 1')
|
||||
cur.execute("UPDATE cookies SET persistent = 1")
|
||||
except sqlite3.Error:
|
||||
self.logger.error(
|
||||
'exception updating cookie DB %s', cookie_location,
|
||||
exc_info=True)
|
||||
"exception updating cookie DB %s", cookie_location, exc_info=True
|
||||
)
|
||||
|
||||
cookie_db = None
|
||||
try:
|
||||
with open(cookie_location, 'rb') as cookie_file:
|
||||
with open(cookie_location, "rb") as cookie_file:
|
||||
cookie_db = cookie_file.read()
|
||||
except OSError:
|
||||
self.logger.error(
|
||||
'exception reading from cookie DB file %s',
|
||||
cookie_location, exc_info=True)
|
||||
"exception reading from cookie DB file %s",
|
||||
cookie_location,
|
||||
exc_info=True,
|
||||
)
|
||||
return cookie_db
|
||||
|
||||
def start(self, proxy=None, cookie_db=None, disk_cache_dir=None,
|
||||
disk_cache_size=None, websocket_timeout=60,
|
||||
window_height=900, window_width=1400):
|
||||
'''
|
||||
def start(
|
||||
self,
|
||||
proxy=None,
|
||||
cookie_db=None,
|
||||
disk_cache_dir=None,
|
||||
disk_cache_size=None,
|
||||
websocket_timeout=60,
|
||||
window_height=900,
|
||||
window_width=1400,
|
||||
):
|
||||
"""
|
||||
Starts chrome/chromium process.
|
||||
|
||||
Args:
|
||||
@ -154,103 +167,126 @@ class Chrome:
|
||||
window_height, window_width: window height and width, in pixels
|
||||
Returns:
|
||||
websocket url to chrome window with about:blank loaded
|
||||
'''
|
||||
"""
|
||||
# these can raise exceptions
|
||||
self._home_tmpdir = tempfile.TemporaryDirectory()
|
||||
self._chrome_user_data_dir = os.path.join(
|
||||
self._home_tmpdir.name, 'chrome-user-data')
|
||||
self._home_tmpdir.name, "chrome-user-data"
|
||||
)
|
||||
if cookie_db:
|
||||
self._init_cookie_db(cookie_db)
|
||||
self._shutdown.clear()
|
||||
|
||||
new_env = os.environ.copy()
|
||||
new_env['HOME'] = self._home_tmpdir.name
|
||||
new_env["HOME"] = self._home_tmpdir.name
|
||||
chrome_args = [
|
||||
self.chrome_exe,
|
||||
'-v',
|
||||
'--headless',
|
||||
'--remote-debugging-port=%s' % self.port,
|
||||
'--use-mock-keychain', # mac thing
|
||||
'--user-data-dir=%s' % self._chrome_user_data_dir,
|
||||
'--disable-background-networking', '--disable-breakpad',
|
||||
'--disable-renderer-backgrounding', '--disable-hang-monitor',
|
||||
'--disable-background-timer-throttling', '--mute-audio',
|
||||
'--disable-web-sockets',
|
||||
f'--window-size={window_width},{window_height}',
|
||||
'--no-default-browser-check',
|
||||
'--disable-first-run-ui', '--no-first-run',
|
||||
'--homepage=about:blank', '--disable-direct-npapi-requests',
|
||||
'--disable-web-security', '--disable-notifications',
|
||||
'--disable-extensions', '--disable-save-password-bubble',
|
||||
'--disable-sync']
|
||||
self.chrome_exe,
|
||||
"-v",
|
||||
"--headless",
|
||||
"--remote-debugging-port=%s" % self.port,
|
||||
"--use-mock-keychain", # mac thing
|
||||
"--user-data-dir=%s" % self._chrome_user_data_dir,
|
||||
"--disable-background-networking",
|
||||
"--disable-breakpad",
|
||||
"--disable-renderer-backgrounding",
|
||||
"--disable-hang-monitor",
|
||||
"--disable-background-timer-throttling",
|
||||
"--mute-audio",
|
||||
"--disable-web-sockets",
|
||||
f"--window-size={window_width},{window_height}",
|
||||
"--no-default-browser-check",
|
||||
"--disable-first-run-ui",
|
||||
"--no-first-run",
|
||||
"--homepage=about:blank",
|
||||
"--disable-direct-npapi-requests",
|
||||
"--disable-web-security",
|
||||
"--disable-notifications",
|
||||
"--disable-extensions",
|
||||
"--disable-save-password-bubble",
|
||||
"--disable-sync",
|
||||
]
|
||||
|
||||
extra_chrome_args = os.environ.get('BROZZLER_EXTRA_CHROME_ARGS')
|
||||
extra_chrome_args = os.environ.get("BROZZLER_EXTRA_CHROME_ARGS")
|
||||
if extra_chrome_args:
|
||||
chrome_args.extend(extra_chrome_args.split())
|
||||
if disk_cache_dir:
|
||||
chrome_args.append('--disk-cache-dir=%s' % disk_cache_dir)
|
||||
chrome_args.append("--disk-cache-dir=%s" % disk_cache_dir)
|
||||
if disk_cache_size:
|
||||
chrome_args.append('--disk-cache-size=%s' % disk_cache_size)
|
||||
chrome_args.append("--disk-cache-size=%s" % disk_cache_size)
|
||||
if self.ignore_cert_errors:
|
||||
chrome_args.append('--ignore-certificate-errors')
|
||||
chrome_args.append("--ignore-certificate-errors")
|
||||
if proxy:
|
||||
chrome_args.append('--proxy-server=%s' % proxy)
|
||||
chrome_args.append('about:blank')
|
||||
self.logger.info('running: %r', subprocess.list2cmdline(chrome_args))
|
||||
chrome_args.append("--proxy-server=%s" % proxy)
|
||||
chrome_args.append("about:blank")
|
||||
self.logger.info("running: %r", subprocess.list2cmdline(chrome_args))
|
||||
# start_new_session - new process group so we can kill the whole group
|
||||
self.chrome_process = subprocess.Popen(
|
||||
chrome_args, env=new_env, start_new_session=True,
|
||||
stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0)
|
||||
chrome_args,
|
||||
env=new_env,
|
||||
start_new_session=True,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
bufsize=0,
|
||||
)
|
||||
self._out_reader_thread = threading.Thread(
|
||||
target=self._read_stderr_stdout,
|
||||
name='ChromeOutReaderThread:%s' % self.port, daemon=True)
|
||||
target=self._read_stderr_stdout,
|
||||
name="ChromeOutReaderThread:%s" % self.port,
|
||||
daemon=True,
|
||||
)
|
||||
self._out_reader_thread.start()
|
||||
self.logger.info('chrome running, pid %s' % self.chrome_process.pid)
|
||||
self.logger.info("chrome running, pid %s" % self.chrome_process.pid)
|
||||
|
||||
return self._websocket_url(timeout_sec=websocket_timeout)
|
||||
|
||||
def _websocket_url(self, timeout_sec = 60):
|
||||
json_url = 'http://localhost:%s/json' % self.port
|
||||
def _websocket_url(self, timeout_sec=60):
|
||||
json_url = "http://localhost:%s/json" % self.port
|
||||
# make this a member variable so that kill -QUIT reports it
|
||||
self._start = time.time()
|
||||
self._last_warning = self._start
|
||||
while True:
|
||||
try:
|
||||
raw_json = urllib.request.urlopen(json_url, timeout=30).read()
|
||||
all_debug_info = json.loads(raw_json.decode('utf-8'))
|
||||
debug_info = [x for x in all_debug_info
|
||||
if x['url'] == 'about:blank']
|
||||
all_debug_info = json.loads(raw_json.decode("utf-8"))
|
||||
debug_info = [x for x in all_debug_info if x["url"] == "about:blank"]
|
||||
|
||||
if debug_info and 'webSocketDebuggerUrl' in debug_info[0]:
|
||||
self.logger.debug('%s returned %s', json_url, raw_json)
|
||||
url = debug_info[0]['webSocketDebuggerUrl']
|
||||
if debug_info and "webSocketDebuggerUrl" in debug_info[0]:
|
||||
self.logger.debug("%s returned %s", json_url, raw_json)
|
||||
url = debug_info[0]["webSocketDebuggerUrl"]
|
||||
self.logger.info(
|
||||
'got chrome window websocket debug url %s from %s',
|
||||
url, json_url)
|
||||
"got chrome window websocket debug url %s from %s",
|
||||
url,
|
||||
json_url,
|
||||
)
|
||||
return url
|
||||
except brozzler.ShutdownRequested:
|
||||
raise
|
||||
except Exception as e:
|
||||
if time.time() - self._last_warning > 30:
|
||||
self.logger.warning(
|
||||
'problem with %s (will keep trying until timeout '
|
||||
'of %d seconds): %s', json_url, timeout_sec, e)
|
||||
"problem with %s (will keep trying until timeout "
|
||||
"of %d seconds): %s",
|
||||
json_url,
|
||||
timeout_sec,
|
||||
e,
|
||||
)
|
||||
self._last_warning = time.time()
|
||||
finally:
|
||||
e = None
|
||||
if self.chrome_process:
|
||||
if time.time() - self._start > timeout_sec:
|
||||
e = Exception(
|
||||
'killing chrome, failed to retrieve %s after '
|
||||
'%s seconds' % (
|
||||
json_url, time.time() - self._start))
|
||||
"killing chrome, failed to retrieve %s after "
|
||||
"%s seconds" % (json_url, time.time() - self._start)
|
||||
)
|
||||
elif self.chrome_process.poll() is not None:
|
||||
e = Exception(
|
||||
'chrome process died with status %s' % self.chrome_process.poll())
|
||||
"chrome process died with status %s"
|
||||
% self.chrome_process.poll()
|
||||
)
|
||||
else:
|
||||
time.sleep(0.5)
|
||||
else:
|
||||
e = Exception('??? self.chrome_process is not set ???')
|
||||
e = Exception("??? self.chrome_process is not set ???")
|
||||
if e:
|
||||
self.stop()
|
||||
raise e
|
||||
@ -258,11 +294,13 @@ class Chrome:
|
||||
def _read_stderr_stdout(self):
|
||||
# XXX select doesn't work on windows
|
||||
def readline_nonblock(f):
|
||||
buf = b''
|
||||
buf = b""
|
||||
try:
|
||||
while not self._shutdown.is_set() and (
|
||||
len(buf) == 0 or buf[-1] != 0xa) and select.select(
|
||||
[f],[],[],0.5)[0]:
|
||||
while (
|
||||
not self._shutdown.is_set()
|
||||
and (len(buf) == 0 or buf[-1] != 0xA)
|
||||
and select.select([f], [], [], 0.5)[0]
|
||||
):
|
||||
buf += f.read(1)
|
||||
except (ValueError, OSError):
|
||||
# When the chrome process crashes, stdout & stderr are closed
|
||||
@ -276,16 +314,16 @@ class Chrome:
|
||||
buf = readline_nonblock(self.chrome_process.stdout)
|
||||
if buf:
|
||||
self.logger.trace(
|
||||
'chrome pid %s STDOUT %s',
|
||||
self.chrome_process.pid, buf)
|
||||
"chrome pid %s STDOUT %s", self.chrome_process.pid, buf
|
||||
)
|
||||
|
||||
buf = readline_nonblock(self.chrome_process.stderr)
|
||||
if buf:
|
||||
self.logger.trace(
|
||||
'chrome pid %s STDERR %s',
|
||||
self.chrome_process.pid, buf)
|
||||
"chrome pid %s STDERR %s", self.chrome_process.pid, buf
|
||||
)
|
||||
except:
|
||||
self.logger.error('unexpected exception', exc_info=True)
|
||||
self.logger.error("unexpected exception", exc_info=True)
|
||||
|
||||
def stop(self):
|
||||
if not self.chrome_process or self._shutdown.is_set():
|
||||
@ -294,8 +332,7 @@ class Chrome:
|
||||
|
||||
timeout_sec = 300
|
||||
if self.chrome_process.poll() is None:
|
||||
self.logger.info(
|
||||
'terminating chrome pgid %s', self.chrome_process.pid)
|
||||
self.logger.info("terminating chrome pgid %s", self.chrome_process.pid)
|
||||
|
||||
os.killpg(self.chrome_process.pid, signal.SIGTERM)
|
||||
t0 = time.time()
|
||||
@ -306,12 +343,14 @@ class Chrome:
|
||||
if status is not None:
|
||||
if status == 0:
|
||||
self.logger.info(
|
||||
'chrome pid %s exited normally',
|
||||
self.chrome_process.pid)
|
||||
"chrome pid %s exited normally", self.chrome_process.pid
|
||||
)
|
||||
else:
|
||||
self.logger.warning(
|
||||
'chrome pid %s exited with nonzero status %s',
|
||||
self.chrome_process.pid, status)
|
||||
"chrome pid %s exited with nonzero status %s",
|
||||
self.chrome_process.pid,
|
||||
status,
|
||||
)
|
||||
|
||||
# XXX I would like to forcefully kill the process group
|
||||
# here to guarantee no orphaned chromium subprocesses hang
|
||||
@ -321,14 +360,18 @@ class Chrome:
|
||||
time.sleep(0.5)
|
||||
|
||||
self.logger.warning(
|
||||
'chrome pid %s still alive %.1f seconds after sending '
|
||||
'SIGTERM, sending SIGKILL', self.chrome_process.pid,
|
||||
time.time() - t0)
|
||||
"chrome pid %s still alive %.1f seconds after sending "
|
||||
"SIGTERM, sending SIGKILL",
|
||||
self.chrome_process.pid,
|
||||
time.time() - t0,
|
||||
)
|
||||
os.killpg(self.chrome_process.pid, signal.SIGKILL)
|
||||
status = self.chrome_process.wait()
|
||||
self.logger.warning(
|
||||
'chrome pid %s reaped (status=%s) after killing with '
|
||||
'SIGKILL', self.chrome_process.pid, status)
|
||||
"chrome pid %s reaped (status=%s) after killing with " "SIGKILL",
|
||||
self.chrome_process.pid,
|
||||
status,
|
||||
)
|
||||
|
||||
finally:
|
||||
self.chrome_process.stdout.close()
|
||||
@ -337,8 +380,7 @@ class Chrome:
|
||||
self._home_tmpdir.cleanup()
|
||||
except:
|
||||
self.logger.error(
|
||||
'exception deleting %s', self._home_tmpdir,
|
||||
exc_info=True)
|
||||
"exception deleting %s", self._home_tmpdir, exc_info=True
|
||||
)
|
||||
self._out_reader_thread.join()
|
||||
self.chrome_process = None
|
||||
|
||||
|
978  brozzler/cli.py
File diff suppressed because it is too large
@ -1,4 +1,4 @@
|
||||
'''
|
||||
"""
|
||||
brozzler/dashboard/__init__.py - flask app for brozzler dashboard, defines api
|
||||
endspoints etc
|
||||
|
||||
@ -15,17 +15,20 @@ distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
'''
|
||||
"""
|
||||
|
||||
import logging
|
||||
import sys
|
||||
|
||||
try:
|
||||
import flask
|
||||
except ImportError as e:
|
||||
logging.critical(
|
||||
'%s: %s\n\nYou might need to run "pip install '
|
||||
'brozzler[dashboard]".\nSee README.rst for more information.',
|
||||
type(e).__name__, e)
|
||||
'%s: %s\n\nYou might need to run "pip install '
|
||||
'brozzler[dashboard]".\nSee README.rst for more information.',
|
||||
type(e).__name__,
|
||||
e,
|
||||
)
|
||||
sys.exit(1)
|
||||
import doublethink
|
||||
import json
|
||||
@ -41,33 +44,44 @@ app = flask.Flask(__name__)
|
||||
|
||||
# configure with environment variables
|
||||
SETTINGS = {
|
||||
'RETHINKDB_SERVERS': os.environ.get(
|
||||
'BROZZLER_RETHINKDB_SERVERS', 'localhost').split(','),
|
||||
'RETHINKDB_DB': os.environ.get('BROZZLER_RETHINKDB_DB', 'brozzler'),
|
||||
'WAYBACK_BASEURL': os.environ.get(
|
||||
'WAYBACK_BASEURL', 'http://localhost:8880/brozzler'),
|
||||
'DASHBOARD_PORT': os.environ.get('DASHBOARD_PORT', '8000'),
|
||||
'DASHBOARD_INTERFACE': os.environ.get('DASHBOARD_INTERFACE', 'localhost')
|
||||
"RETHINKDB_SERVERS": os.environ.get(
|
||||
"BROZZLER_RETHINKDB_SERVERS", "localhost"
|
||||
).split(","),
|
||||
"RETHINKDB_DB": os.environ.get("BROZZLER_RETHINKDB_DB", "brozzler"),
|
||||
"WAYBACK_BASEURL": os.environ.get(
|
||||
"WAYBACK_BASEURL", "http://localhost:8880/brozzler"
|
||||
),
|
||||
"DASHBOARD_PORT": os.environ.get("DASHBOARD_PORT", "8000"),
|
||||
"DASHBOARD_INTERFACE": os.environ.get("DASHBOARD_INTERFACE", "localhost"),
|
||||
}
|
||||
rr = doublethink.Rethinker(
|
||||
SETTINGS['RETHINKDB_SERVERS'], db=SETTINGS['RETHINKDB_DB'])
|
||||
rr = doublethink.Rethinker(SETTINGS["RETHINKDB_SERVERS"], db=SETTINGS["RETHINKDB_DB"])
|
||||
_svc_reg = None
|
||||
|
||||
|
||||
def service_registry():
|
||||
global _svc_reg
|
||||
if not _svc_reg:
|
||||
_svc_reg = doublethink.ServiceRegistry(rr)
|
||||
return _svc_reg
|
||||
|
||||
|
||||
@app.route("/api/sites/<site_id>/queued_count")
|
||||
@app.route("/api/site/<site_id>/queued_count")
|
||||
def queued_count(site_id):
|
||||
reql = rr.table("pages").between(
|
||||
[site_id, 0, False, r.minval], [site_id, 0, False, r.maxval],
|
||||
index="priority_by_site").count()
|
||||
reql = (
|
||||
rr.table("pages")
|
||||
.between(
|
||||
[site_id, 0, False, r.minval],
|
||||
[site_id, 0, False, r.maxval],
|
||||
index="priority_by_site",
|
||||
)
|
||||
.count()
|
||||
)
|
||||
logging.debug("querying rethinkdb: %s", reql)
|
||||
count = reql.run()
|
||||
return flask.jsonify(count=count)
|
||||
|
||||
|
||||
@app.route("/api/sites/<site_id>/queue")
|
||||
@app.route("/api/site/<site_id>/queue")
|
||||
def queue(site_id):
|
||||
@ -75,38 +89,52 @@ def queue(site_id):
|
||||
start = flask.request.args.get("start", 0)
|
||||
end = flask.request.args.get("end", start + 90)
|
||||
reql = rr.table("pages").between(
|
||||
[site_id, 0, False, r.minval], [site_id, 0, False, r.maxval],
|
||||
index="priority_by_site")[start:end]
|
||||
[site_id, 0, False, r.minval],
|
||||
[site_id, 0, False, r.maxval],
|
||||
index="priority_by_site",
|
||||
)[start:end]
|
||||
logging.debug("querying rethinkdb: %s", reql)
|
||||
queue_ = reql.run()
|
||||
return flask.jsonify(queue_=list(queue_))
|
||||
|
||||
|
||||
@app.route("/api/sites/<site_id>/pages_count")
|
||||
@app.route("/api/site/<site_id>/pages_count")
|
||||
@app.route("/api/sites/<site_id>/page_count")
|
||||
@app.route("/api/site/<site_id>/page_count")
|
||||
def page_count(site_id):
|
||||
reql = rr.table("pages").between(
|
||||
reql = (
|
||||
rr.table("pages")
|
||||
.between(
|
||||
[site_id, 1, False, r.minval],
|
||||
[site_id, r.maxval, False, r.maxval],
|
||||
index="priority_by_site").count()
|
||||
index="priority_by_site",
|
||||
)
|
||||
.count()
|
||||
)
|
||||
logging.debug("querying rethinkdb: %s", reql)
|
||||
count = reql.run()
|
||||
return flask.jsonify(count=count)
|
||||
|
||||
|
||||
@app.route("/api/sites/<site_id>/pages")
|
||||
@app.route("/api/site/<site_id>/pages")
|
||||
def pages(site_id):
|
||||
"""Pages already crawled."""
|
||||
start = int(flask.request.args.get("start", 0))
|
||||
end = int(flask.request.args.get("end", start + 90))
|
||||
reql = rr.table("pages").between(
|
||||
[site_id, 1, r.minval], [site_id, r.maxval, r.maxval],
|
||||
index="least_hops").order_by(index="least_hops")[start:end]
|
||||
reql = (
|
||||
rr.table("pages")
|
||||
.between(
|
||||
[site_id, 1, r.minval], [site_id, r.maxval, r.maxval], index="least_hops"
|
||||
)
|
||||
.order_by(index="least_hops")[start:end]
|
||||
)
|
||||
logging.debug("querying rethinkdb: %s", reql)
|
||||
pages_ = reql.run()
|
||||
return flask.jsonify(pages=list(pages_))
|
||||
|
||||
|
||||
@app.route("/api/pages/<page_id>")
|
||||
@app.route("/api/page/<page_id>")
|
||||
def page(page_id):
|
||||
@ -115,6 +143,7 @@ def page(page_id):
|
||||
page_ = reql.run()
|
||||
return flask.jsonify(page_)
|
||||
|
||||
|
||||
@app.route("/api/pages/<page_id>/yaml")
|
||||
@app.route("/api/page/<page_id>/yaml")
|
||||
def page_yaml(page_id):
|
||||
@ -122,8 +151,9 @@ def page_yaml(page_id):
|
||||
logging.debug("querying rethinkdb: %s", reql)
|
||||
page_ = reql.run()
|
||||
return app.response_class(
|
||||
yaml.dump(page_, default_flow_style=False),
|
||||
mimetype="application/yaml")
|
||||
yaml.dump(page_, default_flow_style=False), mimetype="application/yaml"
|
||||
)
|
||||
|
||||
|
||||
@app.route("/api/sites/<site_id>")
|
||||
@app.route("/api/site/<site_id>")
|
||||
@ -135,6 +165,7 @@ def site(site_id):
|
||||
s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
|
||||
return flask.jsonify(s)
|
||||
|
||||
|
||||
@app.route("/api/sites/<site_id>/yaml")
|
||||
@app.route("/api/site/<site_id>/yaml")
|
||||
def site_yaml(site_id):
|
||||
@ -142,8 +173,9 @@ def site_yaml(site_id):
|
||||
logging.debug("querying rethinkdb: %s", reql)
|
||||
site_ = reql.run()
|
||||
return app.response_class(
|
||||
yaml.dump(site_, default_flow_style=False),
|
||||
mimetype="application/yaml")
|
||||
yaml.dump(site_, default_flow_style=False), mimetype="application/yaml"
|
||||
)
|
||||
|
||||
|
||||
@app.route("/api/stats/<bucket>")
|
||||
def stats(bucket):
|
||||
@ -152,6 +184,7 @@ def stats(bucket):
|
||||
stats_ = reql.run()
|
||||
return flask.jsonify(stats_)
|
||||
|
||||
|
||||
@app.route("/api/jobs/<job_id>/sites")
|
||||
@app.route("/api/job/<job_id>/sites")
|
||||
def sites(job_id):
|
||||
@ -168,6 +201,7 @@ def sites(job_id):
|
||||
s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
|
||||
return flask.jsonify(sites=sites_)
|
||||
|
||||
|
||||
@app.route("/api/jobless-sites")
|
||||
def jobless_sites():
|
||||
# XXX inefficient (unindexed) query
|
||||
@ -180,6 +214,7 @@ def jobless_sites():
|
||||
s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
|
||||
return flask.jsonify(sites=sites_)
|
||||
|
||||
|
||||
@app.route("/api/jobs/<job_id>")
|
||||
@app.route("/api/job/<job_id>")
|
||||
def job(job_id):
|
||||
@ -192,6 +227,7 @@ def job(job_id):
|
||||
job_ = reql.run()
|
||||
return flask.jsonify(job_)
|
||||
|
||||
|
||||
@app.route("/api/jobs/<job_id>/yaml")
|
||||
@app.route("/api/job/<job_id>/yaml")
|
||||
def job_yaml(job_id):
|
||||
@ -203,19 +239,22 @@ def job_yaml(job_id):
|
||||
logging.debug("querying rethinkdb: %s", reql)
|
||||
job_ = reql.run()
|
||||
return app.response_class(
|
||||
yaml.dump(job_, default_flow_style=False),
|
||||
mimetype="application/yaml")
|
||||
yaml.dump(job_, default_flow_style=False), mimetype="application/yaml"
|
||||
)
|
||||
|
||||
|
||||
@app.route("/api/workers")
|
||||
def workers():
|
||||
workers_ = service_registry().available_services("brozzler-worker")
|
||||
return flask.jsonify(workers=list(workers_))
|
||||
|
||||
|
||||
@app.route("/api/services")
|
||||
def services():
|
||||
services_ = service_registry().available_services()
|
||||
return flask.jsonify(services=list(services_))
|
||||
|
||||
|
||||
@app.route("/api/jobs")
|
||||
def jobs():
|
||||
reql = rr.table("jobs").order_by(r.desc("id"))
|
||||
@ -223,20 +262,24 @@ def jobs():
|
||||
jobs_ = list(reql.run())
|
||||
return flask.jsonify(jobs=jobs_)
|
||||
|
||||
|
||||
@app.route("/api/config")
|
||||
def config():
|
||||
return flask.jsonify(config=SETTINGS)
|
||||
|
||||
|
||||
@app.route("/api/<path:path>")
|
||||
@app.route("/api", defaults={"path":""})
|
||||
@app.route("/api", defaults={"path": ""})
|
||||
def api404(path):
|
||||
flask.abort(404)
|
||||
|
||||
|
||||
@app.route("/", defaults={"path": ""})
|
||||
@app.route("/<path:path>")
|
||||
def root(path):
|
||||
return flask.render_template("index.html")
|
||||
|
||||
|
||||
try:
|
||||
import gunicorn.app.base
|
||||
from gunicorn.six import iteritems
|
||||
@ -255,8 +298,12 @@ try:
|
||||
|
||||
def load_config(self):
|
||||
config = dict(
|
||||
[(key, value) for key, value in iteritems(self.options)
|
||||
if key in self.cfg.settings and value is not None])
|
||||
[
|
||||
(key, value)
|
||||
for key, value in iteritems(self.options)
|
||||
if key in self.cfg.settings and value is not None
|
||||
]
|
||||
)
|
||||
for key, value in iteritems(config):
|
||||
self.cfg.set(key.lower(), value)
|
||||
self.cfg.set("logger_class", BypassGunicornLogging)
|
||||
@ -270,37 +317,42 @@ try:
|
||||
GunicornBrozzlerDashboard(app, options).run()
|
||||
|
||||
except ImportError:
|
||||
|
||||
def run():
|
||||
logging.info("running brozzler-dashboard using simple flask app.run")
|
||||
app.run(host=SETTINGS['DASHBOARD_INTERFACE'], port=SETTINGS['DASHBOARD_PORT'])
|
||||
app.run(host=SETTINGS["DASHBOARD_INTERFACE"], port=SETTINGS["DASHBOARD_PORT"])
|
||||
|
||||
|
||||
def main(argv=None):
|
||||
import argparse
|
||||
import brozzler.cli
|
||||
|
||||
argv = argv or sys.argv
|
||||
arg_parser = argparse.ArgumentParser(
|
||||
prog=os.path.basename(argv[0]),
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
description=(
|
||||
'brozzler-dashboard - web application for viewing brozzler '
|
||||
'crawl status'),
|
||||
epilog=(
|
||||
'brozzler-dashboard has no command line options, but can be '
|
||||
'configured using the following environment variables:\n\n'
|
||||
' BROZZLER_RETHINKDB_SERVERS rethinkdb servers, e.g. '
|
||||
'db0.foo.org,db0.foo.org:38015,db1.foo.org (default: '
|
||||
'localhost)\n'
|
||||
' BROZZLER_RETHINKDB_DB rethinkdb database name '
|
||||
'(default: brozzler)\n'
|
||||
' WAYBACK_BASEURL base url for constructing wayback '
|
||||
'links (default http://localhost:8880/brozzler)'
|
||||
' DASHBOARD_PORT brozzler-dashboard listening port (default: 8000)\n'
|
||||
' DASHBOARD_INTERFACE brozzler-dashboard network interface binding (default: localhost)'))
|
||||
prog=os.path.basename(argv[0]),
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
description=(
|
||||
"brozzler-dashboard - web application for viewing brozzler " "crawl status"
|
||||
),
|
||||
epilog=(
|
||||
"brozzler-dashboard has no command line options, but can be "
|
||||
"configured using the following environment variables:\n\n"
|
||||
" BROZZLER_RETHINKDB_SERVERS rethinkdb servers, e.g. "
|
||||
"db0.foo.org,db0.foo.org:38015,db1.foo.org (default: "
|
||||
"localhost)\n"
|
||||
" BROZZLER_RETHINKDB_DB rethinkdb database name "
|
||||
"(default: brozzler)\n"
|
||||
" WAYBACK_BASEURL base url for constructing wayback "
|
||||
"links (default http://localhost:8880/brozzler)"
|
||||
" DASHBOARD_PORT brozzler-dashboard listening port (default: 8000)\n"
|
||||
" DASHBOARD_INTERFACE brozzler-dashboard network interface binding (default: localhost)"
|
||||
),
|
||||
)
|
||||
brozzler.cli.add_common_options(arg_parser, argv)
|
||||
args = arg_parser.parse_args(args=argv[1:])
|
||||
brozzler.cli.configure_logging(args)
|
||||
run()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
|
227  brozzler/easy.py
@ -1,5 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
'''
|
||||
"""
|
||||
brozzler-easy - brozzler-worker, warcprox, pywb, and brozzler-dashboard all
|
||||
working together in a single process
|
||||
|
||||
@ -16,10 +16,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
'''
|
||||
"""
|
||||
|
||||
import sys
|
||||
import logging
|
||||
|
||||
try:
|
||||
import warcprox
|
||||
import warcprox.main
|
||||
@ -30,9 +31,11 @@ try:
|
||||
import brozzler.dashboard
|
||||
except ImportError as e:
|
||||
logging.critical(
|
||||
'%s: %s\n\nYou might need to run "pip install '
|
||||
'brozzler[easy]".\nSee README.rst for more information.',
|
||||
type(e).__name__, e)
|
||||
'%s: %s\n\nYou might need to run "pip install '
|
||||
'brozzler[easy]".\nSee README.rst for more information.',
|
||||
type(e).__name__,
|
||||
e,
|
||||
)
|
||||
sys.exit(1)
|
||||
import argparse
|
||||
import brozzler
|
||||
@ -46,76 +49,112 @@ import doublethink
|
||||
import traceback
|
||||
import socketserver
|
||||
|
||||
|
||||
def _build_arg_parser(argv=None):
|
||||
argv = argv or sys.argv
|
||||
arg_parser = argparse.ArgumentParser(
|
||||
formatter_class=brozzler.cli.BetterArgumentDefaultsHelpFormatter,
|
||||
prog=os.path.basename(argv[0]), description=(
|
||||
'brozzler-easy - easy deployment of brozzler, with '
|
||||
'brozzler-worker, warcprox, pywb, and brozzler-dashboard all '
|
||||
'running in a single process'))
|
||||
formatter_class=brozzler.cli.BetterArgumentDefaultsHelpFormatter,
|
||||
prog=os.path.basename(argv[0]),
|
||||
description=(
|
||||
"brozzler-easy - easy deployment of brozzler, with "
|
||||
"brozzler-worker, warcprox, pywb, and brozzler-dashboard all "
|
||||
"running in a single process"
|
||||
),
|
||||
)
|
||||
|
||||
# common args
|
||||
brozzler.cli.add_rethinkdb_options(arg_parser)
|
||||
arg_parser.add_argument(
|
||||
'-d', '--warcs-dir', dest='warcs_dir', default='./warcs',
|
||||
help='where to write warcs')
|
||||
"-d",
|
||||
"--warcs-dir",
|
||||
dest="warcs_dir",
|
||||
default="./warcs",
|
||||
help="where to write warcs",
|
||||
)
|
||||
|
||||
# warcprox args
|
||||
arg_parser.add_argument(
|
||||
'-c', '--cacert', dest='cacert',
|
||||
default='./%s-warcprox-ca.pem' % socket.gethostname(),
|
||||
help=(
|
||||
'warcprox CA certificate file; if file does not exist, it '
|
||||
'will be created'))
|
||||
"-c",
|
||||
"--cacert",
|
||||
dest="cacert",
|
||||
default="./%s-warcprox-ca.pem" % socket.gethostname(),
|
||||
help=(
|
||||
"warcprox CA certificate file; if file does not exist, it "
|
||||
"will be created"
|
||||
),
|
||||
)
|
||||
arg_parser.add_argument(
|
||||
'--certs-dir', dest='certs_dir',
|
||||
default='./%s-warcprox-ca' % socket.gethostname(),
|
||||
help='where warcprox will store and load generated certificates')
|
||||
"--certs-dir",
|
||||
dest="certs_dir",
|
||||
default="./%s-warcprox-ca" % socket.gethostname(),
|
||||
help="where warcprox will store and load generated certificates",
|
||||
)
|
||||
arg_parser.add_argument(
|
||||
'--onion-tor-socks-proxy', dest='onion_tor_socks_proxy',
|
||||
default=None, help=(
|
||||
'host:port of tor socks proxy, used only to connect to '
|
||||
'.onion sites'))
|
||||
"--onion-tor-socks-proxy",
|
||||
dest="onion_tor_socks_proxy",
|
||||
default=None,
|
||||
help=("host:port of tor socks proxy, used only to connect to " ".onion sites"),
|
||||
)
|
||||
|
||||
# brozzler-worker args
|
||||
arg_parser.add_argument(
|
||||
'-e', '--chrome-exe', dest='chrome_exe',
|
||||
default=brozzler.cli.suggest_default_chrome_exe(),
|
||||
help='executable to use to invoke chrome')
|
||||
"-e",
|
||||
"--chrome-exe",
|
||||
dest="chrome_exe",
|
||||
default=brozzler.cli.suggest_default_chrome_exe(),
|
||||
help="executable to use to invoke chrome",
|
||||
)
|
||||
arg_parser.add_argument(
|
||||
'-n', '--max-browsers', dest='max_browsers',
|
||||
type=int, default=1, help=(
|
||||
'max number of chrome instances simultaneously '
|
||||
'browsing pages'))
|
||||
"-n",
|
||||
"--max-browsers",
|
||||
dest="max_browsers",
|
||||
type=int,
|
||||
default=1,
|
||||
help=("max number of chrome instances simultaneously " "browsing pages"),
|
||||
)
|
||||
|
||||
# pywb args
|
||||
arg_parser.add_argument(
|
||||
'--pywb-address', dest='pywb_address',
|
||||
default='0.0.0.0',
|
||||
help='pywb wayback address to listen on')
|
||||
"--pywb-address",
|
||||
dest="pywb_address",
|
||||
default="0.0.0.0",
|
||||
help="pywb wayback address to listen on",
|
||||
)
|
||||
arg_parser.add_argument(
|
||||
'--pywb-port', dest='pywb_port', type=int,
|
||||
default=8880, help='pywb wayback port')
|
||||
"--pywb-port",
|
||||
dest="pywb_port",
|
||||
type=int,
|
||||
default=8880,
|
||||
help="pywb wayback port",
|
||||
)
|
||||
|
||||
# dashboard args
|
||||
arg_parser.add_argument(
|
||||
'--dashboard-address', dest='dashboard_address',
|
||||
default='localhost',
|
||||
help='brozzler dashboard address to listen on')
|
||||
"--dashboard-address",
|
||||
dest="dashboard_address",
|
||||
default="localhost",
|
||||
help="brozzler dashboard address to listen on",
|
||||
)
|
||||
arg_parser.add_argument(
|
||||
'--dashboard-port', dest='dashboard_port',
|
||||
type=int, default=8881, help='brozzler dashboard port')
|
||||
"--dashboard-port",
|
||||
dest="dashboard_port",
|
||||
type=int,
|
||||
default=8881,
|
||||
help="brozzler dashboard port",
|
||||
)
|
||||
|
||||
# common at the bottom args
|
||||
brozzler.cli.add_common_options(arg_parser, argv)
|
||||
|
||||
return arg_parser
|
||||
|
||||
|
||||
class ThreadingWSGIServer(
|
||||
socketserver.ThreadingMixIn, wsgiref.simple_server.WSGIServer):
|
||||
socketserver.ThreadingMixIn, wsgiref.simple_server.WSGIServer
|
||||
):
|
||||
pass
|
||||
|
||||
|
||||
class BrozzlerEasyController:
|
||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||
|
||||
@ -123,25 +162,31 @@ class BrozzlerEasyController:
|
||||
self.stop = threading.Event()
|
||||
self.args = args
|
||||
self.warcprox_controller = warcprox.controller.WarcproxController(
|
||||
self._warcprox_opts(args))
|
||||
self._warcprox_opts(args)
|
||||
)
|
||||
self.brozzler_worker = self._init_brozzler_worker(args)
|
||||
self.pywb_httpd = self._init_pywb(args)
|
||||
self.dashboard_httpd = self._init_brozzler_dashboard(args)
|
||||
|
||||
def _init_brozzler_dashboard(self, args):
|
||||
return wsgiref.simple_server.make_server(
|
||||
args.dashboard_address, args.dashboard_port,
|
||||
brozzler.dashboard.app, ThreadingWSGIServer)
|
||||
args.dashboard_address,
|
||||
args.dashboard_port,
|
||||
brozzler.dashboard.app,
|
||||
ThreadingWSGIServer,
|
||||
)
|
||||
|
||||
def _init_brozzler_worker(self, args):
|
||||
rr = doublethink.Rethinker(
|
||||
args.rethinkdb_servers.split(","), args.rethinkdb_db)
|
||||
rr = doublethink.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db)
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
service_registry = doublethink.ServiceRegistry(rr)
|
||||
worker = brozzler.worker.BrozzlerWorker(
|
||||
frontier, service_registry, chrome_exe=args.chrome_exe,
|
||||
proxy='%s:%s' % self.warcprox_controller.proxy.server_address,
|
||||
max_browsers=args.max_browsers)
|
||||
frontier,
|
||||
service_registry,
|
||||
chrome_exe=args.chrome_exe,
|
||||
proxy="%s:%s" % self.warcprox_controller.proxy.server_address,
|
||||
max_browsers=args.max_browsers,
|
||||
)
|
||||
return worker
|
||||
|
||||
def _init_pywb(self, args):
|
||||
@ -152,66 +197,67 @@ class BrozzlerEasyController:
|
||||
brozzler.pywb.monkey_patch_fuzzy_query()
|
||||
brozzler.pywb.monkey_patch_calc_search_range()
|
||||
|
||||
if args.warcs_dir.endswith('/'):
|
||||
if args.warcs_dir.endswith("/"):
|
||||
warcs_dir = args.warcs_dir
|
||||
else:
|
||||
warcs_dir = args.warcs_dir + '/'
|
||||
warcs_dir = args.warcs_dir + "/"
|
||||
|
||||
conf = {
|
||||
'collections': {
|
||||
'brozzler': {
|
||||
'index_paths': brozzler.pywb.RethinkCDXSource(
|
||||
"collections": {
|
||||
"brozzler": {
|
||||
"index_paths": brozzler.pywb.RethinkCDXSource(
|
||||
servers=args.rethinkdb_servers.split(","),
|
||||
db=args.rethinkdb_db, table='captures')
|
||||
db=args.rethinkdb_db,
|
||||
table="captures",
|
||||
)
|
||||
},
|
||||
},
|
||||
# 'enable_http_proxy': True,
|
||||
# 'enable_memento': True,
|
||||
'archive_paths': warcs_dir,
|
||||
'enable_cdx_api': True,
|
||||
'framed_replay': True,
|
||||
'port': args.pywb_port,
|
||||
'enable_auto_colls': False,
|
||||
"archive_paths": warcs_dir,
|
||||
"enable_cdx_api": True,
|
||||
"framed_replay": True,
|
||||
"port": args.pywb_port,
|
||||
"enable_auto_colls": False,
|
||||
}
|
||||
wsgi_app = pywb.framework.wsgi_wrappers.init_app(
|
||||
pywb.webapp.pywb_init.create_wb_router, config=conf,
|
||||
load_yaml=False)
|
||||
pywb.webapp.pywb_init.create_wb_router, config=conf, load_yaml=False
|
||||
)
|
||||
|
||||
# disable is_hop_by_hop restrictions
|
||||
wsgiref.handlers.is_hop_by_hop = lambda x: False
|
||||
return wsgiref.simple_server.make_server(
|
||||
args.pywb_address, args.pywb_port, wsgi_app,
|
||||
ThreadingWSGIServer)
|
||||
args.pywb_address, args.pywb_port, wsgi_app, ThreadingWSGIServer
|
||||
)
|
||||
|
||||
def start(self):
|
||||
self.logger.info('starting warcprox')
|
||||
self.logger.info("starting warcprox")
|
||||
self.warcprox_controller.start()
|
||||
|
||||
# XXX wait til fully started?
|
||||
self.logger.info('starting brozzler-worker')
|
||||
self.logger.info("starting brozzler-worker")
|
||||
self.brozzler_worker.start()
|
||||
|
||||
self.logger.info(
|
||||
'starting pywb at %s:%s', *self.pywb_httpd.server_address)
|
||||
self.logger.info("starting pywb at %s:%s", *self.pywb_httpd.server_address)
|
||||
threading.Thread(target=self.pywb_httpd.serve_forever).start()
|
||||
|
||||
self.logger.info(
|
||||
'starting brozzler-dashboard at %s:%s',
|
||||
*self.dashboard_httpd.server_address)
|
||||
"starting brozzler-dashboard at %s:%s", *self.dashboard_httpd.server_address
|
||||
)
|
||||
threading.Thread(target=self.dashboard_httpd.serve_forever).start()
|
||||
|
||||
def shutdown(self):
|
||||
self.logger.info('shutting down brozzler-dashboard')
|
||||
self.logger.info("shutting down brozzler-dashboard")
|
||||
self.dashboard_httpd.shutdown()
|
||||
|
||||
self.logger.info('shutting down brozzler-worker')
|
||||
self.logger.info("shutting down brozzler-worker")
|
||||
self.brozzler_worker.shutdown_now()
|
||||
# brozzler-worker is fully shut down at this point
|
||||
|
||||
self.logger.info('shutting down pywb')
|
||||
self.logger.info("shutting down pywb")
|
||||
self.pywb_httpd.shutdown()
|
||||
|
||||
self.logger.info('shutting down warcprox')
|
||||
self.logger.info("shutting down warcprox")
|
||||
self.warcprox_controller.shutdown()
|
||||
|
||||
def wait_for_shutdown_request(self):
|
||||
@ -222,14 +268,14 @@ class BrozzlerEasyController:
|
||||
self.shutdown()
|
||||
|
||||
def _warcprox_opts(self, args):
'''
"""
Takes args as produced by the argument parser built by
_build_arg_parser and builds warcprox arguments object suitable to pass
to warcprox.main.init_controller. Copies some arguments, renames some,
populates some with defaults appropriate for brozzler-easy, etc.
'''
"""
warcprox_opts = warcprox.Options()
warcprox_opts.address = 'localhost'
warcprox_opts.address = "localhost"
# let the OS choose an available port; discover it later using
# sock.getsockname()[1]
warcprox_opts.port = 0
@ -237,17 +283,18 @@ class BrozzlerEasyController:
warcprox_opts.certs_dir = args.certs_dir
warcprox_opts.directory = args.warcs_dir
warcprox_opts.gzip = True
warcprox_opts.prefix = 'brozzler'
warcprox_opts.size = 1000 * 1000* 1000
warcprox_opts.prefix = "brozzler"
warcprox_opts.size = 1000 * 1000 * 1000
warcprox_opts.rollover_idle_time = 3 * 60
warcprox_opts.digest_algorithm = 'sha1'
warcprox_opts.digest_algorithm = "sha1"
warcprox_opts.base32 = True
warcprox_opts.stats_db_file = None
warcprox_opts.playback_port = None
warcprox_opts.playback_index_db_file = None
warcprox_opts.rethinkdb_big_table_url = (
'rethinkdb://%s/%s/captures' % (
args.rethinkdb_servers, args.rethinkdb_db))
warcprox_opts.rethinkdb_big_table_url = "rethinkdb://%s/%s/captures" % (
args.rethinkdb_servers,
args.rethinkdb_db,
)
warcprox_opts.queue_size = 500
warcprox_opts.max_threads = None
warcprox_opts.profile = False
@ -259,9 +306,11 @@ class BrozzlerEasyController:
for th in threading.enumerate():
state_strs.append(str(th))
stack = traceback.format_stack(sys._current_frames()[th.ident])
state_strs.append(''.join(stack))
logging.warning('dumping state (caught signal {})\n{}'.format(
signum, '\n'.join(state_strs)))
state_strs.append("".join(stack))
logging.warning(
"dumping state (caught signal {})\n{}".format(signum, "\n".join(state_strs))
)


def main(argv=None):
argv = argv or sys.argv
@ -271,8 +320,8 @@ def main(argv=None):
brozzler.chrome.check_version(args.chrome_exe)

controller = BrozzlerEasyController(args)
signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())
signal.signal(signal.SIGINT, lambda a,b: controller.stop.set())
signal.signal(signal.SIGTERM, lambda a, b: controller.stop.set())
signal.signal(signal.SIGINT, lambda a, b: controller.stop.set())
signal.signal(signal.SIGQUIT, controller.dump_state)
controller.start()
controller.wait_for_shutdown_request()

@ -1,4 +1,4 @@
'''
"""
brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages

Copyright (C) 2014-2018 Internet Archive
@ -14,7 +14,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""

import logging
import brozzler
@ -27,9 +27,11 @@ import urlcanon

r = rdb.RethinkDB()


class UnexpectedDbResult(Exception):
pass


class RethinkDbFrontier:
logger = logging.getLogger(__module__ + "." + __qualname__)

@ -47,40 +49,49 @@ class RethinkDbFrontier:
|
||||
tables = self.rr.table_list().run()
|
||||
if not "sites" in tables:
|
||||
self.logger.info(
|
||||
"creating rethinkdb table 'sites' in database %r",
|
||||
self.rr.dbname)
|
||||
"creating rethinkdb table 'sites' in database %r", self.rr.dbname
|
||||
)
|
||||
self.rr.table_create(
|
||||
"sites", shards=self.shards, replicas=self.replicas).run()
|
||||
self.rr.table("sites").index_create("sites_last_disclaimed", [
|
||||
r.row["status"], r.row["last_disclaimed"]]).run()
|
||||
"sites", shards=self.shards, replicas=self.replicas
|
||||
).run()
|
||||
self.rr.table("sites").index_create(
|
||||
"sites_last_disclaimed", [r.row["status"], r.row["last_disclaimed"]]
|
||||
).run()
|
||||
self.rr.table("sites").index_create("job_id").run()
|
||||
if not "pages" in tables:
|
||||
self.logger.info(
|
||||
"creating rethinkdb table 'pages' in database %r",
|
||||
self.rr.dbname)
|
||||
"creating rethinkdb table 'pages' in database %r", self.rr.dbname
|
||||
)
|
||||
self.rr.table_create(
|
||||
"pages", shards=self.shards, replicas=self.replicas).run()
|
||||
self.rr.table("pages").index_create("priority_by_site", [
|
||||
r.row["site_id"], r.row["brozzle_count"],
|
||||
r.row["claimed"], r.row["priority"]]).run()
|
||||
"pages", shards=self.shards, replicas=self.replicas
|
||||
).run()
|
||||
self.rr.table("pages").index_create(
|
||||
"priority_by_site",
|
||||
[
|
||||
r.row["site_id"],
|
||||
r.row["brozzle_count"],
|
||||
r.row["claimed"],
|
||||
r.row["priority"],
|
||||
],
|
||||
).run()
|
||||
# this index is for displaying pages in a sensible order in the web
|
||||
# console
|
||||
self.rr.table("pages").index_create("least_hops", [
|
||||
r.row["site_id"], r.row["brozzle_count"],
|
||||
r.row["hops_from_seed"]]).run()
|
||||
self.rr.table("pages").index_create(
|
||||
"least_hops",
|
||||
[r.row["site_id"], r.row["brozzle_count"], r.row["hops_from_seed"]],
|
||||
).run()
|
||||
if not "jobs" in tables:
|
||||
self.logger.info(
|
||||
"creating rethinkdb table 'jobs' in database %r",
|
||||
self.rr.dbname)
|
||||
"creating rethinkdb table 'jobs' in database %r", self.rr.dbname
|
||||
)
|
||||
self.rr.table_create(
|
||||
"jobs", shards=self.shards, replicas=self.replicas).run()
|
||||
"jobs", shards=self.shards, replicas=self.replicas
|
||||
).run()
|
||||
|
||||
def _vet_result(self, result, **kwargs):
|
||||
# self.logger.debug("vetting expected=%s result=%s", kwargs, result)
|
||||
# {'replaced': 0, 'errors': 0, 'skipped': 0, 'inserted': 1, 'deleted': 0, 'generated_keys': ['292859c1-4926-4b27-9d87-b2c367667058'], 'unchanged': 0}
|
||||
for k in [
|
||||
"replaced", "errors", "skipped", "inserted", "deleted",
|
||||
"unchanged"]:
|
||||
for k in ["replaced", "errors", "skipped", "inserted", "deleted", "unchanged"]:
|
||||
if k in kwargs:
|
||||
expected = kwargs[k]
|
||||
else:
|
||||
@ -88,81 +99,110 @@ class RethinkDbFrontier:
|
||||
if isinstance(expected, list):
|
||||
if result.get(k) not in kwargs[k]:
|
||||
raise UnexpectedDbResult(
|
||||
"expected %r to be one of %r in %r" % (
|
||||
k, expected, result))
|
||||
"expected %r to be one of %r in %r" % (k, expected, result)
|
||||
)
|
||||
else:
|
||||
if result.get(k) != expected:
|
||||
raise UnexpectedDbResult("expected %r to be %r in %r" % (
|
||||
k, expected, result))
|
||||
raise UnexpectedDbResult(
|
||||
"expected %r to be %r in %r" % (k, expected, result)
|
||||
)
|
||||
|
||||
def claim_sites(self, n=1):
|
||||
self.logger.trace('claiming up to %s sites to brozzle', n)
|
||||
self.logger.trace("claiming up to %s sites to brozzle", n)
|
||||
result = (
|
||||
self.rr.table('sites').get_all(r.args(
|
||||
r.db(self.rr.dbname).table('sites', read_mode='majority')
|
||||
.between(
|
||||
['ACTIVE', r.minval], ['ACTIVE', r.maxval],
|
||||
index='sites_last_disclaimed')
|
||||
.order_by(r.desc('claimed'), 'last_disclaimed')
|
||||
.fold(
|
||||
{}, lambda acc, site: acc.merge(
|
||||
r.branch(
|
||||
site.has_fields('job_id'),
|
||||
r.object(
|
||||
site['job_id'].coerce_to('string'),
|
||||
acc[site['job_id'].coerce_to('string')].default(0).add(1)),
|
||||
{})),
|
||||
emit=lambda acc, site, new_acc: r.branch(
|
||||
r.and_(
|
||||
r.or_(
|
||||
site['claimed'].not_(),
|
||||
site['last_claimed'].lt(r.now().sub(60*60))),
|
||||
r.or_(
|
||||
site.has_fields('max_claimed_sites').not_(),
|
||||
new_acc[site['job_id'].coerce_to('string')].le(site['max_claimed_sites']))),
|
||||
[site['id']], []))
|
||||
.limit(n)))
|
||||
self.rr.table("sites")
|
||||
.get_all(
|
||||
r.args(
|
||||
r.db(self.rr.dbname)
|
||||
.table("sites", read_mode="majority")
|
||||
.between(
|
||||
["ACTIVE", r.minval],
|
||||
["ACTIVE", r.maxval],
|
||||
index="sites_last_disclaimed",
|
||||
)
|
||||
.order_by(r.desc("claimed"), "last_disclaimed")
|
||||
.fold(
|
||||
{},
|
||||
lambda acc, site: acc.merge(
|
||||
r.branch(
|
||||
site.has_fields("job_id"),
|
||||
r.object(
|
||||
site["job_id"].coerce_to("string"),
|
||||
acc[site["job_id"].coerce_to("string")]
|
||||
.default(0)
|
||||
.add(1),
|
||||
),
|
||||
{},
|
||||
)
|
||||
),
|
||||
emit=lambda acc, site, new_acc: r.branch(
|
||||
r.and_(
|
||||
r.or_(
|
||||
site["claimed"].not_(),
|
||||
site["last_claimed"].lt(r.now().sub(60 * 60)),
|
||||
),
|
||||
r.or_(
|
||||
site.has_fields("max_claimed_sites").not_(),
|
||||
new_acc[site["job_id"].coerce_to("string")].le(
|
||||
site["max_claimed_sites"]
|
||||
),
|
||||
),
|
||||
),
|
||||
[site["id"]],
|
||||
[],
|
||||
),
|
||||
)
|
||||
.limit(n)
|
||||
)
|
||||
)
|
||||
.update(
|
||||
# try to avoid a race condition resulting in multiple
|
||||
# brozzler-workers claiming the same site
|
||||
# see https://github.com/rethinkdb/rethinkdb/issues/3235#issuecomment-60283038
|
||||
r.branch(
|
||||
r.or_(
|
||||
r.row['claimed'].not_(),
|
||||
r.row['last_claimed'].lt(r.now().sub(60*60))),
|
||||
{'claimed': True, 'last_claimed': r.now()},
|
||||
{}),
|
||||
return_changes=True)).run()
|
||||
r.row["claimed"].not_(),
|
||||
r.row["last_claimed"].lt(r.now().sub(60 * 60)),
|
||||
),
|
||||
{"claimed": True, "last_claimed": r.now()},
|
||||
{},
|
||||
),
|
||||
return_changes=True,
|
||||
)
|
||||
).run()
|
||||
|
||||
self._vet_result(
|
||||
result, replaced=list(range(n+1)),
|
||||
unchanged=list(range(n+1)))
|
||||
result, replaced=list(range(n + 1)), unchanged=list(range(n + 1))
|
||||
)
|
||||
sites = []
|
||||
for i in range(result["replaced"]):
|
||||
if result["changes"][i]["old_val"]["claimed"]:
|
||||
self.logger.warning(
|
||||
"re-claimed site that was still marked 'claimed' "
|
||||
"because it was last claimed a long time ago "
|
||||
"at %s, and presumably some error stopped it from "
|
||||
"being disclaimed",
|
||||
result["changes"][i]["old_val"]["last_claimed"])
|
||||
"re-claimed site that was still marked 'claimed' "
|
||||
"because it was last claimed a long time ago "
|
||||
"at %s, and presumably some error stopped it from "
|
||||
"being disclaimed",
|
||||
result["changes"][i]["old_val"]["last_claimed"],
|
||||
)
|
||||
site = brozzler.Site(self.rr, result["changes"][i]["new_val"])
|
||||
sites.append(site)
|
||||
self.logger.debug('claimed %s sites', len(sites))
|
||||
self.logger.debug("claimed %s sites", len(sites))
|
||||
if sites:
|
||||
return sites
|
||||
else:
|
||||
raise brozzler.NothingToClaim
|
||||
|
||||
def enforce_time_limit(self, site):
'''
"""
Raises `brozzler.ReachedTimeLimit` if appropriate.
'''
if (site.time_limit and site.time_limit > 0
and site.elapsed() > site.time_limit):
"""
if site.time_limit and site.time_limit > 0 and site.elapsed() > site.time_limit:
self.logger.debug(
"site FINISHED_TIME_LIMIT! time_limit=%s "
"elapsed=%s %s", site.time_limit, site.elapsed(), site)
"site FINISHED_TIME_LIMIT! time_limit=%s " "elapsed=%s %s",
site.time_limit,
site.elapsed(),
site,
)
raise brozzler.ReachedTimeLimit

def claim_page(self, site, worker_id):
|
||||
@ -170,26 +210,37 @@ class RethinkDbFrontier:
|
||||
# brozzler-worker can be working on a site at a time, and that would
|
||||
# have to be the worker calling this method, so if something is claimed
|
||||
# already, it must have been left that way because of some error
|
||||
result = self.rr.table("pages").between(
|
||||
result = (
|
||||
self.rr.table("pages")
|
||||
.between(
|
||||
[site.id, 0, r.minval, r.minval],
|
||||
[site.id, 0, r.maxval, r.maxval],
|
||||
index="priority_by_site").order_by(
|
||||
index=r.desc("priority_by_site")).limit(
|
||||
1).update({
|
||||
"claimed":True,
|
||||
"last_claimed_by":worker_id},
|
||||
return_changes="always").run()
|
||||
self._vet_result(result, unchanged=[0,1], replaced=[0,1])
|
||||
index="priority_by_site",
|
||||
)
|
||||
.order_by(index=r.desc("priority_by_site"))
|
||||
.limit(1)
|
||||
.update(
|
||||
{"claimed": True, "last_claimed_by": worker_id}, return_changes="always"
|
||||
)
|
||||
.run()
|
||||
)
|
||||
self._vet_result(result, unchanged=[0, 1], replaced=[0, 1])
|
||||
if result["unchanged"] == 0 and result["replaced"] == 0:
|
||||
raise brozzler.NothingToClaim
|
||||
else:
|
||||
return brozzler.Page(self.rr, result["changes"][0]["new_val"])
|
||||
|
||||
def has_outstanding_pages(self, site):
|
||||
results_iter = self.rr.table("pages").between(
|
||||
results_iter = (
|
||||
self.rr.table("pages")
|
||||
.between(
|
||||
[site.id, 0, r.minval, r.minval],
|
||||
[site.id, 0, r.maxval, r.maxval],
|
||||
index="priority_by_site").limit(1).run()
|
||||
index="priority_by_site",
|
||||
)
|
||||
.limit(1)
|
||||
.run()
|
||||
)
|
||||
return len(list(results_iter)) > 0
|
||||
|
||||
def completed_page(self, site, page):
|
||||
@ -202,22 +253,24 @@ class RethinkDbFrontier:
|
||||
site.save()
|
||||
|
||||
def active_jobs(self):
|
||||
results = self.rr.table("jobs").filter({"status":"ACTIVE"}).run()
|
||||
results = self.rr.table("jobs").filter({"status": "ACTIVE"}).run()
|
||||
for result in results:
|
||||
yield brozzler.Job(self.rr, result)
|
||||
|
||||
def honor_stop_request(self, site):
|
||||
"""Raises brozzler.CrawlStopped if stop has been requested."""
|
||||
site.refresh()
|
||||
if (site.stop_requested
|
||||
and site.stop_requested <= doublethink.utcnow()):
|
||||
if site.stop_requested and site.stop_requested <= doublethink.utcnow():
|
||||
self.logger.info("stop requested for site %s", site.id)
|
||||
raise brozzler.CrawlStopped
|
||||
|
||||
if site.job_id:
|
||||
job = brozzler.Job.load(self.rr, site.job_id)
|
||||
if (job and job.stop_requested
|
||||
and job.stop_requested <= doublethink.utcnow()):
|
||||
if (
|
||||
job
|
||||
and job.stop_requested
|
||||
and job.stop_requested <= doublethink.utcnow()
|
||||
):
|
||||
self.logger.info("stop requested for job %s", site.job_id)
|
||||
raise brozzler.CrawlStopped
|
||||
|
||||
@ -239,8 +292,7 @@ class RethinkDbFrontier:
|
||||
return False
|
||||
n += 1
|
||||
|
||||
self.logger.info(
|
||||
"all %s sites finished, job %s is FINISHED!", n, job.id)
|
||||
self.logger.info("all %s sites finished, job %s is FINISHED!", n, job.id)
|
||||
job.finish()
|
||||
job.save()
|
||||
return True
|
||||
@ -270,13 +322,11 @@ class RethinkDbFrontier:
|
||||
def resume_job(self, job):
|
||||
job.status = "ACTIVE"
|
||||
job.stop_requested = None
|
||||
job.starts_and_stops.append(
|
||||
{"start":doublethink.utcnow(), "stop":None})
|
||||
job.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None})
|
||||
job.save()
|
||||
for site in self.job_sites(job.id):
|
||||
site.status = "ACTIVE"
|
||||
site.starts_and_stops.append(
|
||||
{"start":doublethink.utcnow(), "stop":None})
|
||||
site.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None})
|
||||
site.save()
|
||||
|
||||
def resume_site(self, site):
|
||||
@ -285,51 +335,55 @@ class RethinkDbFrontier:
|
||||
job = brozzler.Job.load(self.rr, site.job_id)
|
||||
job.status = "ACTIVE"
|
||||
site.stop_requested = None
|
||||
job.starts_and_stops.append(
|
||||
{"start":doublethink.utcnow(), "stop":None})
|
||||
job.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None})
|
||||
job.save()
|
||||
site.status = "ACTIVE"
|
||||
site.starts_and_stops.append(
|
||||
{"start":doublethink.utcnow(), "stop":None})
|
||||
site.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None})
|
||||
site.save()
|
||||
|
||||
def _build_fresh_page(self, site, parent_page, url, hops_off=0):
|
||||
url_for_scoping = urlcanon.semantic(url)
|
||||
url_for_crawling = urlcanon.whatwg(url)
|
||||
hashtag = (url_for_crawling.hash_sign
|
||||
+ url_for_crawling.fragment).decode('utf-8')
|
||||
hashtag = (url_for_crawling.hash_sign + url_for_crawling.fragment).decode(
|
||||
"utf-8"
|
||||
)
|
||||
urlcanon.canon.remove_fragment(url_for_crawling)
|
||||
page = brozzler.Page(self.rr, {
|
||||
'url': str(url_for_crawling),
|
||||
'site_id': site.id,
|
||||
'job_id': site.job_id,
|
||||
'hops_from_seed': parent_page.hops_from_seed + 1,
|
||||
'hop_path': str(parent_page.hop_path if parent_page.hop_path else "") + "L",
|
||||
'via_page_id': parent_page.id,
|
||||
'via_page_url': parent_page.url,
|
||||
'hops_off_surt': hops_off,
|
||||
'hashtags': [hashtag] if hashtag else []})
|
||||
page = brozzler.Page(
|
||||
self.rr,
|
||||
{
|
||||
"url": str(url_for_crawling),
|
||||
"site_id": site.id,
|
||||
"job_id": site.job_id,
|
||||
"hops_from_seed": parent_page.hops_from_seed + 1,
|
||||
"hop_path": str(parent_page.hop_path if parent_page.hop_path else "")
|
||||
+ "L",
|
||||
"via_page_id": parent_page.id,
|
||||
"via_page_url": parent_page.url,
|
||||
"hops_off_surt": hops_off,
|
||||
"hashtags": [hashtag] if hashtag else [],
|
||||
},
|
||||
)
|
||||
return page
|
||||
|
||||
def _merge_page(self, existing_page, fresh_page):
|
||||
'''
|
||||
"""
|
||||
Utility method for merging info from `brozzler.Page` instances
|
||||
representing the same url but with possibly different metadata.
|
||||
'''
|
||||
"""
|
||||
existing_page.priority += fresh_page.priority
|
||||
existing_page.hashtags = list(set(
|
||||
(existing_page.hashtags or []) + (fresh_page.hashtags or [])))
|
||||
existing_page.hops_off = min(
|
||||
existing_page.hops_off, fresh_page.hops_off)
|
||||
existing_page.hashtags = list(
|
||||
set((existing_page.hashtags or []) + (fresh_page.hashtags or []))
|
||||
)
|
||||
existing_page.hops_off = min(existing_page.hops_off, fresh_page.hops_off)
|
||||
|
||||
def _scope_and_enforce_robots(self, site, parent_page, outlinks):
|
||||
'''
|
||||
"""
|
||||
Returns tuple (
|
||||
dict of {page_id: Page} of fresh `brozzler.Page` representing in
|
||||
scope links accepted by robots policy,
|
||||
set of in scope urls (canonicalized) blocked by robots policy,
|
||||
set of out-of-scope urls (canonicalized)).
|
||||
'''
|
||||
"""
|
||||
pages = {} # {page_id: Page, ...}
|
||||
blocked = set()
|
||||
out_of_scope = set()
|
||||
@ -337,17 +391,18 @@ class RethinkDbFrontier:
|
||||
url_for_scoping = urlcanon.semantic(url)
|
||||
url_for_crawling = urlcanon.whatwg(url)
|
||||
decision = site.accept_reject_or_neither(
|
||||
url_for_scoping, parent_page=parent_page)
|
||||
url_for_scoping, parent_page=parent_page
|
||||
)
|
||||
if decision is True:
|
||||
hops_off = 0
|
||||
elif decision is None:
|
||||
decision = parent_page.hops_off < site.scope.get(
|
||||
'max_hops_off', 0)
|
||||
decision = parent_page.hops_off < site.scope.get("max_hops_off", 0)
|
||||
hops_off = parent_page.hops_off + 1
|
||||
if decision is True:
|
||||
if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
|
||||
fresh_page = self._build_fresh_page(
|
||||
site, parent_page, url, hops_off)
|
||||
site, parent_page, url, hops_off
|
||||
)
|
||||
if fresh_page.id in pages:
|
||||
self._merge_page(pages[fresh_page.id], fresh_page)
|
||||
else:
|
||||
@ -359,31 +414,32 @@ class RethinkDbFrontier:
|
||||
return pages, blocked, out_of_scope
|
||||
|
||||
def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
|
||||
decisions = {'accepted':set(),'blocked':set(),'rejected':set()}
|
||||
counts = {'added':0,'updated':0,'rejected':0,'blocked':0}
|
||||
decisions = {"accepted": set(), "blocked": set(), "rejected": set()}
|
||||
counts = {"added": 0, "updated": 0, "rejected": 0, "blocked": 0}
|
||||
|
||||
fresh_pages, blocked, out_of_scope = self._scope_and_enforce_robots(
|
||||
site, parent_page, outlinks)
|
||||
decisions['blocked'] = blocked
|
||||
decisions['rejected'] = out_of_scope
|
||||
counts['blocked'] += len(blocked)
|
||||
counts['rejected'] += len(out_of_scope)
|
||||
site, parent_page, outlinks
|
||||
)
|
||||
decisions["blocked"] = blocked
|
||||
decisions["rejected"] = out_of_scope
|
||||
counts["blocked"] += len(blocked)
|
||||
counts["rejected"] += len(out_of_scope)
|
||||
|
||||
# get existing pages from rethinkdb
|
||||
results = self.rr.table('pages').get_all(*fresh_pages.keys()).run()
|
||||
pages = {doc['id']: brozzler.Page(self.rr, doc) for doc in results}
|
||||
results = self.rr.table("pages").get_all(*fresh_pages.keys()).run()
|
||||
pages = {doc["id"]: brozzler.Page(self.rr, doc) for doc in results}
|
||||
|
||||
# build list of pages to save, consisting of new pages, and existing
|
||||
# pages updated with higher priority and new hashtags
|
||||
for fresh_page in fresh_pages.values():
|
||||
decisions['accepted'].add(fresh_page.url)
|
||||
decisions["accepted"].add(fresh_page.url)
|
||||
if fresh_page.id in pages:
|
||||
page = pages[fresh_page.id]
|
||||
self._merge_page(page, fresh_page)
|
||||
counts['updated'] += 1
|
||||
counts["updated"] += 1
|
||||
else:
|
||||
pages[fresh_page.id] = fresh_page
|
||||
counts['added'] += 1
|
||||
counts["added"] += 1
|
||||
|
||||
# make sure we're not stepping on our own toes in case we have a link
|
||||
# back to parent_page, which I think happens because of hashtags
|
||||
@ -396,19 +452,22 @@ class RethinkDbFrontier:
|
||||
# there can be many pages and each one can be very large (many videos,
|
||||
# in and out of scope links, etc)
|
||||
l = list(pages.values())
|
||||
for batch in (l[i:i+50] for i in range(0, len(l), 50)):
|
||||
for batch in (l[i : i + 50] for i in range(0, len(l), 50)):
|
||||
try:
|
||||
self.logger.debug(
|
||||
'inserting/replacing batch of %s pages', len(batch))
|
||||
reql = self.rr.table('pages').insert(batch, conflict='replace')
|
||||
self.logger.debug("inserting/replacing batch of %s pages", len(batch))
|
||||
reql = self.rr.table("pages").insert(batch, conflict="replace")
|
||||
self.logger.trace(
|
||||
'running query self.rr.table("pages").insert(%r, '
|
||||
'conflict="replace")', batch)
|
||||
'running query self.rr.table("pages").insert(%r, '
|
||||
'conflict="replace")',
|
||||
batch,
|
||||
)
|
||||
result = reql.run()
|
||||
except Exception as e:
|
||||
self.logger.error(
|
||||
'problem inserting/replacing batch of %s pages',
|
||||
len(batch), exc_info=True)
|
||||
"problem inserting/replacing batch of %s pages",
|
||||
len(batch),
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
parent_page.outlinks = {}
|
||||
for k in decisions:
|
||||
@ -416,43 +475,56 @@ class RethinkDbFrontier:
|
||||
parent_page.save()
|
||||
|
||||
self.logger.info(
|
||||
'%s new links added, %s existing links updated, %s links '
|
||||
'rejected, %s links blocked by robots from %s',
|
||||
counts['added'], counts['updated'], counts['rejected'],
|
||||
counts['blocked'], parent_page)
|
||||
"%s new links added, %s existing links updated, %s links "
|
||||
"rejected, %s links blocked by robots from %s",
|
||||
counts["added"],
|
||||
counts["updated"],
|
||||
counts["rejected"],
|
||||
counts["blocked"],
|
||||
parent_page,
|
||||
)
|
||||
|
||||
def reached_limit(self, site, e):
|
||||
self.logger.info("reached_limit site=%s e=%s", site, e)
|
||||
assert isinstance(e, brozzler.ReachedLimit)
|
||||
if (site.reached_limit
|
||||
and site.reached_limit != e.warcprox_meta["reached-limit"]):
|
||||
if (
|
||||
site.reached_limit
|
||||
and site.reached_limit != e.warcprox_meta["reached-limit"]
|
||||
):
|
||||
self.logger.warning(
|
||||
"reached limit %s but site had already reached limit %s",
|
||||
e.warcprox_meta["reached-limit"], self.reached_limit)
|
||||
"reached limit %s but site had already reached limit %s",
|
||||
e.warcprox_meta["reached-limit"],
|
||||
self.reached_limit,
|
||||
)
|
||||
else:
|
||||
site.reached_limit = e.warcprox_meta["reached-limit"]
|
||||
self.finished(site, "FINISHED_REACHED_LIMIT")
|
||||
|
||||
def job_sites(self, job_id):
|
||||
results = self.rr.table('sites').get_all(job_id, index="job_id").run()
|
||||
results = self.rr.table("sites").get_all(job_id, index="job_id").run()
|
||||
for result in results:
|
||||
yield brozzler.Site(self.rr, result)
|
||||
|
||||
def seed_page(self, site_id):
|
||||
results = self.rr.table("pages").between(
|
||||
results = (
|
||||
self.rr.table("pages")
|
||||
.between(
|
||||
[site_id, r.minval, r.minval, r.minval],
|
||||
[site_id, r.maxval, r.maxval, r.maxval],
|
||||
index="priority_by_site").filter({"hops_from_seed":0}).run()
|
||||
index="priority_by_site",
|
||||
)
|
||||
.filter({"hops_from_seed": 0})
|
||||
.run()
|
||||
)
|
||||
pages = list(results)
|
||||
if len(pages) > 1:
|
||||
self.logger.warning(
|
||||
"more than one seed page for site_id %s ?", site_id)
|
||||
self.logger.warning("more than one seed page for site_id %s ?", site_id)
|
||||
if len(pages) < 1:
|
||||
return None
|
||||
return brozzler.Page(self.rr, pages[0])
|
||||
|
||||
def site_pages(self, site_id, brozzled=None):
|
||||
'''
|
||||
"""
|
||||
Args:
|
||||
site_id (str or int):
|
||||
brozzled (bool): if true, results include only pages that have
|
||||
@ -460,16 +532,14 @@ class RethinkDbFrontier:
|
||||
not been brozzled; and if None (the default), all pages
|
||||
Returns:
|
||||
iterator of brozzler.Page
|
||||
'''
|
||||
"""
|
||||
query = self.rr.table("pages").between(
|
||||
[site_id, 1 if brozzled is True else 0,
|
||||
r.minval, r.minval],
|
||||
[site_id, 0 if brozzled is False else r.maxval,
|
||||
r.maxval, r.maxval],
|
||||
index="priority_by_site")
|
||||
[site_id, 1 if brozzled is True else 0, r.minval, r.minval],
|
||||
[site_id, 0 if brozzled is False else r.maxval, r.maxval, r.maxval],
|
||||
index="priority_by_site",
|
||||
)
|
||||
self.logger.trace("running query: %r", query)
|
||||
results = query.run()
|
||||
for result in results:
|
||||
self.logger.trace("yielding result: %r", result)
|
||||
yield brozzler.Page(self.rr, result)

@ -1,4 +1,4 @@
'''
"""
brozzler/models.py - model classes representing jobs, sites, and pages, with
related logic

@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""

import brozzler
import base64
@ -36,15 +36,18 @@ import yaml
import zlib
from typing import Optional


def load_schema():
|
||||
schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yaml')
|
||||
schema_file = os.path.join(os.path.dirname(__file__), "job_schema.yaml")
|
||||
with open(schema_file) as f:
|
||||
return yaml.safe_load(f)
|
||||
|
||||
|
||||
class JobValidator(cerberus.Validator):
|
||||
def _validate_type_url(self, value):
|
||||
url = urllib.parse.urlparse(value)
|
||||
return url.scheme in ('http', 'https', 'ftp')
|
||||
return url.scheme in ("http", "https", "ftp")
|
||||
|
||||
|
||||
class InvalidJobConf(Exception):
|
||||
def __init__(self, validator):
|
||||
@ -53,15 +56,17 @@ class InvalidJobConf(Exception):
|
||||
# Cerberus does a nice job hiding the bad value. In the case I
|
||||
# debugged, I found it here. Maybe there's a better way to see it.
|
||||
value = validator._errors[0].info[0][0].info[0][0].value
|
||||
self.errors['bad value'] = value
|
||||
self.errors["bad value"] = value
|
||||
except:
|
||||
value = None
|
||||
|
||||
|
||||
def validate_conf(job_conf, schema=load_schema()):
|
||||
v = JobValidator(schema)
|
||||
if not v.validate(job_conf, normalize=False):
|
||||
raise InvalidJobConf(v)
|
||||
|
||||
|
||||
def merge(a, b):
|
||||
if isinstance(a, dict) and isinstance(b, dict):
|
||||
merged = dict(a)
|
||||
@ -75,19 +80,22 @@ def merge(a, b):
|
||||
else:
|
||||
return a
|
||||
|
||||
|
||||
def new_job_file(frontier, job_conf_file):
|
||||
'''Returns new Job.'''
|
||||
"""Returns new Job."""
|
||||
logging.info("loading %s", job_conf_file)
|
||||
with open(job_conf_file) as f:
|
||||
job_conf = yaml.safe_load(f)
|
||||
return new_job(frontier, job_conf)
|
||||
|
||||
|
||||
def new_job(frontier, job_conf):
|
||||
'''Returns new Job.'''
|
||||
"""Returns new Job."""
|
||||
validate_conf(job_conf)
|
||||
job = Job(frontier.rr, {
|
||||
"conf": job_conf, "status": "ACTIVE",
|
||||
"started": doublethink.utcnow()})
|
||||
job = Job(
|
||||
frontier.rr,
|
||||
{"conf": job_conf, "status": "ACTIVE", "started": doublethink.utcnow()},
|
||||
)
|
||||
if "id" in job_conf:
|
||||
job.id = job_conf["id"]
|
||||
if "max_claimed_sites" in job_conf:
|
||||
@ -108,32 +116,40 @@ def new_job(frontier, job_conf):
|
||||
|
||||
# insert in batches to avoid this error
|
||||
# rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:
|
||||
for batch in (pages[i:i+500] for i in range(0, len(pages), 500)):
|
||||
logging.info('inserting batch of %s pages', len(batch))
|
||||
result = frontier.rr.table('pages').insert(batch).run()
|
||||
for batch in (sites[i:i+100] for i in range(0, len(sites), 100)):
|
||||
logging.info('inserting batch of %s sites', len(batch))
|
||||
result = frontier.rr.table('sites').insert(batch).run()
|
||||
logging.info('job %s fully started', job.id)
|
||||
for batch in (pages[i : i + 500] for i in range(0, len(pages), 500)):
|
||||
logging.info("inserting batch of %s pages", len(batch))
|
||||
result = frontier.rr.table("pages").insert(batch).run()
|
||||
for batch in (sites[i : i + 100] for i in range(0, len(sites), 100)):
|
||||
logging.info("inserting batch of %s sites", len(batch))
|
||||
result = frontier.rr.table("sites").insert(batch).run()
|
||||
logging.info("job %s fully started", job.id)
|
||||
|
||||
return job
|
||||
|
||||
|
||||
def new_seed_page(frontier, site):
|
||||
url = urlcanon.parse_url(site.seed)
|
||||
hashtag = (url.hash_sign + url.fragment).decode("utf-8")
|
||||
urlcanon.canon.remove_fragment(url)
|
||||
page = brozzler.Page(frontier.rr, {
|
||||
"url": str(url),
|
||||
"site_id": site.get("id"),
|
||||
"job_id": site.get("job_id"),
|
||||
"hops_from_seed": 0,
|
||||
"priority": 1000,
|
||||
"needs_robots_check": True,
|
||||
"hop_path": None})
|
||||
page = brozzler.Page(
|
||||
frontier.rr,
|
||||
{
|
||||
"url": str(url),
|
||||
"site_id": site.get("id"),
|
||||
"job_id": site.get("job_id"),
|
||||
"hops_from_seed": 0,
|
||||
"priority": 1000,
|
||||
"needs_robots_check": True,
|
||||
"hop_path": None,
|
||||
},
|
||||
)
|
||||
if hashtag:
|
||||
page.hashtags = [hashtag,]
|
||||
page.hashtags = [
|
||||
hashtag,
|
||||
]
|
||||
return page
|
||||
|
||||
|
||||
def new_site(frontier, site):
|
||||
logging.info("new site %s", site)
|
||||
site.id = site.id or str(uuid.uuid4())
|
||||
@ -148,9 +164,10 @@ def new_site(frontier, site):
|
||||
# finally block because we want to insert the Site no matter what
|
||||
site.save()
|
||||
|
||||
|
||||
class ElapsedMixIn(object):
|
||||
def elapsed(self):
|
||||
'''
|
||||
"""
|
||||
Returns elapsed crawl time as a float in seconds.
|
||||
|
||||
This metric includes all the time that a site was in active rotation,
|
||||
@ -158,21 +175,22 @@ class ElapsedMixIn(object):
|
||||
|
||||
In contrast `Site.active_brozzling_time` only counts time when a
|
||||
brozzler worker claimed the site and was actively brozzling it.
|
||||
'''
|
||||
"""
|
||||
dt = 0
|
||||
for ss in self.starts_and_stops[:-1]:
|
||||
if ss['stop']:
|
||||
dt += (ss['stop'] - ss['start']).total_seconds()
|
||||
if ss["stop"]:
|
||||
dt += (ss["stop"] - ss["start"]).total_seconds()
|
||||
else:
|
||||
self.logger.warning("missing expected ss['stop']")
|
||||
dt += (doublethink.utcnow() - ss['start']).total_seconds()
|
||||
dt += (doublethink.utcnow() - ss["start"]).total_seconds()
|
||||
ss = self.starts_and_stops[-1]
|
||||
if ss['stop']:
|
||||
dt += (ss['stop'] - ss['start']).total_seconds()
|
||||
else: # crawl is active
|
||||
dt += (doublethink.utcnow() - ss['start']).total_seconds()
|
||||
if ss["stop"]:
|
||||
dt += (ss["stop"] - ss["start"]).total_seconds()
|
||||
else: # crawl is active
|
||||
dt += (doublethink.utcnow() - ss["start"]).total_seconds()
|
||||
return dt
|
||||
|
||||
|
||||
class Job(doublethink.Document, ElapsedMixIn):
|
||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||
table = "jobs"
|
||||
@ -181,29 +199,30 @@ class Job(doublethink.Document, ElapsedMixIn):
|
||||
if not "status" in self:
|
||||
self.status = "ACTIVE"
|
||||
if not "starts_and_stops" in self:
|
||||
if self.get("started"): # backward compatibility
|
||||
self.starts_and_stops = [{
|
||||
"start": self.get("started"),
|
||||
"stop": self.get("finished")}]
|
||||
if self.get("started"): # backward compatibility
|
||||
self.starts_and_stops = [
|
||||
{"start": self.get("started"), "stop": self.get("finished")}
|
||||
]
|
||||
del self["started"]
|
||||
if "finished" in self:
|
||||
del self["finished"]
|
||||
else:
|
||||
self.starts_and_stops = [
|
||||
{"start":doublethink.utcnow(),"stop":None}]
|
||||
self.starts_and_stops = [{"start": doublethink.utcnow(), "stop": None}]
|
||||
|
||||
def finish(self):
|
||||
if self.status == "FINISHED" or self.starts_and_stops[-1]["stop"]:
|
||||
self.logger.error(
|
||||
"job is already finished status=%s "
|
||||
"starts_and_stops[-1]['stop']=%s", self.status,
|
||||
self.starts_and_stops[-1]["stop"])
|
||||
"job is already finished status=%s " "starts_and_stops[-1]['stop']=%s",
|
||||
self.status,
|
||||
self.starts_and_stops[-1]["stop"],
|
||||
)
|
||||
self.status = "FINISHED"
|
||||
self.starts_and_stops[-1]["stop"] = doublethink.utcnow()
|
||||
|
||||
|
||||
class Site(doublethink.Document, ElapsedMixIn):
|
||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||
table = 'sites'
|
||||
table = "sites"
|
||||
|
||||
def populate_defaults(self):
|
||||
if not "status" in self:
|
||||
@ -225,26 +244,26 @@ class Site(doublethink.Document, ElapsedMixIn):
|
||||
del self.scope["surt"]
|
||||
|
||||
# backward compatibility
|
||||
if ("max_hops_off_surt" in self.scope
|
||||
and not "max_hops_off" in self.scope):
|
||||
if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope:
|
||||
self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
|
||||
if "max_hops_off_surt" in self.scope:
|
||||
del self.scope["max_hops_off_surt"]
|
||||
|
||||
if self.seed:
|
||||
self._accept_ssurt_if_not_redundant(
|
||||
brozzler.site_surt_canon(self.seed).ssurt().decode('ascii'))
|
||||
brozzler.site_surt_canon(self.seed).ssurt().decode("ascii")
|
||||
)
|
||||
|
||||
if not "starts_and_stops" in self:
|
||||
if self.get("start_time"): # backward compatibility
|
||||
self.starts_and_stops = [{
|
||||
"start":self.get("start_time"),"stop":None}]
|
||||
if self.get("start_time"): # backward compatibility
|
||||
self.starts_and_stops = [
|
||||
{"start": self.get("start_time"), "stop": None}
|
||||
]
|
||||
if self.get("status") != "ACTIVE":
|
||||
self.starts_and_stops[0]["stop"] = self.last_disclaimed
|
||||
del self["start_time"]
|
||||
else:
|
||||
self.starts_and_stops = [
|
||||
{"start":doublethink.utcnow(),"stop":None}]
|
||||
self.starts_and_stops = [{"start": doublethink.utcnow(), "stop": None}]
|
||||
|
||||
def __str__(self):
|
||||
return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)
|
||||
@ -253,11 +272,12 @@ class Site(doublethink.Document, ElapsedMixIn):
|
||||
if not "accepts" in self.scope:
|
||||
self.scope["accepts"] = []
|
||||
simple_rule_ssurts = (
|
||||
rule["ssurt"] for rule in self.scope["accepts"]
|
||||
if set(rule.keys()) == {'ssurt'})
|
||||
rule["ssurt"]
|
||||
for rule in self.scope["accepts"]
|
||||
if set(rule.keys()) == {"ssurt"}
|
||||
)
|
||||
if not any(ssurt.startswith(ss) for ss in simple_rule_ssurts):
|
||||
self.logger.info(
|
||||
"adding ssurt %s to scope accept rules", ssurt)
|
||||
self.logger.info("adding ssurt %s to scope accept rules", ssurt)
|
||||
self.scope["accepts"].append({"ssurt": ssurt})
|
||||
|
||||
def note_seed_redirect(self, url):
|
||||
@ -266,14 +286,14 @@ class Site(doublethink.Document, ElapsedMixIn):
|
||||
|
||||
# if http://foo.com/ redirects to https://foo.com/a/b/c let's also
|
||||
# put all of https://foo.com/ in scope
|
||||
if (canon_seed_redirect.authority == canon_seed.authority
|
||||
and canon_seed_redirect.scheme != canon_seed.scheme):
|
||||
if (
|
||||
canon_seed_redirect.authority == canon_seed.authority
|
||||
and canon_seed_redirect.scheme != canon_seed.scheme
|
||||
):
|
||||
canon_seed.scheme = canon_seed_redirect.scheme
|
||||
self._accept_ssurt_if_not_redundant(
|
||||
canon_seed.ssurt().decode('ascii'))
|
||||
self._accept_ssurt_if_not_redundant(canon_seed.ssurt().decode("ascii"))
|
||||
|
||||
self._accept_ssurt_if_not_redundant(
|
||||
canon_seed_redirect.ssurt().decode('ascii'))
|
||||
self._accept_ssurt_if_not_redundant(canon_seed_redirect.ssurt().decode("ascii"))
|
||||
|
||||
def extra_headers(self, page: Optional["Page"] = None):
|
||||
hdrs = {}
|
||||
@ -281,28 +301,34 @@ class Site(doublethink.Document, ElapsedMixIn):
|
||||
temp_warcprox_meta = copy.deepcopy(self.warcprox_meta)
|
||||
if "blocks" in self.warcprox_meta:
|
||||
# delete temp_warcprox_meta's 'blocks' (they may be big!)
|
||||
del temp_warcprox_meta['blocks']
|
||||
del temp_warcprox_meta["blocks"]
|
||||
# str-ify blocks
|
||||
blocks_str = json.dumps(self.warcprox_meta['blocks'], separators=(',', ':'))
|
||||
blocks_str = json.dumps(
|
||||
self.warcprox_meta["blocks"], separators=(",", ":")
|
||||
)
|
||||
# encode(), compress, b64encode, decode()
|
||||
temp_warcprox_meta['compressed_blocks'] = base64.b64encode(zlib.compress(blocks_str.encode())).decode()
|
||||
temp_warcprox_meta["compressed_blocks"] = base64.b64encode(
|
||||
zlib.compress(blocks_str.encode())
|
||||
).decode()
|
||||
if page is not None:
|
||||
temp_warcprox_meta["metadata"]["hop_path"] = page.hop_path
|
||||
temp_warcprox_meta["metadata"]["brozzled_url"] = page.url
|
||||
temp_warcprox_meta["metadata"]["hop_via_url"] = page.via_page_url
|
||||
hdrs["Warcprox-Meta"] = json.dumps(temp_warcprox_meta, separators=(',', ':'))
|
||||
hdrs["Warcprox-Meta"] = json.dumps(
|
||||
temp_warcprox_meta, separators=(",", ":")
|
||||
)
|
||||
return hdrs
|
||||
|
||||
def accept_reject_or_neither(self, url, parent_page=None):
|
||||
'''
|
||||
"""
|
||||
Returns `True` (accepted), `False` (rejected), or `None` (no decision).
|
||||
|
||||
`None` usually means rejected, unless `max_hops_off` comes into play.
|
||||
'''
|
||||
"""
|
||||
if not isinstance(url, urlcanon.ParsedUrl):
|
||||
url = urlcanon.semantic(url)
|
||||
|
||||
if not url.scheme in (b'http', b'https'):
|
||||
if not url.scheme in (b"http", b"https"):
|
||||
# XXX doesn't belong here maybe (where? worker ignores unknown
|
||||
# schemes?)
|
||||
return False
|
||||
@ -311,12 +337,14 @@ class Site(doublethink.Document, ElapsedMixIn):
|
||||
if parent_page:
|
||||
try_parent_urls.append(urlcanon.semantic(parent_page.url))
|
||||
if parent_page.redirect_url:
|
||||
try_parent_urls.append(
|
||||
urlcanon.semantic(parent_page.redirect_url))
|
||||
try_parent_urls.append(urlcanon.semantic(parent_page.redirect_url))
|
||||
|
||||
# enforce max_hops
|
||||
if (parent_page and "max_hops" in self.scope
|
||||
and parent_page.hops_from_seed >= self.scope["max_hops"]):
|
||||
if (
|
||||
parent_page
|
||||
and "max_hops" in self.scope
|
||||
and parent_page.hops_from_seed >= self.scope["max_hops"]
|
||||
):
|
||||
return False
|
||||
|
||||
# enforce reject rules
|
||||
@ -326,7 +354,7 @@ class Site(doublethink.Document, ElapsedMixIn):
|
||||
if try_parent_urls:
|
||||
for parent_url in try_parent_urls:
|
||||
if rule.applies(url, parent_url):
|
||||
return False
|
||||
return False
|
||||
else:
|
||||
if rule.applies(url):
|
||||
return False
|
||||
@ -337,7 +365,7 @@ class Site(doublethink.Document, ElapsedMixIn):
|
||||
if try_parent_urls:
|
||||
for parent_url in try_parent_urls:
|
||||
if rule.applies(url, parent_url):
|
||||
return True
|
||||
return True
|
||||
else:
|
||||
if rule.applies(url):
|
||||
return True
|
||||
@ -345,6 +373,7 @@ class Site(doublethink.Document, ElapsedMixIn):
|
||||
# no decision if we reach here
|
||||
return None
|
||||
|
||||
|
||||
class Page(doublethink.Document):
|
||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||
table = "pages"
|
||||
@ -398,4 +427,3 @@ class Page(doublethink.Document):
|
||||
if self._canon_hurl is None:
|
||||
self._canon_hurl = urlcanon.semantic(self.url)
|
||||
return str(self._canon_hurl)

239
brozzler/pywb.py
@ -1,4 +1,4 @@
'''
"""
brozzler/pywb.py - pywb customizations for brozzler including rethinkdb index,
loading from warcs still being written to, canonicalization rules matching
brozzler conventions, support for screenshot: and thumbnail: urls
@ -16,10 +16,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""

import sys
import logging

try:
import pywb.apps.cli
import pywb.cdx.cdxdomainspecific
@ -30,9 +31,11 @@ try:
import pywb.rewrite.wburl
except ImportError as e:
logging.critical(
'%s: %s\n\nYou might need to run "pip install '
'brozzler[easy]".\nSee README.rst for more information.',
type(e).__name__, e)
'%s: %s\n\nYou might need to run "pip install '
'brozzler[easy]".\nSee README.rst for more information.',
type(e).__name__,
e,
)
sys.exit(1)
import doublethink
import rethinkdb as rdb
@ -43,6 +46,7 @@ import argparse

r = rdb.RethinkDB()


class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
|
||||
def __init__(self, servers, db, table):
|
||||
self.servers = servers
|
||||
@ -67,70 +71,78 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
|
||||
# XXX inefficient, it gets parsed later, figure out how to
|
||||
# short-circuit this step and create the CDXObject directly
|
||||
blob = {
|
||||
'url': record['url'],
|
||||
'status': str(record['response_code']),
|
||||
'digest': record['sha1base32'],
|
||||
'length': str(record.get('record_length', '-')),
|
||||
'offset': str(record['offset']),
|
||||
'filename': record['filename'],
|
||||
"url": record["url"],
|
||||
"status": str(record["response_code"]),
|
||||
"digest": record["sha1base32"],
|
||||
"length": str(record.get("record_length", "-")),
|
||||
"offset": str(record["offset"]),
|
||||
"filename": record["filename"],
|
||||
}
|
||||
if record['warc_type'] != 'revisit':
|
||||
blob['mime'] = record['content_type'] or '-'
|
||||
if record["warc_type"] != "revisit":
|
||||
blob["mime"] = record["content_type"] or "-"
|
||||
else:
|
||||
blob['mime'] = 'warc/revisit'
|
||||
blob["mime"] = "warc/revisit"
|
||||
# b'org,archive)/ 20160427215530 {"url": "https://archive.org/", "mime": "text/html", "status": "200", "digest": "VILUFXZD232SLUA6XROZQIMEVUPW6EIE", "length": "16001", "offset": "90144", "filename": "ARCHIVEIT-261-ONE_TIME-JOB209607-20160427215508135-00000.warc.gz"}'
|
||||
cdx_line = '{} {:%Y%m%d%H%M%S} {}'.format(
|
||||
record['canon_surt'], record['timestamp'],
|
||||
json.dumps(blob))
|
||||
yield cdx_line.encode('utf-8')
|
||||
cdx_line = "{} {:%Y%m%d%H%M%S} {}".format(
|
||||
record["canon_surt"], record["timestamp"], json.dumps(blob)
|
||||
)
|
||||
yield cdx_line.encode("utf-8")
|
||||
|
||||
def _query_rethinkdb(self, cdx_query):
|
||||
start_key = cdx_query.key.decode('utf-8')
|
||||
end_key = cdx_query.end_key.decode('utf-8')
|
||||
start_key = cdx_query.key.decode("utf-8")
|
||||
end_key = cdx_query.end_key.decode("utf-8")
|
||||
reql = self.rr.table(self.table).between(
|
||||
[start_key[:150], r.minval], [end_key[:150], r.maxval],
|
||||
index='abbr_canon_surt_timestamp', right_bound='closed')
|
||||
reql = reql.order_by(index='abbr_canon_surt_timestamp')
|
||||
[start_key[:150], r.minval],
|
||||
[end_key[:150], r.maxval],
|
||||
index="abbr_canon_surt_timestamp",
|
||||
right_bound="closed",
|
||||
)
|
||||
reql = reql.order_by(index="abbr_canon_surt_timestamp")
|
||||
# TODO support for POST, etc
|
||||
# http_method='WARCPROX_WRITE_RECORD' for screenshots, thumbnails
|
||||
reql = reql.filter(
|
||||
lambda capture: r.expr(
|
||||
['WARCPROX_WRITE_RECORD','GET']).contains(
|
||||
capture['http_method']))
|
||||
lambda capture: r.expr(["WARCPROX_WRITE_RECORD", "GET"]).contains(
|
||||
capture["http_method"]
|
||||
)
|
||||
)
|
||||
reql = reql.filter(
|
||||
lambda capture: (capture['canon_surt'] >= start_key)
|
||||
& (capture['canon_surt'] < end_key))
|
||||
lambda capture: (capture["canon_surt"] >= start_key)
|
||||
& (capture["canon_surt"] < end_key)
|
||||
)
|
||||
if cdx_query.limit:
|
||||
reql = reql.limit(cdx_query.limit)
|
||||
logging.debug('rethinkdb query: %s', reql)
|
||||
logging.debug("rethinkdb query: %s", reql)
|
||||
results = reql.run()
|
||||
return results
|
||||
|
||||
|
||||
class TheGoodUrlCanonicalizer(object):
|
||||
'''
|
||||
"""
|
||||
Replacement for pywb.utils.canonicalize.UrlCanonicalizer that produces
|
||||
surts with scheme and with trailing comma, and does not "massage"
|
||||
www.foo.org into foo.org.
|
||||
'''
|
||||
"""
|
||||
|
||||
def __init__(self, surt_ordered=True):
|
||||
'''We are always surt ordered (surt_ordered param is ignored)'''
|
||||
"""We are always surt ordered (surt_ordered param is ignored)"""
|
||||
self.surt_ordered = True
|
||||
|
||||
def __call__(self, url):
|
||||
try:
|
||||
key = urlcanon.semantic(url).surt().decode('ascii')
|
||||
key = urlcanon.semantic(url).surt().decode("ascii")
|
||||
# logging.debug('%s -> %s', url, key)
|
||||
return key
|
||||
except Exception as e:
|
||||
return url
|
||||
|
||||
def replace_default_canonicalizer():
|
||||
'''Replace parent class of CustomUrlCanonicalizer with this class.'''
|
||||
"""Replace parent class of CustomUrlCanonicalizer with this class."""
|
||||
pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer.__bases__ = (
|
||||
TheGoodUrlCanonicalizer,)
|
||||
TheGoodUrlCanonicalizer,
|
||||
)
|
||||
|
||||
def good_surts_from_default(default_surt):
|
||||
'''
|
||||
"""
|
||||
Takes a standard surt without scheme and without trailing comma, and
|
||||
returns a list of "good" surts that together match the same set of
|
||||
urls. For example:
|
||||
@ -144,59 +156,64 @@ class TheGoodUrlCanonicalizer(object):
|
||||
'http://(com,example,www,)/path',
|
||||
'https://(com,example,www,)/path']
|
||||
|
||||
'''
|
||||
if default_surt == '':
|
||||
return ['']
|
||||
"""
|
||||
if default_surt == "":
|
||||
return [""]
|
||||
|
||||
parts = default_surt.split(')', 1)
|
||||
parts = default_surt.split(")", 1)
|
||||
if len(parts) == 2:
|
||||
orig_host_part, path_part = parts
|
||||
good_surts = [
|
||||
'http://(%s,)%s' % (orig_host_part, path_part),
|
||||
'https://(%s,)%s' % (orig_host_part, path_part),
|
||||
'http://(%s,www,)%s' % (orig_host_part, path_part),
|
||||
'https://(%s,www,)%s' % (orig_host_part, path_part),
|
||||
"http://(%s,)%s" % (orig_host_part, path_part),
|
||||
"https://(%s,)%s" % (orig_host_part, path_part),
|
||||
"http://(%s,www,)%s" % (orig_host_part, path_part),
|
||||
"https://(%s,www,)%s" % (orig_host_part, path_part),
|
||||
]
|
||||
else: # no path part
|
||||
else: # no path part
|
||||
host_part = parts[0]
|
||||
good_surts = [
|
||||
'http://(%s' % host_part,
|
||||
'https://(%s' % host_part,
|
||||
"http://(%s" % host_part,
|
||||
"https://(%s" % host_part,
|
||||
]
|
||||
return good_surts
|
||||
|
||||
def monkey_patch_dsrules_init():
|
||||
orig_init = pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__
|
||||
|
||||
def cdx_dsrule_init(self, url_prefix, rules):
|
||||
good_surts = []
|
||||
url_prefixes = [url_prefix] if isinstance(
|
||||
url_prefix, str) else url_prefix
|
||||
url_prefixes = [url_prefix] if isinstance(url_prefix, str) else url_prefix
|
||||
for bad_surt in url_prefixes:
|
||||
good_surts.extend(
|
||||
TheGoodUrlCanonicalizer.good_surts_from_default(
|
||||
bad_surt))
|
||||
if 'match' in rules and 'regex' in rules['match']:
|
||||
rules['match']['regex'] = r'https?://\(' + rules['match']['regex']
|
||||
TheGoodUrlCanonicalizer.good_surts_from_default(bad_surt)
|
||||
)
|
||||
if "match" in rules and "regex" in rules["match"]:
|
||||
rules["match"]["regex"] = r"https?://\(" + rules["match"]["regex"]
|
||||
orig_init(self, good_surts, rules)
|
||||
|
||||
pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__ = cdx_dsrule_init
|
||||
|
||||
|
||||
def support_in_progress_warcs():
|
||||
'''
|
||||
"""
|
||||
Monkey-patch pywb.warc.pathresolvers.PrefixResolver to include warcs still
|
||||
being written to (warcs having ".open" suffix). This way if a cdx entry
|
||||
references foo.warc.gz, pywb will try both foo.warc.gz and
|
||||
foo.warc.gz.open.
|
||||
'''
|
||||
"""
|
||||
_orig_prefix_resolver_call = pywb.warc.pathresolvers.PrefixResolver.__call__
|
||||
|
||||
def _prefix_resolver_call(self, filename, cdx=None):
|
||||
raw_results = _orig_prefix_resolver_call(self, filename, cdx)
|
||||
results = []
|
||||
for warc_path in raw_results:
|
||||
results.append(warc_path)
|
||||
results.append('%s.open' % warc_path)
|
||||
results.append("%s.open" % warc_path)
|
||||
return results
|
||||
|
||||
pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call
|
||||
|
||||
|
||||
class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
|
||||
def __init__(self, orig_url):
|
||||
import re
|
||||
@ -211,14 +228,14 @@ class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
|
||||
pywb.rewrite.wburl.BaseWbUrl.__init__(self)
|
||||
|
||||
if six.PY2 and isinstance(orig_url, six.text_type):
|
||||
orig_url = orig_url.encode('utf-8')
|
||||
orig_url = orig_url.encode("utf-8")
|
||||
orig_url = quote(orig_url)
|
||||
|
||||
self._original_url = orig_url
|
||||
|
||||
if not self._init_query(orig_url):
|
||||
if not self._init_replay(orig_url):
|
||||
raise Exception('Invalid WbUrl: ', orig_url)
|
||||
raise Exception("Invalid WbUrl: ", orig_url)
|
||||
|
||||
new_uri = WbUrl.to_uri(self.url)
|
||||
|
||||
@ -227,21 +244,24 @@ class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
|
||||
self.url = new_uri
|
||||
|
||||
# begin brozzler changes
|
||||
if (self.url.startswith('urn:') or self.url.startswith('screenshot:')
|
||||
or self.url.startswith('thumbnail:')):
|
||||
if (
|
||||
self.url.startswith("urn:")
|
||||
or self.url.startswith("screenshot:")
|
||||
or self.url.startswith("thumbnail:")
|
||||
):
|
||||
return
|
||||
# end brozzler changes
|
||||
|
||||
# protocol agnostic url -> http://
|
||||
# no protocol -> http://
|
||||
#inx = self.url.find('://')
|
||||
# inx = self.url.find('://')
|
||||
inx = -1
|
||||
m = self.SCHEME_RX.match(self.url)
|
||||
if m:
|
||||
inx = m.span(1)[0]
|
||||
|
||||
#if inx < 0:
|
||||
# check for other partially encoded variants
|
||||
# if inx < 0:
|
||||
# check for other partially encoded variants
|
||||
# m = self.PARTIAL_ENC_RX.match(self.url)
|
||||
# if m:
|
||||
# len_ = len(m.group(0))
|
||||
@ -253,27 +273,31 @@ class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
|
||||
self.url = self.DEFAULT_SCHEME + self.url
|
||||
else:
|
||||
inx += 2
|
||||
if inx < len(self.url) and self.url[inx] != '/':
|
||||
self.url = self.url[:inx] + '/' + self.url[inx:]
|
||||
if inx < len(self.url) and self.url[inx] != "/":
|
||||
self.url = self.url[:inx] + "/" + self.url[inx:]
|
||||
|
||||
|
||||
def _get_wburl_type(self):
|
||||
return SomeWbUrl
|
||||
|
||||
|
||||
def monkey_patch_wburl():
|
||||
pywb.framework.basehandlers.WbUrlHandler.get_wburl_type = _get_wburl_type
|
||||
|
||||
|
||||
class BrozzlerWaybackCli(pywb.apps.cli.WaybackCli):
|
||||
def _extend_parser(self, arg_parser):
|
||||
super()._extend_parser(arg_parser)
|
||||
arg_parser._actions[4].help = argparse.SUPPRESS # --autoindex
|
||||
arg_parser._actions[4].help = argparse.SUPPRESS # --autoindex
|
||||
arg_parser.formatter_class = argparse.RawDescriptionHelpFormatter
|
||||
arg_parser.epilog = '''
|
||||
arg_parser.epilog = """
|
||||
Run pywb like so:
|
||||
|
||||
$ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback
|
||||
|
||||
See README.rst for more information.
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
# copied and pasted from cdxdomainspecific.py, only changes are commented as
|
||||
# such below
|
||||
@ -284,7 +308,7 @@ def _fuzzy_query_call(self, query):
|
||||
|
||||
matched_rule = None
|
||||
|
||||
urlkey = to_native_str(query.key, 'utf-8')
|
||||
urlkey = to_native_str(query.key, "utf-8")
|
||||
url = query.url
|
||||
filter_ = query.filters
|
||||
output = query.output
|
||||
@ -306,42 +330,42 @@ def _fuzzy_query_call(self, query):
|
||||
if not matched_rule:
|
||||
return None
|
||||
|
||||
repl = '?'
|
||||
repl = "?"
|
||||
if matched_rule.replace:
|
||||
repl = matched_rule.replace
|
||||
|
||||
inx = url.find(repl)
|
||||
if inx > 0:
|
||||
url = url[:inx + len(repl)]
|
||||
url = url[: inx + len(repl)]
|
||||
|
||||
# begin brozzler changes
|
||||
if matched_rule.match_type == 'domain':
|
||||
if matched_rule.match_type == "domain":
|
||||
orig_split_url = urlsplit(url)
|
||||
# remove the subdomain, path, query and fragment
|
||||
host = orig_split_url.netloc.split('.', 1)[1]
|
||||
new_split_url = (orig_split_url.scheme, host, '', '', '')
|
||||
host = orig_split_url.netloc.split(".", 1)[1]
|
||||
new_split_url = (orig_split_url.scheme, host, "", "", "")
|
||||
url = urlunsplit(new_split_url)
|
||||
# end brozzler changes
|
||||
|
||||
params = query.params
|
||||
params.update({'url': url,
|
||||
'matchType': matched_rule.match_type,
|
||||
'filter': filter_})
|
||||
params.update({"url": url, "matchType": matched_rule.match_type, "filter": filter_})
|
||||
|
||||
if 'reverse' in params:
|
||||
del params['reverse']
|
||||
if "reverse" in params:
|
||||
del params["reverse"]
|
||||
|
||||
if 'closest' in params:
|
||||
del params['closest']
|
||||
if "closest" in params:
|
||||
del params["closest"]
|
||||
|
||||
if 'end_key' in params:
|
||||
del params['end_key']
|
||||
if "end_key" in params:
|
||||
del params["end_key"]
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def monkey_patch_fuzzy_query():
|
||||
pywb.cdx.cdxdomainspecific.FuzzyQuery.__call__ = _fuzzy_query_call
|
||||
|
||||
|
||||
# copied and pasted from pywb/utils/canonicalize.py, only changes are commented
|
||||
# as such
|
||||
def _calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
|
||||
@ -361,54 +385,56 @@ def _calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
|
||||
|
||||
start_key = url_canon(url)
|
||||
|
||||
if match_type == 'exact':
|
||||
end_key = start_key + '!'
|
||||
if match_type == "exact":
|
||||
end_key = start_key + "!"
|
||||
|
||||
elif match_type == 'prefix':
|
||||
elif match_type == "prefix":
|
||||
# add trailing slash if url has it
|
||||
if url.endswith('/') and not start_key.endswith('/'):
|
||||
start_key += '/'
|
||||
if url.endswith("/") and not start_key.endswith("/"):
|
||||
start_key += "/"
|
||||
|
||||
end_key = inc_last_char(start_key)
|
||||
|
||||
elif match_type == 'host':
|
||||
elif match_type == "host":
|
||||
if surt_ordered:
|
||||
host = start_key.split(')/')[0]
|
||||
host = start_key.split(")/")[0]
|
||||
|
||||
start_key = host + ')/'
|
||||
end_key = host + '*'
|
||||
start_key = host + ")/"
|
||||
end_key = host + "*"
|
||||
else:
|
||||
host = urlparse.urlsplit(url).netloc
|
||||
|
||||
start_key = host + '/'
|
||||
end_key = host + '0'
|
||||
start_key = host + "/"
|
||||
end_key = host + "0"
|
||||
|
||||
elif match_type == 'domain':
|
||||
elif match_type == "domain":
|
||||
if not surt_ordered:
|
||||
msg = 'matchType=domain unsupported for non-surt'
|
||||
msg = "matchType=domain unsupported for non-surt"
|
||||
raise UrlCanonicalizeException(msg)
|
||||
|
||||
host = start_key.split(')/')[0]
|
||||
host = start_key.split(")/")[0]
|
||||
|
||||
# if tld, use com, as start_key
|
||||
# otherwise, stick with com,example)/
|
||||
if ',' not in host:
|
||||
start_key = host + ','
|
||||
if "," not in host:
|
||||
start_key = host + ","
|
||||
else:
|
||||
start_key = host + ')/'
|
||||
start_key = host + ")/"
|
||||
|
||||
# begin brozzler changes
|
||||
end_key = host + '~'
|
||||
end_key = host + "~"
|
||||
# end brozzler changes
|
||||
else:
|
||||
raise UrlCanonicalizeException('Invalid match_type: ' + match_type)
|
||||
raise UrlCanonicalizeException("Invalid match_type: " + match_type)
|
||||
|
||||
return (start_key, end_key)
|
||||
|
||||
|
||||
def monkey_patch_calc_search_range():
|
||||
pywb.utils.canonicalize.calc_search_range = _calc_search_range
|
||||
pywb.cdx.query.calc_search_range = _calc_search_range
|
||||
|
||||
|
||||
def main(argv=sys.argv):
|
||||
brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
|
||||
brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init()
|
||||
@ -417,7 +443,10 @@ def main(argv=sys.argv):
|
||||
brozzler.pywb.monkey_patch_fuzzy_query()
|
||||
brozzler.pywb.monkey_patch_calc_search_range()
|
||||
wayback_cli = BrozzlerWaybackCli(
|
||||
args=argv[1:], default_port=8880,
|
||||
desc=('brozzler-wayback - pywb wayback (monkey-patched for use '
|
||||
'with brozzler)'))
|
||||
args=argv[1:],
|
||||
default_port=8880,
|
||||
desc=(
|
||||
"brozzler-wayback - pywb wayback (monkey-patched for use " "with brozzler)"
|
||||
),
|
||||
)
|
||||
wayback_cli.run()

@ -1,4 +1,4 @@
'''
"""
brozzler/robots.py - robots.txt support

Uses the reppy library version 0.3.4. Monkey-patches reppy to support substring
@ -20,7 +20,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""

import json
import logging
@ -34,48 +34,60 @@ __all__ = ["is_permitted_by_robots"]
|
||||
|
||||
# monkey-patch reppy to do substring user-agent matching, see top of file
|
||||
reppy.Utility.short_user_agent = lambda strng: strng
|
||||
|
||||
|
||||
def _reppy_rules_getitem(self, agent):
|
||||
'''
|
||||
"""
|
||||
Find the user-agent token matching the supplied full user-agent, using
|
||||
a case-insensitive substring search.
|
||||
'''
|
||||
"""
|
||||
lc_agent = agent.lower()
|
||||
for s in self.agents:
|
||||
if s in lc_agent:
|
||||
return self.agents[s]
|
||||
return self.agents.get('*')
|
||||
return self.agents.get("*")
|
||||
|
||||
|
||||
reppy.parser.Rules.__getitem__ = _reppy_rules_getitem
|
||||
|
||||
|
||||
class _SessionRaiseOn420(requests.Session):
|
||||
timeout = 60
|
||||
|
||||
def get(self, url, *args, **kwargs):
|
||||
res = super().get(url, timeout=self.timeout, *args, **kwargs)
|
||||
if res.status_code == 420 and 'warcprox-meta' in res.headers:
|
||||
if res.status_code == 420 and "warcprox-meta" in res.headers:
|
||||
raise brozzler.ReachedLimit(
|
||||
warcprox_meta=json.loads(res.headers['warcprox-meta']),
|
||||
http_payload=res.text)
|
||||
warcprox_meta=json.loads(res.headers["warcprox-meta"]),
|
||||
http_payload=res.text,
|
||||
)
|
||||
else:
|
||||
return res
|
||||
|
||||
|
||||
_robots_caches = {} # {site_id:reppy.cache.RobotsCache}
|
||||
|
||||
|
||||
def _robots_cache(site, proxy=None):
|
||||
if not site.id in _robots_caches:
|
||||
req_sesh = _SessionRaiseOn420()
|
||||
req_sesh.verify = False # ignore cert errors
|
||||
req_sesh.verify = False # ignore cert errors
|
||||
if proxy:
|
||||
proxie = "http://%s" % proxy
|
||||
req_sesh.proxies = {"http":proxie,"https":proxie}
|
||||
req_sesh.proxies = {"http": proxie, "https": proxie}
|
||||
if site.extra_headers():
|
||||
req_sesh.headers.update(site.extra_headers())
|
||||
if site.user_agent:
|
||||
req_sesh.headers['User-Agent'] = site.user_agent
|
||||
req_sesh.headers["User-Agent"] = site.user_agent
|
||||
_robots_caches[site.id] = reppy.cache.RobotsCache(
|
||||
session=req_sesh, disallow_forbidden=False)
|
||||
session=req_sesh, disallow_forbidden=False
|
||||
)
|
||||
|
||||
return _robots_caches[site.id]
|
||||
|
||||
|
||||
def is_permitted_by_robots(site, url, proxy=None):
|
||||
'''
|
||||
"""
|
||||
Checks if `url` is permitted by robots.txt.
|
||||
|
||||
Treats any kind of error fetching robots.txt as "allow all". See
|
||||
@ -89,25 +101,28 @@ def is_permitted_by_robots(site, url, proxy=None):
|
||||
Raises:
|
||||
brozzler.ReachedLimit: if warcprox responded with 420 Reached Limit
|
||||
requests.exceptions.ProxyError: if the proxy is down
|
||||
'''
|
||||
"""
|
||||
if site.ignore_robots:
|
||||
return True
|
||||
|
||||
try:
|
||||
result = _robots_cache(site, proxy).allowed(
|
||||
url, site.user_agent or "brozzler")
|
||||
result = _robots_cache(site, proxy).allowed(url, site.user_agent or "brozzler")
|
||||
return result
|
||||
except Exception as e:
|
||||
if isinstance(e, reppy.exceptions.ServerError) and isinstance(
|
||||
e.args[0], brozzler.ReachedLimit):
|
||||
e.args[0], brozzler.ReachedLimit
|
||||
):
|
||||
raise e.args[0]
|
||||
elif hasattr(e, 'args') and isinstance(
|
||||
e.args[0], requests.exceptions.ProxyError):
|
||||
elif hasattr(e, "args") and isinstance(
|
||||
e.args[0], requests.exceptions.ProxyError
|
||||
):
|
||||
# reppy has wrapped an exception that we want to bubble up
|
||||
raise brozzler.ProxyError(e)
|
||||
else:
|
||||
logging.warning(
|
||||
"returning true (permitted) after problem fetching "
|
||||
"robots.txt for %r: %r", url, e)
|
||||
"returning true (permitted) after problem fetching "
|
||||
"robots.txt for %r: %r",
|
||||
url,
|
||||
e,
|
||||
)
|
||||
return True
|
||||
|
||||
|
@ -1,4 +1,4 @@
'''
"""
brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning
it runs yt-dlp on them, browses them and runs behaviors if appropriate,
scopes and adds outlinks to the frontier
@ -16,7 +16,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""

import logging
import brozzler
@ -39,6 +39,7 @@ from . import ydl

r = rdb.RethinkDB()


class BrozzlerWorker:
logger = logging.getLogger(__module__ + "." + __qualname__)

@ -50,13 +51,26 @@ class BrozzlerWorker:
SITE_SESSION_MINUTES = 15

def __init__(
|
||||
self, frontier, service_registry=None, max_browsers=1,
|
||||
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
|
||||
skip_extract_outlinks=False, skip_visit_hashtags=False,
|
||||
skip_youtube_dl=False, simpler404=False, screenshot_full_page=False,
|
||||
page_timeout=300, behavior_timeout=900, extract_outlinks_timeout=60,
|
||||
download_throughput=-1, stealth=False,
|
||||
window_height=900, window_width=1400):
|
||||
self,
|
||||
frontier,
|
||||
service_registry=None,
|
||||
max_browsers=1,
|
||||
chrome_exe="chromium-browser",
|
||||
warcprox_auto=False,
|
||||
proxy=None,
|
||||
skip_extract_outlinks=False,
|
||||
skip_visit_hashtags=False,
|
||||
skip_youtube_dl=False,
|
||||
simpler404=False,
|
||||
screenshot_full_page=False,
|
||||
page_timeout=300,
|
||||
behavior_timeout=900,
|
||||
extract_outlinks_timeout=60,
|
||||
download_throughput=-1,
|
||||
stealth=False,
|
||||
window_height=900,
|
||||
window_width=1400,
|
||||
):
|
||||
self._frontier = frontier
|
||||
self._service_registry = service_registry
|
||||
self._max_browsers = max_browsers
|
||||
@ -79,7 +93,8 @@ class BrozzlerWorker:
|
||||
self._stealth = stealth
|
||||
|
||||
self._browser_pool = brozzler.browser.BrowserPool(
|
||||
max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True)
|
||||
max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True
|
||||
)
|
||||
self._browsing_threads = set()
|
||||
self._browsing_threads_lock = threading.Lock()
|
||||
|
||||
@ -88,24 +103,32 @@ class BrozzlerWorker:
|
||||
self._shutdown = threading.Event()
|
||||
|
||||
def _choose_warcprox(self):
|
||||
warcproxes = self._service_registry.available_services('warcprox')
|
||||
warcproxes = self._service_registry.available_services("warcprox")
|
||||
if not warcproxes:
|
||||
return None
|
||||
# .group('proxy').count() makes this query about 99% more efficient
|
||||
reql = self._frontier.rr.table('sites').between(
|
||||
['ACTIVE', r.minval], ['ACTIVE', r.maxval],
|
||||
index='sites_last_disclaimed').group('proxy').count()
|
||||
# returns results like
|
||||
reql = (
|
||||
self._frontier.rr.table("sites")
|
||||
.between(
|
||||
["ACTIVE", r.minval],
|
||||
["ACTIVE", r.maxval],
|
||||
index="sites_last_disclaimed",
|
||||
)
|
||||
.group("proxy")
|
||||
.count()
|
||||
)
|
||||
# returns results like
|
||||
# {
|
||||
# "wbgrp-svc030.us.archive.org:8000": 148,
|
||||
# "wbgrp-svc030.us.archive.org:8001": 145
|
||||
# }
|
||||
proxy_scoreboard = dict(reql.run())
|
||||
for warcprox in warcproxes:
|
||||
address = '%s:%s' % (warcprox['host'], warcprox['port'])
|
||||
warcprox['assigned_sites'] = proxy_scoreboard.get(address, 0)
|
||||
warcproxes.sort(key=lambda warcprox: (
|
||||
warcprox['assigned_sites'], warcprox['load']))
|
||||
address = "%s:%s" % (warcprox["host"], warcprox["port"])
|
||||
warcprox["assigned_sites"] = proxy_scoreboard.get(address, 0)
|
||||
warcproxes.sort(
|
||||
key=lambda warcprox: (warcprox["assigned_sites"], warcprox["load"])
|
||||
)
|
||||
# XXX make this heuristic more advanced?
|
||||
return warcproxes[0]
|
||||
|
||||
@ -118,13 +141,15 @@ class BrozzlerWorker:
|
||||
svc = self._choose_warcprox()
|
||||
if svc is None:
|
||||
raise brozzler.ProxyError(
|
||||
'no available instances of warcprox in the service '
|
||||
'registry')
|
||||
site.proxy = '%s:%s' % (svc['host'], svc['port'])
|
||||
"no available instances of warcprox in the service " "registry"
|
||||
)
|
||||
site.proxy = "%s:%s" % (svc["host"], svc["port"])
|
||||
site.save()
|
||||
self.logger.info(
|
||||
'chose warcprox instance %r from service registry for %r',
|
||||
site.proxy, site)
|
||||
"chose warcprox instance %r from service registry for %r",
|
||||
site.proxy,
|
||||
site,
|
||||
)
|
||||
return site.proxy
|
||||
return None
|
||||
|
||||
@ -132,14 +157,16 @@ class BrozzlerWorker:
|
||||
if self._proxy:
|
||||
if self._proxy_is_warcprox is None:
|
||||
try:
|
||||
response = requests.get('http://%s/status' % self._proxy)
|
||||
response = requests.get("http://%s/status" % self._proxy)
|
||||
status = json.loads(response.text)
|
||||
self._proxy_is_warcprox = (status['role'] == 'warcprox')
|
||||
self._proxy_is_warcprox = status["role"] == "warcprox"
|
||||
except Exception as e:
|
||||
self._proxy_is_warcprox = False
|
||||
logging.info(
|
||||
'%s %s warcprox', self._proxy,
|
||||
'IS' if self._proxy_is_warcprox else 'IS NOT')
|
||||
"%s %s warcprox",
|
||||
self._proxy,
|
||||
"IS" if self._proxy_is_warcprox else "IS NOT",
|
||||
)
|
||||
return self._proxy_is_warcprox
|
||||
else:
|
||||
# I should have commented when I originally wrote this code, but I
|
||||
@ -148,13 +175,20 @@ class BrozzlerWorker:
|
||||
return bool(site.proxy or self._warcprox_auto)
|
||||
|
||||
def _warcprox_write_record(
|
||||
self, warcprox_address, url, warc_type, content_type,
|
||||
payload, extra_headers=None):
|
||||
headers = {"Content-Type":content_type,"WARC-Type":warc_type,"Host":"N/A"}
|
||||
self,
|
||||
warcprox_address,
|
||||
url,
|
||||
warc_type,
|
||||
content_type,
|
||||
payload,
|
||||
extra_headers=None,
|
||||
):
|
||||
headers = {"Content-Type": content_type, "WARC-Type": warc_type, "Host": "N/A"}
|
||||
if extra_headers:
|
||||
headers.update(extra_headers)
|
||||
request = urllib.request.Request(url, method="WARCPROX_WRITE_RECORD",
|
||||
headers=headers, data=payload)
|
||||
request = urllib.request.Request(
|
||||
url, method="WARCPROX_WRITE_RECORD", headers=headers, data=payload
|
||||
)
|
||||
|
||||
# XXX setting request.type="http" is a hack to stop urllib from trying
|
||||
# to tunnel if url is https
|
||||
@ -165,26 +199,31 @@ class BrozzlerWorker:
|
||||
with urllib.request.urlopen(request, timeout=600) as response:
|
||||
if response.getcode() != 204:
|
||||
self.logger.warning(
|
||||
'got "%s %s" response on warcprox '
|
||||
'WARCPROX_WRITE_RECORD request (expected 204)',
|
||||
response.getcode(), response.reason)
|
||||
'got "%s %s" response on warcprox '
|
||||
"WARCPROX_WRITE_RECORD request (expected 204)",
|
||||
response.getcode(),
|
||||
response.reason,
|
||||
)
|
||||
return request, response
|
||||
except urllib.error.HTTPError as e:
|
||||
self.logger.warning(
|
||||
'got "%s %s" response on warcprox '
|
||||
'WARCPROX_WRITE_RECORD request (expected 204)',
|
||||
e.getcode(), e.info())
|
||||
'got "%s %s" response on warcprox '
|
||||
"WARCPROX_WRITE_RECORD request (expected 204)",
|
||||
e.getcode(),
|
||||
e.info(),
|
||||
)
|
||||
return request, None
|
||||
except urllib.error.URLError as e:
|
||||
raise brozzler.ProxyError(
|
||||
'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
|
||||
"proxy error on WARCPROX_WRITE_RECORD %s" % url
|
||||
) from e
|
||||
except ConnectionError as e:
|
||||
raise brozzler.ProxyError(
|
||||
'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
|
||||
"proxy error on WARCPROX_WRITE_RECORD %s" % url
|
||||
) from e
|
||||
|
||||
def thumb_jpeg(self, full_jpeg):
|
||||
"""Create JPEG thumbnail.
|
||||
"""
|
||||
"""Create JPEG thumbnail."""
|
||||
img = PIL.Image.open(io.BytesIO(full_jpeg))
|
||||
thumb_width = 300
|
||||
thumb_height = (thumb_width / img.size[0]) * img.size[1]
|
||||
@ -193,8 +232,15 @@ class BrozzlerWorker:
|
||||
img.save(out, "jpeg", quality=95)
|
||||
return out.getbuffer()
|
||||
|
||||
def brozzle_page(self, browser, site, page, on_screenshot=None,
|
||||
on_request=None, enable_youtube_dl=True):
|
||||
def brozzle_page(
|
||||
self,
|
||||
browser,
|
||||
site,
|
||||
page,
|
||||
on_screenshot=None,
|
||||
on_request=None,
|
||||
enable_youtube_dl=True,
|
||||
):
|
||||
self.logger.info("brozzling {}".format(page))
|
||||
ydl_fetches = None
|
||||
outlinks = set()
|
||||
@ -208,31 +254,38 @@ class BrozzlerWorker:
|
||||
except brozzler.ProxyError:
|
||||
raise
|
||||
except Exception as e:
|
||||
if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
|
||||
and hasattr(e.exc_info[1], 'code')
|
||||
and e.exc_info[1].code == 430):
|
||||
if (
|
||||
hasattr(e, "exc_info")
|
||||
and len(e.exc_info) >= 2
|
||||
and hasattr(e.exc_info[1], "code")
|
||||
and e.exc_info[1].code == 430
|
||||
):
|
||||
self.logger.info(
|
||||
'youtube-dl got %s %s processing %s',
|
||||
e.exc_info[1].code, e.exc_info[1].msg, page.url)
|
||||
"youtube-dl got %s %s processing %s",
|
||||
e.exc_info[1].code,
|
||||
e.exc_info[1].msg,
|
||||
page.url,
|
||||
)
|
||||
else:
|
||||
self.logger.error(
|
||||
'youtube_dl raised exception on %s', page,
|
||||
exc_info=True)
|
||||
"youtube_dl raised exception on %s", page, exc_info=True
|
||||
)
|
||||
|
||||
if self._needs_browsing(page, ydl_fetches):
|
||||
self.logger.info('needs browsing: %s', page)
|
||||
self.logger.info("needs browsing: %s", page)
|
||||
try:
|
||||
browser_outlinks = self._browse_page(
|
||||
browser, site, page, on_screenshot, on_request)
|
||||
browser, site, page, on_screenshot, on_request
|
||||
)
|
||||
outlinks.update(browser_outlinks)
|
||||
except brozzler.PageInterstitialShown:
|
||||
self.logger.info('page interstitial shown (http auth): %s', page)
|
||||
self.logger.info("page interstitial shown (http auth): %s", page)
|
||||
else:
|
||||
if not self._already_fetched(page, ydl_fetches):
|
||||
self.logger.info('needs fetch: %s', page)
|
||||
self.logger.info("needs fetch: %s", page)
|
||||
self._fetch_url(site, page=page)
|
||||
else:
|
||||
self.logger.info('already fetched: %s', page)
|
||||
self.logger.info("already fetched: %s", page)
|
||||
|
||||
return outlinks
|
||||
|
||||
@ -242,85 +295,103 @@ class BrozzlerWorker:
|
||||
on_screenshot(screenshot_jpeg)
|
||||
if self._using_warcprox(site):
|
||||
self.logger.info(
|
||||
"sending WARCPROX_WRITE_RECORD request to %s with "
|
||||
"screenshot for %s", self._proxy_for(site), page)
|
||||
"sending WARCPROX_WRITE_RECORD request to %s with "
|
||||
"screenshot for %s",
|
||||
self._proxy_for(site),
|
||||
page,
|
||||
)
|
||||
thumbnail_jpeg = self.thumb_jpeg(screenshot_jpeg)
|
||||
self._warcprox_write_record(
|
||||
warcprox_address=self._proxy_for(site),
|
||||
url="screenshot:%s" % str(urlcanon.semantic(page.url)),
|
||||
warc_type="resource", content_type="image/jpeg",
|
||||
payload=screenshot_jpeg,
|
||||
extra_headers=site.extra_headers(page))
|
||||
warcprox_address=self._proxy_for(site),
|
||||
url="screenshot:%s" % str(urlcanon.semantic(page.url)),
|
||||
warc_type="resource",
|
||||
content_type="image/jpeg",
|
||||
payload=screenshot_jpeg,
|
||||
extra_headers=site.extra_headers(page),
|
||||
)
|
||||
self._warcprox_write_record(
|
||||
warcprox_address=self._proxy_for(site),
|
||||
url="thumbnail:%s" % str(urlcanon.semantic(page.url)),
|
||||
warc_type="resource", content_type="image/jpeg",
|
||||
payload=thumbnail_jpeg,
|
||||
extra_headers=site.extra_headers(page))
|
||||
warcprox_address=self._proxy_for(site),
|
||||
url="thumbnail:%s" % str(urlcanon.semantic(page.url)),
|
||||
warc_type="resource",
|
||||
content_type="image/jpeg",
|
||||
payload=thumbnail_jpeg,
|
||||
extra_headers=site.extra_headers(page),
|
||||
)
|
||||
|
||||
def _on_response(chrome_msg):
|
||||
if ('params' in chrome_msg
|
||||
and 'response' in chrome_msg['params']
|
||||
and 'mimeType' in chrome_msg['params']['response']
|
||||
and chrome_msg['params']['response'].get('mimeType', '').startswith('video/')
|
||||
# skip manifests of DASH segmented video -
|
||||
# see https://github.com/internetarchive/brozzler/pull/70
|
||||
and chrome_msg['params']['response']['mimeType'] != 'video/vnd.mpeg.dash.mpd'
|
||||
and chrome_msg['params']['response'].get('status') in (200, 206)):
|
||||
if (
|
||||
"params" in chrome_msg
|
||||
and "response" in chrome_msg["params"]
|
||||
and "mimeType" in chrome_msg["params"]["response"]
|
||||
and chrome_msg["params"]["response"]
|
||||
.get("mimeType", "")
|
||||
.startswith("video/")
|
||||
# skip manifests of DASH segmented video -
|
||||
# see https://github.com/internetarchive/brozzler/pull/70
|
||||
and chrome_msg["params"]["response"]["mimeType"]
|
||||
!= "video/vnd.mpeg.dash.mpd"
|
||||
and chrome_msg["params"]["response"].get("status") in (200, 206)
|
||||
):
|
||||
video = {
|
||||
'blame': 'browser',
|
||||
'url': chrome_msg['params']['response'].get('url'),
|
||||
'response_code': chrome_msg['params']['response']['status'],
|
||||
'content-type': chrome_msg['params']['response']['mimeType'],
|
||||
"blame": "browser",
|
||||
"url": chrome_msg["params"]["response"].get("url"),
|
||||
"response_code": chrome_msg["params"]["response"]["status"],
|
||||
"content-type": chrome_msg["params"]["response"]["mimeType"],
|
||||
}
|
||||
response_headers = CaseInsensitiveDict(
|
||||
chrome_msg['params']['response']['headers'])
|
||||
if 'content-length' in response_headers:
|
||||
video['content-length'] = int(response_headers['content-length'])
|
||||
if 'content-range' in response_headers:
|
||||
video['content-range'] = response_headers['content-range']
|
||||
logging.debug('embedded video %s', video)
|
||||
if not 'videos' in page:
|
||||
chrome_msg["params"]["response"]["headers"]
|
||||
)
|
||||
if "content-length" in response_headers:
|
||||
video["content-length"] = int(response_headers["content-length"])
|
||||
if "content-range" in response_headers:
|
||||
video["content-range"] = response_headers["content-range"]
|
||||
logging.debug("embedded video %s", video)
|
||||
if not "videos" in page:
|
||||
page.videos = []
|
||||
page.videos.append(video)
|
||||
|
||||
sw_fetched = set()
|
||||
|
||||
def _on_service_worker_version_updated(chrome_msg):
|
||||
# https://github.com/internetarchive/brozzler/issues/140
|
||||
self.logger.trace('%r', chrome_msg)
|
||||
if chrome_msg.get('params', {}).get('versions'):
|
||||
url = chrome_msg.get('params', {}).get('versions')[0]\
|
||||
.get('scriptURL')
|
||||
self.logger.trace("%r", chrome_msg)
|
||||
if chrome_msg.get("params", {}).get("versions"):
|
||||
url = chrome_msg.get("params", {}).get("versions")[0].get("scriptURL")
|
||||
if url and url not in sw_fetched:
|
||||
self.logger.info('fetching service worker script %s', url)
|
||||
self.logger.info("fetching service worker script %s", url)
|
||||
self._fetch_url(site, url=url)
|
||||
sw_fetched.add(url)
|
||||
|
||||
if not browser.is_running():
|
||||
browser.start(
|
||||
proxy=self._proxy_for(site),
|
||||
cookie_db=site.get('cookie_db'),
|
||||
window_height=self._window_height,
|
||||
window_width=self._window_width)
|
||||
proxy=self._proxy_for(site),
|
||||
cookie_db=site.get("cookie_db"),
|
||||
window_height=self._window_height,
|
||||
window_width=self._window_width,
|
||||
)
|
||||
final_page_url, outlinks = browser.browse_page(
|
||||
page.url, extra_headers=site.extra_headers(page),
|
||||
behavior_parameters=site.get('behavior_parameters'),
|
||||
username=site.get('username'), password=site.get('password'),
|
||||
user_agent=site.get('user_agent'),
|
||||
on_screenshot=_on_screenshot, on_response=_on_response,
|
||||
on_request=on_request,
|
||||
on_service_worker_version_updated=_on_service_worker_version_updated,
|
||||
hashtags=page.hashtags,
|
||||
skip_extract_outlinks=self._skip_extract_outlinks,
|
||||
skip_visit_hashtags=self._skip_visit_hashtags,
|
||||
skip_youtube_dl=self._skip_youtube_dl,
|
||||
simpler404=self._simpler404,
|
||||
screenshot_full_page=self._screenshot_full_page,
|
||||
page_timeout=self._page_timeout,
|
||||
behavior_timeout=self._behavior_timeout,
|
||||
extract_outlinks_timeout=self._extract_outlinks_timeout,
|
||||
download_throughput=self._download_throughput,
|
||||
stealth=self._stealth)
|
||||
page.url,
|
||||
extra_headers=site.extra_headers(page),
|
||||
behavior_parameters=site.get("behavior_parameters"),
|
||||
username=site.get("username"),
|
||||
password=site.get("password"),
|
||||
user_agent=site.get("user_agent"),
|
||||
on_screenshot=_on_screenshot,
|
||||
on_response=_on_response,
|
||||
on_request=on_request,
|
||||
on_service_worker_version_updated=_on_service_worker_version_updated,
|
||||
hashtags=page.hashtags,
|
||||
skip_extract_outlinks=self._skip_extract_outlinks,
|
||||
skip_visit_hashtags=self._skip_visit_hashtags,
|
||||
skip_youtube_dl=self._skip_youtube_dl,
|
||||
simpler404=self._simpler404,
|
||||
screenshot_full_page=self._screenshot_full_page,
|
||||
page_timeout=self._page_timeout,
|
||||
behavior_timeout=self._behavior_timeout,
|
||||
extract_outlinks_timeout=self._extract_outlinks_timeout,
|
||||
download_throughput=self._download_throughput,
|
||||
stealth=self._stealth,
|
||||
)
|
||||
if final_page_url != page.url:
|
||||
page.note_redirect(final_page_url)
|
||||
return outlinks
|
||||
@ -328,22 +399,21 @@ class BrozzlerWorker:
|
||||
def _fetch_url(self, site, url=None, page=None):
|
||||
proxies = None
|
||||
if page:
|
||||
url=page.url
|
||||
url = page.url
|
||||
if self._proxy_for(site):
|
||||
proxies = {
|
||||
'http': 'http://%s' % self._proxy_for(site),
|
||||
'https': 'http://%s' % self._proxy_for(site),
|
||||
"http": "http://%s" % self._proxy_for(site),
|
||||
"https": "http://%s" % self._proxy_for(site),
|
||||
}
|
||||
|
||||
self.logger.info('fetching %s', url)
|
||||
self.logger.info("fetching %s", url)
|
||||
try:
|
||||
# response is ignored
|
||||
requests.get(
|
||||
url, proxies=proxies, headers=site.extra_headers(page),
|
||||
verify=False)
|
||||
url, proxies=proxies, headers=site.extra_headers(page), verify=False
|
||||
)
|
||||
except requests.exceptions.ProxyError as e:
|
||||
raise brozzler.ProxyError(
|
||||
'proxy error fetching %s' % url) from e
|
||||
raise brozzler.ProxyError("proxy error fetching %s" % url) from e
|
||||
|
||||
def _needs_browsing(self, page, ydl_fetches):
|
||||
if ydl_fetches:
|
||||
@ -351,8 +421,10 @@ class BrozzlerWorker:
|
||||
if not final_bounces:
|
||||
return True
|
||||
for txn in final_bounces:
|
||||
if txn['response_headers'].get_content_type() in [
|
||||
'text/html', 'application/xhtml+xml']:
|
||||
if txn["response_headers"].get_content_type() in [
|
||||
"text/html",
|
||||
"application/xhtml+xml",
|
||||
]:
|
||||
return True
|
||||
return False
|
||||
else:
|
||||
@ -361,14 +433,13 @@ class BrozzlerWorker:
|
||||
def _already_fetched(self, page, ydl_fetches):
|
||||
if ydl_fetches:
|
||||
for fetch in ydl.final_bounces(ydl_fetches, page.url):
|
||||
if (fetch['method'] == 'GET' and fetch['response_code'] == 200):
|
||||
if fetch["method"] == "GET" and fetch["response_code"] == 200:
|
||||
return True
|
||||
return False
|
||||
|
||||
def brozzle_site(self, browser, site):
|
||||
try:
|
||||
site.last_claimed_by = '%s:%s' % (
|
||||
socket.gethostname(), browser.chrome.port)
|
||||
site.last_claimed_by = "%s:%s" % (socket.gethostname(), browser.chrome.port)
|
||||
site.save()
|
||||
start = time.time()
|
||||
page = None
|
||||
@ -377,28 +448,28 @@ class BrozzlerWorker:
|
||||
# _proxy_for() call in log statement can raise brozzler.ProxyError
|
||||
# which is why we honor time limit and stop request first☝🏻
|
||||
self.logger.info(
|
||||
"brozzling site (proxy=%r) %s",
|
||||
self._proxy_for(site), site)
|
||||
"brozzling site (proxy=%r) %s", self._proxy_for(site), site
|
||||
)
|
||||
while time.time() - start < self.SITE_SESSION_MINUTES * 60:
|
||||
site.refresh()
|
||||
self._frontier.enforce_time_limit(site)
|
||||
self._frontier.honor_stop_request(site)
|
||||
page = self._frontier.claim_page(site, "%s:%s" % (
|
||||
socket.gethostname(), browser.chrome.port))
|
||||
page = self._frontier.claim_page(
|
||||
site, "%s:%s" % (socket.gethostname(), browser.chrome.port)
|
||||
)
|
||||
|
||||
if (page.needs_robots_check and
|
||||
not brozzler.is_permitted_by_robots(
|
||||
site, page.url, self._proxy_for(site))):
|
||||
if page.needs_robots_check and not brozzler.is_permitted_by_robots(
|
||||
site, page.url, self._proxy_for(site)
|
||||
):
|
||||
logging.warning("page %s is blocked by robots.txt", page.url)
|
||||
page.blocked_by_robots = True
|
||||
self._frontier.completed_page(site, page)
|
||||
else:
|
||||
outlinks = self.brozzle_page(
|
||||
browser, site, page,
|
||||
enable_youtube_dl=not self._skip_youtube_dl)
|
||||
browser, site, page, enable_youtube_dl=not self._skip_youtube_dl
|
||||
)
|
||||
self._frontier.completed_page(site, page)
|
||||
self._frontier.scope_and_schedule_outlinks(
|
||||
site, page, outlinks)
|
||||
self._frontier.scope_and_schedule_outlinks(site, page, outlinks)
|
||||
if browser.is_running():
|
||||
site.cookie_db = browser.chrome.persist_and_read_cookie_db()
|
||||
|
||||
@ -418,31 +489,36 @@ class BrozzlerWorker:
|
||||
except brozzler.ProxyError as e:
|
||||
if self._warcprox_auto:
|
||||
logging.error(
|
||||
'proxy error (site.proxy=%s), will try to choose a '
|
||||
'healthy instance next time site is brozzled: %s',
|
||||
site.proxy, e)
|
||||
"proxy error (site.proxy=%s), will try to choose a "
|
||||
"healthy instance next time site is brozzled: %s",
|
||||
site.proxy,
|
||||
e,
|
||||
)
|
||||
site.proxy = None
|
||||
else:
|
||||
# using brozzler-worker --proxy, nothing to do but try the
|
||||
# same proxy again next time
|
||||
logging.error(
|
||||
'proxy error (self._proxy=%r)', self._proxy, exc_info=1)
|
||||
logging.error("proxy error (self._proxy=%r)", self._proxy, exc_info=1)
|
||||
except:
|
||||
self.logger.error(
|
||||
'unexpected exception site=%r page=%r', site, page,
|
||||
exc_info=True)
|
||||
"unexpected exception site=%r page=%r", site, page, exc_info=True
|
||||
)
|
||||
if page:
|
||||
page.failed_attempts = (page.failed_attempts or 0) + 1
|
||||
if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES:
|
||||
self.logger.info(
|
||||
'marking page "completed" after %s unexpected '
|
||||
'exceptions attempting to brozzle %s',
|
||||
page.failed_attempts, page)
|
||||
'marking page "completed" after %s unexpected '
|
||||
"exceptions attempting to brozzle %s",
|
||||
page.failed_attempts,
|
||||
page,
|
||||
)
|
||||
self._frontier.completed_page(site, page)
|
||||
page = None
|
||||
finally:
|
||||
if start:
|
||||
site.active_brozzling_time = (site.active_brozzling_time or 0) + time.time() - start
|
||||
site.active_brozzling_time = (
|
||||
(site.active_brozzling_time or 0) + time.time() - start
|
||||
)
|
||||
self._frontier.disclaim_site(site, page)
|
||||
|
||||
def _brozzle_site_thread_target(self, browser, site):
|
||||
@ -462,21 +538,25 @@ class BrozzlerWorker:
|
||||
"role": "brozzler-worker",
|
||||
"ttl": self.HEARTBEAT_INTERVAL * 3,
|
||||
}
|
||||
status_info["load"] = 1.0 * self._browser_pool.num_in_use() / self._browser_pool.size
|
||||
status_info["load"] = (
|
||||
1.0 * self._browser_pool.num_in_use() / self._browser_pool.size
|
||||
)
|
||||
status_info["browser_pool_size"] = self._browser_pool.size
|
||||
status_info["browsers_in_use"] = self._browser_pool.num_in_use()
|
||||
|
||||
try:
|
||||
self.status_info = self._service_registry.heartbeat(status_info)
|
||||
self.logger.trace(
|
||||
"status in service registry: %s", self.status_info)
|
||||
self.logger.trace("status in service registry: %s", self.status_info)
|
||||
except r.ReqlError as e:
|
||||
self.logger.error(
|
||||
"failed to send heartbeat and update service registry "
|
||||
"with info %s: %s", status_info, e)
|
||||
"failed to send heartbeat and update service registry "
|
||||
"with info %s: %s",
|
||||
status_info,
|
||||
e,
|
||||
)
|
||||
|
||||
def _service_heartbeat_if_due(self):
|
||||
'''Sends service registry heartbeat if due'''
|
||||
"""Sends service registry heartbeat if due"""
|
||||
due = False
|
||||
if self._service_registry:
|
||||
if not hasattr(self, "status_info"):
|
||||
@ -489,15 +569,16 @@ class BrozzlerWorker:
|
||||
self._service_heartbeat()
|
||||
|
||||
def _start_browsing_some_sites(self):
|
||||
'''
|
||||
"""
|
||||
Starts browsing some sites.
|
||||
|
||||
Raises:
|
||||
NoBrowsersAvailable if none available
|
||||
'''
|
||||
"""
|
||||
# acquire_multi() raises NoBrowsersAvailable if none available
|
||||
browsers = self._browser_pool.acquire_multi(
|
||||
(self._browser_pool.num_available() + 1) // 2)
|
||||
(self._browser_pool.num_available() + 1) // 2
|
||||
)
|
||||
try:
|
||||
sites = self._frontier.claim_sites(len(browsers))
|
||||
except:
|
||||
@ -507,10 +588,11 @@ class BrozzlerWorker:
|
||||
for i in range(len(browsers)):
|
||||
if i < len(sites):
|
||||
th = threading.Thread(
|
||||
target=self._brozzle_site_thread_target,
|
||||
args=(browsers[i], sites[i]),
|
||||
name="BrozzlingThread:%s" % browsers[i].chrome.port,
|
||||
daemon=True)
|
||||
target=self._brozzle_site_thread_target,
|
||||
args=(browsers[i], sites[i]),
|
||||
name="BrozzlingThread:%s" % browsers[i].chrome.port,
|
||||
daemon=True,
|
||||
)
|
||||
with self._browsing_threads_lock:
|
||||
self._browsing_threads.add(th)
|
||||
th.start()
|
||||
@ -519,7 +601,8 @@ class BrozzlerWorker:
|
||||
|
||||
def run(self):
|
||||
self.logger.notice(
|
||||
'brozzler %s - brozzler-worker starting', brozzler.__version__)
|
||||
"brozzler %s - brozzler-worker starting", brozzler.__version__
|
||||
)
|
||||
last_nothing_to_claim = 0
|
||||
try:
|
||||
while not self._shutdown.is_set():
|
||||
@ -528,39 +611,38 @@ class BrozzlerWorker:
|
||||
try:
|
||||
self._start_browsing_some_sites()
|
||||
except brozzler.browser.NoBrowsersAvailable:
|
||||
logging.trace(
|
||||
"all %s browsers are in use",
|
||||
self._max_browsers)
|
||||
logging.trace("all %s browsers are in use", self._max_browsers)
|
||||
except brozzler.NothingToClaim:
|
||||
last_nothing_to_claim = time.time()
|
||||
logging.trace(
|
||||
"nothing to claim, all available active sites "
|
||||
"are already claimed by a brozzler worker")
|
||||
"nothing to claim, all available active sites "
|
||||
"are already claimed by a brozzler worker"
|
||||
)
|
||||
time.sleep(0.5)
|
||||
|
||||
self.logger.notice("shutdown requested")
|
||||
except r.ReqlError as e:
|
||||
self.logger.error(
|
||||
"caught rethinkdb exception, will try to proceed",
|
||||
exc_info=True)
|
||||
"caught rethinkdb exception, will try to proceed", exc_info=True
|
||||
)
|
||||
except brozzler.ShutdownRequested:
|
||||
self.logger.info("shutdown requested")
|
||||
except:
|
||||
self.logger.critical(
|
||||
"thread exiting due to unexpected exception",
|
||||
exc_info=True)
|
||||
"thread exiting due to unexpected exception", exc_info=True
|
||||
)
|
||||
finally:
|
||||
if self._service_registry and hasattr(self, "status_info"):
|
||||
try:
|
||||
self._service_registry.unregister(self.status_info["id"])
|
||||
except:
|
||||
self.logger.error(
|
||||
"failed to unregister from service registry",
|
||||
exc_info=True)
|
||||
"failed to unregister from service registry", exc_info=True
|
||||
)
|
||||
|
||||
self.logger.info(
|
||||
'shutting down %s brozzling threads',
|
||||
len(self._browsing_threads))
|
||||
"shutting down %s brozzling threads", len(self._browsing_threads)
|
||||
)
|
||||
with self._browsing_threads_lock:
|
||||
for th in self._browsing_threads:
|
||||
if th.is_alive():
|
||||
@ -575,11 +657,10 @@ class BrozzlerWorker:
|
||||
with self._start_stop_lock:
|
||||
if self._thread:
|
||||
self.logger.warning(
|
||||
'ignoring start request because self._thread is '
|
||||
'not None')
|
||||
"ignoring start request because self._thread is " "not None"
|
||||
)
|
||||
return
|
||||
self._thread = threading.Thread(
|
||||
target=self.run, name="BrozzlerWorker")
|
||||
self._thread = threading.Thread(target=self.run, name="BrozzlerWorker")
|
||||
self._thread.start()
|
||||
|
||||
def shutdown_now(self):
|
||||
@ -590,4 +671,3 @@ class BrozzlerWorker:
|
||||
|
||||
def is_alive(self):
|
||||
return self._thread and self._thread.is_alive()
|
||||
|
||||
|
299
brozzler/ydl.py
299
brozzler/ydl.py
@ -1,4 +1,4 @@
'''
"""
brozzler/ydl.py - youtube-dl / yt-dlp support for brozzler

Copyright (C) 2023 Internet Archive
@ -14,7 +14,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""

import logging
import yt_dlp
@ -31,6 +31,7 @@ import threading

thread_local = threading.local()


class ExtraHeaderAdder(urllib.request.BaseHandler):
|
||||
def __init__(self, extra_headers):
|
||||
self.extra_headers = extra_headers
|
||||
@ -43,6 +44,7 @@ class ExtraHeaderAdder(urllib.request.BaseHandler):
|
||||
req.add_header(h, v)
|
||||
return req
|
||||
|
||||
|
||||
class YoutubeDLSpy(urllib.request.BaseHandler):
|
||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||
|
||||
@ -51,10 +53,10 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
|
||||
|
||||
def _http_response(self, request, response):
|
||||
fetch = {
|
||||
'url': request.full_url,
|
||||
'method': request.get_method(),
|
||||
'response_code': response.code,
|
||||
'response_headers': response.headers,
|
||||
"url": request.full_url,
|
||||
"method": request.get_method(),
|
||||
"response_code": response.code,
|
||||
"response_headers": response.headers,
|
||||
}
|
||||
self.fetches.append(fetch)
|
||||
return response
|
||||
@ -64,6 +66,7 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
|
||||
def reset(self):
|
||||
self.fetches = []
|
||||
|
||||
|
||||
def final_bounces(fetches, url):
|
||||
"""
|
||||
Resolves redirect chains in `fetches` and returns a list of fetches
|
||||
@ -73,26 +76,28 @@ def final_bounces(fetches, url):
|
||||
"""
|
||||
redirects = {}
|
||||
for fetch in fetches:
|
||||
# XXX check http status 301,302,303,307? check for "uri" header
|
||||
# as well as "location"? see urllib.request.HTTPRedirectHandler
|
||||
if 'location' in fetch['response_headers']:
|
||||
redirects[fetch['url']] = fetch
|
||||
# XXX check http status 301,302,303,307? check for "uri" header
|
||||
# as well as "location"? see urllib.request.HTTPRedirectHandler
|
||||
if "location" in fetch["response_headers"]:
|
||||
redirects[fetch["url"]] = fetch
|
||||
|
||||
final_url = url
|
||||
while final_url in redirects:
|
||||
fetch = redirects.pop(final_url)
|
||||
final_url = urllib.parse.urljoin(
|
||||
fetch['url'], fetch['response_headers']['location'])
|
||||
fetch["url"], fetch["response_headers"]["location"]
|
||||
)
|
||||
|
||||
final_bounces = []
|
||||
for fetch in fetches:
|
||||
if fetch['url'] == final_url:
|
||||
if fetch["url"] == final_url:
|
||||
final_bounces.append(fetch)
|
||||
|
||||
return final_bounces
|
||||
|
||||
|
||||
def _build_youtube_dl(worker, destdir, site, page):
|
||||
'''
|
||||
"""
|
||||
Builds a yt-dlp `yt_dlp.YoutubeDL` for brozzling `site` with `worker`.
|
||||
|
||||
The `YoutubeDL` instance does a few special brozzler-specific things:
|
||||
@ -109,7 +114,7 @@ def _build_youtube_dl(worker, destdir, site, page):
|
||||
|
||||
Returns:
|
||||
a yt-dlp `yt_dlp.YoutubeDL` instance
|
||||
'''
|
||||
"""
|
||||
|
||||
class _YoutubeDL(yt_dlp.YoutubeDL):
|
||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||
@ -117,31 +122,38 @@ def _build_youtube_dl(worker, destdir, site, page):
|
||||
def add_default_extra_info(self, ie_result, ie, url):
|
||||
# hook in some logging
|
||||
super().add_default_extra_info(ie_result, ie, url)
|
||||
if ie_result.get('_type') == 'playlist':
|
||||
self.logger.info(
|
||||
'extractor %r found playlist in %s', ie.IE_NAME, url)
|
||||
if ie.IE_NAME in {'youtube:playlist', 'youtube:tab', 'soundcloud:user', 'instagram:user'}:
|
||||
if ie_result.get("_type") == "playlist":
|
||||
self.logger.info("extractor %r found playlist in %s", ie.IE_NAME, url)
|
||||
if ie.IE_NAME in {
|
||||
"youtube:playlist",
|
||||
"youtube:tab",
|
||||
"soundcloud:user",
|
||||
"instagram:user",
|
||||
}:
|
||||
# At this point ie_result['entries'] is an iterator that
|
||||
# will fetch more metadata from youtube to list all the
|
||||
# videos. We unroll that iterator here partly because
|
||||
# otherwise `process_ie_result()` will clobber it, and we
|
||||
# use it later to extract the watch pages as outlinks.
|
||||
try:
|
||||
ie_result['entries_no_dl'] = list(ie_result['entries'])
|
||||
ie_result["entries_no_dl"] = list(ie_result["entries"])
|
||||
except Exception as e:
|
||||
self.logger.warning(
|
||||
"failed to unroll ie_result['entries']? for %s, %s; exception %s",
|
||||
ie.IE_NAME, url, e)
|
||||
ie_result['entries_no_dl'] =[]
|
||||
ie_result['entries'] = []
|
||||
"failed to unroll ie_result['entries']? for %s, %s; exception %s",
|
||||
ie.IE_NAME,
|
||||
url,
|
||||
e,
|
||||
)
|
||||
ie_result["entries_no_dl"] = []
|
||||
ie_result["entries"] = []
|
||||
self.logger.info(
|
||||
'not downloading %s media files from this '
|
||||
'playlist because we expect to capture them from '
|
||||
'individual watch/track/detail pages',
|
||||
len(ie_result['entries_no_dl']))
|
||||
"not downloading %s media files from this "
|
||||
"playlist because we expect to capture them from "
|
||||
"individual watch/track/detail pages",
|
||||
len(ie_result["entries_no_dl"]),
|
||||
)
|
||||
else:
|
||||
self.logger.info(
|
||||
'extractor %r found a download in %s', ie.IE_NAME, url)
|
||||
self.logger.info("extractor %r found a download in %s", ie.IE_NAME, url)
|
||||
|
||||
def _push_video_to_warcprox(self, site, info_dict, postprocessor):
|
||||
# 220211 update: does yt-dlp supply content-type? no, not as such
|
||||
@ -150,73 +162,96 @@ def _build_youtube_dl(worker, destdir, site, page):
|
||||
# youtube-dl produces a stitched-up video that /usr/bin/file fails
|
||||
# to identify (says "application/octet-stream"). `ffprobe` doesn't
|
||||
# give us a mimetype.
|
||||
if info_dict.get('ext') == 'mp4':
|
||||
mimetype = 'video/mp4'
|
||||
if info_dict.get("ext") == "mp4":
|
||||
mimetype = "video/mp4"
|
||||
else:
|
||||
try:
|
||||
import magic
|
||||
mimetype = magic.from_file(info_dict['filepath'], mime=True)
|
||||
|
||||
mimetype = magic.from_file(info_dict["filepath"], mime=True)
|
||||
except ImportError as e:
|
||||
mimetype = 'video/%s' % info_dict['ext']
|
||||
self.logger.warning(
|
||||
'guessing mimetype %s because %r', mimetype, e)
|
||||
mimetype = "video/%s" % info_dict["ext"]
|
||||
self.logger.warning("guessing mimetype %s because %r", mimetype, e)
|
||||
|
||||
# youtube watch page postprocessor is MoveFiles
|
||||
|
||||
if postprocessor == 'FixupM3u8' or postprocessor == 'Merger':
|
||||
url = 'youtube-dl:%05d:%s' % (
|
||||
info_dict.get('playlist_index') or 1,
|
||||
info_dict['webpage_url'])
|
||||
if postprocessor == "FixupM3u8" or postprocessor == "Merger":
|
||||
url = "youtube-dl:%05d:%s" % (
|
||||
info_dict.get("playlist_index") or 1,
|
||||
info_dict["webpage_url"],
|
||||
)
|
||||
else:
|
||||
url = info_dict.get('url', '')
|
||||
url = info_dict.get("url", "")
|
||||
|
||||
# skip urls ending .m3u8, to avoid duplicates handled by FixupM3u8
|
||||
if url.endswith('.m3u8') or url == '':
|
||||
if url.endswith(".m3u8") or url == "":
|
||||
return
|
||||
|
||||
size = os.path.getsize(info_dict['filepath'])
|
||||
size = os.path.getsize(info_dict["filepath"])
|
||||
self.logger.info(
|
||||
'pushing %r video as %s (%s bytes) to '
|
||||
'warcprox at %s with url %s', info_dict['format'],
|
||||
mimetype, size, worker._proxy_for(site), url)
|
||||
with open(info_dict['filepath'], 'rb') as f:
|
||||
"pushing %r video as %s (%s bytes) to " "warcprox at %s with url %s",
|
||||
info_dict["format"],
|
||||
mimetype,
|
||||
size,
|
||||
worker._proxy_for(site),
|
||||
url,
|
||||
)
|
||||
with open(info_dict["filepath"], "rb") as f:
|
||||
# include content-length header to avoid chunked
|
||||
# transfer, which warcprox currently rejects
|
||||
extra_headers = dict(site.extra_headers())
|
||||
extra_headers['content-length'] = size
|
||||
extra_headers["content-length"] = size
|
||||
request, response = worker._warcprox_write_record(
|
||||
warcprox_address=worker._proxy_for(site), url=url,
|
||||
warc_type='resource', content_type=mimetype, payload=f,
|
||||
extra_headers=extra_headers)
|
||||
warcprox_address=worker._proxy_for(site),
|
||||
url=url,
|
||||
warc_type="resource",
|
||||
content_type=mimetype,
|
||||
payload=f,
|
||||
extra_headers=extra_headers,
|
||||
)
|
||||
# consulted by _remember_videos()
|
||||
ydl.pushed_videos.append({
|
||||
'url': url,
|
||||
'response_code': response.code,
|
||||
'content-type': mimetype,
|
||||
'content-length': size,
|
||||
})
|
||||
ydl.pushed_videos.append(
|
||||
{
|
||||
"url": url,
|
||||
"response_code": response.code,
|
||||
"content-type": mimetype,
|
||||
"content-length": size,
|
||||
}
|
||||
)
|
||||
|
||||
def maybe_heartbeat_site_last_claimed(*args, **kwargs):
|
||||
# in case yt-dlp takes a long time, heartbeat site.last_claimed
|
||||
# to prevent another brozzler-worker from claiming the site
|
||||
try:
|
||||
if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=worker.SITE_SESSION_MINUTES):
|
||||
if (
|
||||
site.rr
|
||||
and doublethink.utcnow() - site.last_claimed
|
||||
> datetime.timedelta(minutes=worker.SITE_SESSION_MINUTES)
|
||||
):
|
||||
worker.logger.debug(
|
||||
'heartbeating site.last_claimed to prevent another '
|
||||
'brozzler-worker claiming this site id=%r', site.id)
|
||||
"heartbeating site.last_claimed to prevent another "
|
||||
"brozzler-worker claiming this site id=%r",
|
||||
site.id,
|
||||
)
|
||||
site.last_claimed = doublethink.utcnow()
|
||||
site.save()
|
||||
except:
|
||||
worker.logger.debug(
|
||||
'problem heartbeating site.last_claimed site id=%r',
|
||||
site.id, exc_info=True)
|
||||
"problem heartbeating site.last_claimed site id=%r",
|
||||
site.id,
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
def ydl_postprocess_hook(d):
|
||||
if d['status'] == 'finished':
|
||||
worker.logger.info('[ydl_postprocess_hook] Finished postprocessing')
|
||||
worker.logger.info('[ydl_postprocess_hook] postprocessor: {}'.format(d['postprocessor']))
|
||||
if d["status"] == "finished":
|
||||
worker.logger.info("[ydl_postprocess_hook] Finished postprocessing")
|
||||
worker.logger.info(
|
||||
"[ydl_postprocess_hook] postprocessor: {}".format(d["postprocessor"])
|
||||
)
|
||||
if worker._using_warcprox(site):
|
||||
_YoutubeDL._push_video_to_warcprox(_YoutubeDL, site, d['info_dict'], d['postprocessor'])
|
||||
_YoutubeDL._push_video_to_warcprox(
|
||||
_YoutubeDL, site, d["info_dict"], d["postprocessor"]
|
||||
)
|
||||
|
||||
# default socket_timeout is 20 -- we hit it often when cluster is busy
|
||||
ydl_opts = {
|
||||
@ -230,7 +265,6 @@ def _build_youtube_dl(worker, destdir, site, page):
|
||||
"socket_timeout": 40,
|
||||
"progress_hooks": [maybe_heartbeat_site_last_claimed],
|
||||
"postprocessor_hooks": [ydl_postprocess_hook],
|
||||
|
||||
# https://github.com/yt-dlp/yt-dlp#format-selection
|
||||
# "By default, yt-dlp tries to download the best available quality..."
|
||||
# pre-v.2023.07.06: "format_sort": ["ext"],
|
||||
@ -238,16 +272,13 @@ def _build_youtube_dl(worker, destdir, site, page):
|
||||
# recommended: convert working cli to api call with
|
||||
# https://github.com/yt-dlp/yt-dlp/blob/master/devscripts/cli_to_api.py
|
||||
"format": "b/bv+ba",
|
||||
"format_sort": ["res:720","vcodec:h264","acodec:aac"],
|
||||
"format_sort": ["res:720", "vcodec:h264", "acodec:aac"],
|
||||
# skip live streams
|
||||
"match_filter": match_filter_func("!is_live"),
|
||||
|
||||
"extractor_args": {'youtube': {'skip': ['dash', 'hls']}},
|
||||
|
||||
"extractor_args": {"youtube": {"skip": ["dash", "hls"]}},
|
||||
# --cache-dir local or..
|
||||
# this looked like a problem with nsf-mounted homedir, shouldn't be a problem for brozzler on focal?
|
||||
"cache_dir": "/home/archiveit",
|
||||
|
||||
"logger": logging.getLogger("yt_dlp"),
|
||||
"verbose": False,
|
||||
"quiet": False,
|
||||
@ -265,49 +296,53 @@ def _build_youtube_dl(worker, destdir, site, page):
|
||||
ydl._opener.add_handler(ydl.fetch_spy)
|
||||
return ydl
|
||||
|
||||
|
||||
def _remember_videos(page, fetches, pushed_videos=None):
|
||||
'''
|
||||
"""
|
||||
Saves info about videos captured by yt-dlp in `page.videos`.
|
||||
'''
|
||||
if not 'videos' in page:
|
||||
"""
|
||||
if not "videos" in page:
|
||||
page.videos = []
|
||||
for fetch in fetches or []:
|
||||
content_type = fetch['response_headers'].get_content_type()
|
||||
if (content_type.startswith('video/')
|
||||
# skip manifests of DASH segmented video -
|
||||
# see https://github.com/internetarchive/brozzler/pull/70
|
||||
and content_type != 'video/vnd.mpeg.dash.mpd'
|
||||
and fetch['method'] == 'GET'
|
||||
and fetch['response_code'] in (200, 206)):
|
||||
content_type = fetch["response_headers"].get_content_type()
|
||||
if (
|
||||
content_type.startswith("video/")
|
||||
# skip manifests of DASH segmented video -
|
||||
# see https://github.com/internetarchive/brozzler/pull/70
|
||||
and content_type != "video/vnd.mpeg.dash.mpd"
|
||||
and fetch["method"] == "GET"
|
||||
and fetch["response_code"] in (200, 206)
|
||||
):
|
||||
video = {
|
||||
'blame': 'youtube-dl',
|
||||
'url': fetch['url'],
|
||||
'response_code': fetch['response_code'],
|
||||
'content-type': content_type,
|
||||
"blame": "youtube-dl",
|
||||
"url": fetch["url"],
|
||||
"response_code": fetch["response_code"],
|
||||
"content-type": content_type,
|
||||
}
|
||||
if 'content-length' in fetch['response_headers']:
|
||||
video['content-length'] = int(
|
||||
fetch['response_headers']['content-length'])
|
||||
if 'content-range' in fetch['response_headers']:
|
||||
if "content-length" in fetch["response_headers"]:
|
||||
video["content-length"] = int(
|
||||
fetch["response_headers"]["content-length"]
|
||||
)
|
||||
if "content-range" in fetch["response_headers"]:
|
||||
# skip chunked youtube video
|
||||
if 'googlevideo.com/videoplayback' in fetch['url']:
|
||||
if "googlevideo.com/videoplayback" in fetch["url"]:
|
||||
continue
|
||||
video['content-range'] = fetch[
|
||||
'response_headers']['content-range']
|
||||
logging.debug('embedded video %s', video)
|
||||
video["content-range"] = fetch["response_headers"]["content-range"]
|
||||
logging.debug("embedded video %s", video)
|
||||
page.videos.append(video)
|
||||
for pushed_video in pushed_videos or []:
|
||||
if pushed_video['content-type'].startswith('video/'):
|
||||
if pushed_video["content-type"].startswith("video/"):
|
||||
video = {
|
||||
'blame': 'youtube-dl',
|
||||
'url': pushed_video['url'],
|
||||
'response_code': pushed_video['response_code'],
|
||||
'content-type': pushed_video['content-type'],
|
||||
'content-length': pushed_video['content-length'],
|
||||
"blame": "youtube-dl",
|
||||
"url": pushed_video["url"],
|
||||
"response_code": pushed_video["response_code"],
|
||||
"content-type": pushed_video["content-type"],
|
||||
"content-length": pushed_video["content-length"],
|
||||
}
|
||||
logging.debug('embedded video %s', video)
|
||||
logging.debug("embedded video %s", video)
|
||||
page.videos.append(video)
|
||||
|
||||
|
||||
def _try_youtube_dl(worker, ydl, site, page):
|
||||
try:
|
||||
logging.info("trying yt-dlp on %s", page)
|
||||
@ -317,43 +352,53 @@ def _try_youtube_dl(worker, ydl, site, page):
|
||||
# no host given>" resulting in ProxyError
|
||||
# needs automated test
|
||||
# and yt-dlp needs sanitize_info for extract_info
|
||||
ie_result = ydl.sanitize_info(ydl.extract_info(str(urlcanon.whatwg(page.url))))
|
||||
ie_result = ydl.sanitize_info(
|
||||
ydl.extract_info(str(urlcanon.whatwg(page.url)))
|
||||
)
|
||||
_remember_videos(page, ydl.fetch_spy.fetches, ydl.pushed_videos)
|
||||
if worker._using_warcprox(site):
|
||||
info_json = json.dumps(ie_result, sort_keys=True, indent=4)
|
||||
logging.info(
|
||||
"sending WARCPROX_WRITE_RECORD request to warcprox "
|
||||
"with yt-dlp json for %s", page)
|
||||
"sending WARCPROX_WRITE_RECORD request to warcprox "
|
||||
"with yt-dlp json for %s",
|
||||
page,
|
||||
)
|
||||
worker._warcprox_write_record(
|
||||
warcprox_address=worker._proxy_for(site),
|
||||
url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
|
||||
warc_type="metadata",
|
||||
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
|
||||
payload=info_json.encode("utf-8"),
|
||||
extra_headers=site.extra_headers(page))
|
||||
warcprox_address=worker._proxy_for(site),
|
||||
url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
|
||||
warc_type="metadata",
|
||||
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
|
||||
payload=info_json.encode("utf-8"),
|
||||
extra_headers=site.extra_headers(page),
|
||||
)
|
||||
return ie_result
|
||||
except brozzler.ShutdownRequested as e:
|
||||
raise
|
||||
except Exception as e:
|
||||
if hasattr(e, "exc_info") and e.exc_info[0] == yt_dlp.utils.UnsupportedError:
|
||||
return None
|
||||
elif (hasattr(e, "exc_info")
|
||||
and e.exc_info[0] == urllib.error.HTTPError
|
||||
and hasattr(e.exc_info[1], "code")
|
||||
and e.exc_info[1].code == 420):
|
||||
elif (
|
||||
hasattr(e, "exc_info")
|
||||
and e.exc_info[0] == urllib.error.HTTPError
|
||||
and hasattr(e.exc_info[1], "code")
|
||||
and e.exc_info[1].code == 420
|
||||
):
|
||||
raise brozzler.ReachedLimit(e.exc_info[1])
|
||||
elif (hasattr(e, 'exc_info')
|
||||
and e.exc_info[0] == urllib.error.URLError
|
||||
and worker._proxy_for(site)):
|
||||
elif (
|
||||
hasattr(e, "exc_info")
|
||||
and e.exc_info[0] == urllib.error.URLError
|
||||
and worker._proxy_for(site)
|
||||
):
|
||||
# connection problem when using a proxy == proxy error (XXX?)
|
||||
raise brozzler.ProxyError(
|
||||
'yt-dlp hit apparent proxy error from '
|
||||
'%s' % page.url) from e
|
||||
"yt-dlp hit apparent proxy error from " "%s" % page.url
|
||||
) from e
|
||||
else:
|
||||
raise
|
||||
|
||||
|
||||
def do_youtube_dl(worker, site, page):
|
||||
'''
|
||||
"""
|
||||
Runs yt-dlp configured for `worker` and `site` to download videos from
|
||||
`page`.
|
||||
|
||||
@ -372,15 +417,19 @@ def do_youtube_dl(worker, site, page):
|
||||
'response_headers': ...,
|
||||
}, ...]
|
||||
`list` of `str`: outlink urls
|
||||
'''
|
||||
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
|
||||
"""
|
||||
with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
|
||||
ydl = _build_youtube_dl(worker, tempdir, site, page)
|
||||
ie_result = _try_youtube_dl(worker, ydl, site, page)
|
||||
outlinks = set()
|
||||
if ie_result and (ie_result.get('extractor') == 'youtube:playlist' or
|
||||
ie_result.get('extractor') == 'youtube:tab'):
|
||||
if ie_result and (
|
||||
ie_result.get("extractor") == "youtube:playlist"
|
||||
or ie_result.get("extractor") == "youtube:tab"
|
||||
):
|
||||
# youtube watch pages as outlinks
|
||||
outlinks = {'https://www.youtube.com/watch?v=%s' % e['id']
|
||||
for e in ie_result.get('entries_no_dl', [])}
|
||||
outlinks = {
|
||||
"https://www.youtube.com/watch?v=%s" % e["id"]
|
||||
for e in ie_result.get("entries_no_dl", [])
|
||||
}
|
||||
# any outlinks for other cases?
|
||||
return ydl.fetch_spy.fetches, outlinks
|
||||
|
147
setup.py
147
setup.py
@ -1,5 +1,5 @@
#!/usr/bin/env python
'''
"""
setup.py - brozzler setup script

Copyright (C) 2014-2024 Internet Archive
@ -15,89 +15,88 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""

import setuptools
import os


def find_package_data(package):
|
||||
pkg_data = []
|
||||
depth = len(package.split('.'))
|
||||
path = os.path.join(*package.split('.'))
|
||||
depth = len(package.split("."))
|
||||
path = os.path.join(*package.split("."))
|
||||
for dirpath, dirnames, filenames in os.walk(path):
|
||||
if not os.path.exists(os.path.join(dirpath, '__init__.py')):
|
||||
if not os.path.exists(os.path.join(dirpath, "__init__.py")):
|
||||
relpath = os.path.join(*dirpath.split(os.sep)[depth:])
|
||||
pkg_data.extend(os.path.join(relpath, f) for f in filenames)
|
||||
return pkg_data
|
||||
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.5.44',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
author_email='nlevitt@archive.org',
|
||||
long_description=open('README.rst', mode='rb').read().decode('UTF-8'),
|
||||
license='Apache License 2.0',
|
||||
packages=['brozzler', 'brozzler.dashboard'],
|
||||
package_data={
|
||||
'brozzler': [
|
||||
'js-templates/*.js*', 'behaviors.yaml', 'job_schema.yaml'],
|
||||
'brozzler.dashboard': find_package_data('brozzler.dashboard'),
|
||||
},
|
||||
entry_points={
|
||||
'console_scripts': [
|
||||
'brozzle-page=brozzler.cli:brozzle_page',
|
||||
'brozzler-new-job=brozzler.cli:brozzler_new_job',
|
||||
'brozzler-new-site=brozzler.cli:brozzler_new_site',
|
||||
'brozzler-worker=brozzler.cli:brozzler_worker',
|
||||
'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables',
|
||||
'brozzler-list-captures=brozzler.cli:brozzler_list_captures',
|
||||
'brozzler-list-jobs=brozzler.cli:brozzler_list_jobs',
|
||||
'brozzler-list-sites=brozzler.cli:brozzler_list_sites',
|
||||
'brozzler-list-pages=brozzler.cli:brozzler_list_pages',
|
||||
'brozzler-stop-crawl=brozzler.cli:brozzler_stop_crawl',
|
||||
'brozzler-purge=brozzler.cli:brozzler_purge',
|
||||
'brozzler-dashboard=brozzler.dashboard:main',
|
||||
'brozzler-easy=brozzler.easy:main',
|
||||
'brozzler-wayback=brozzler.pywb:main',
|
||||
],
|
||||
},
|
||||
install_requires=[
|
||||
'PyYAML>=5.1',
|
||||
'yt_dlp<2023.11.16',
|
||||
'reppy==0.3.4',
|
||||
'requests>=2.21',
|
||||
'websocket-client>=0.39.0,<=0.48.0',
|
||||
'pillow>=5.2.0',
|
||||
'urlcanon>=0.1.dev23',
|
||||
'doublethink @ git+https://github.com/internetarchive/doublethink.git@Py311',
|
||||
'rethinkdb<2.4.10',
|
||||
'cerberus>=1.0.1',
|
||||
'jinja2>=2.10',
|
||||
'cryptography>=2.3',
|
||||
'python-magic>=0.4.15',
|
||||
name="brozzler",
|
||||
version="1.5.44",
|
||||
description="Distributed web crawling with browsers",
|
||||
url="https://github.com/internetarchive/brozzler",
|
||||
author="Noah Levitt",
|
||||
author_email="nlevitt@archive.org",
|
||||
long_description=open("README.rst", mode="rb").read().decode("UTF-8"),
|
||||
license="Apache License 2.0",
|
||||
packages=["brozzler", "brozzler.dashboard"],
|
||||
package_data={
|
||||
"brozzler": ["js-templates/*.js*", "behaviors.yaml", "job_schema.yaml"],
|
||||
"brozzler.dashboard": find_package_data("brozzler.dashboard"),
|
||||
},
|
||||
entry_points={
|
||||
"console_scripts": [
|
||||
"brozzle-page=brozzler.cli:brozzle_page",
|
||||
"brozzler-new-job=brozzler.cli:brozzler_new_job",
|
||||
"brozzler-new-site=brozzler.cli:brozzler_new_site",
|
||||
"brozzler-worker=brozzler.cli:brozzler_worker",
|
||||
"brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables",
|
||||
"brozzler-list-captures=brozzler.cli:brozzler_list_captures",
|
||||
"brozzler-list-jobs=brozzler.cli:brozzler_list_jobs",
|
||||
"brozzler-list-sites=brozzler.cli:brozzler_list_sites",
|
||||
"brozzler-list-pages=brozzler.cli:brozzler_list_pages",
|
||||
"brozzler-stop-crawl=brozzler.cli:brozzler_stop_crawl",
|
||||
"brozzler-purge=brozzler.cli:brozzler_purge",
|
||||
"brozzler-dashboard=brozzler.dashboard:main",
|
||||
"brozzler-easy=brozzler.easy:main",
|
||||
"brozzler-wayback=brozzler.pywb:main",
|
||||
],
|
||||
extras_require={
|
||||
'dashboard': [
|
||||
'flask>=1.0',
|
||||
'gunicorn>=19.8.1'
|
||||
],
|
||||
'easy': [
|
||||
'warcprox>=2.4.31',
|
||||
'pywb>=0.33.2,<2',
|
||||
'flask>=1.0',
|
||||
'gunicorn>=19.8.1'
|
||||
],
|
||||
},
|
||||
zip_safe=False,
|
||||
classifiers=[
|
||||
'Development Status :: 5 - Production/Stable',
|
||||
'Environment :: Console',
|
||||
'License :: OSI Approved :: Apache Software License',
|
||||
'Programming Language :: Python :: 3.5',
|
||||
'Programming Language :: Python :: 3.6',
|
||||
'Programming Language :: Python :: 3.7',
|
||||
'Topic :: Internet :: WWW/HTTP',
|
||||
'Topic :: System :: Archiving',
|
||||
])
|
||||
},
|
||||
install_requires=[
|
||||
"PyYAML>=5.1",
|
||||
"yt_dlp<2023.11.16",
|
||||
"reppy==0.3.4",
|
||||
"requests>=2.21",
|
||||
"websocket-client>=0.39.0,<=0.48.0",
|
||||
"pillow>=5.2.0",
|
||||
"urlcanon>=0.1.dev23",
|
||||
"doublethink @ git+https://github.com/internetarchive/doublethink.git@Py311",
|
||||
"rethinkdb<2.4.10",
|
||||
"cerberus>=1.0.1",
|
||||
"jinja2>=2.10",
|
||||
"cryptography>=2.3",
|
||||
"python-magic>=0.4.15",
|
||||
],
|
||||
extras_require={
|
||||
"dashboard": ["flask>=1.0", "gunicorn>=19.8.1"],
|
||||
"easy": [
|
||||
"warcprox>=2.4.31",
|
||||
"pywb>=0.33.2,<2",
|
||||
"flask>=1.0",
|
||||
"gunicorn>=19.8.1",
|
||||
],
|
||||
},
|
||||
zip_safe=False,
|
||||
classifiers=[
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
"Environment :: Console",
|
||||
"License :: OSI Approved :: Apache Software License",
|
||||
"Programming Language :: Python :: 3.5",
|
||||
"Programming Language :: Python :: 3.6",
|
||||
"Programming Language :: Python :: 3.7",
|
||||
"Topic :: Internet :: WWW/HTTP",
|
||||
"Topic :: System :: Archiving",
|
||||
],
|
||||
)
|
||||
|
@@ -1,5 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
'''
|
||||
"""
|
||||
test_brozzling.py - XXX explain
|
||||
|
||||
Copyright (C) 2016-2018 Internet Archive
|
||||
@@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
'''
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import brozzler
|
||||
@@ -34,79 +34,81 @@ args.log_level = logging.INFO
|
||||
brozzler.cli.configure_logging(args)
|
||||
|
||||
WARCPROX_META_420 = {
|
||||
'stats': {
|
||||
'test_limits_bucket': {
|
||||
'total': {'urls': 0, 'wire_bytes': 0},
|
||||
'new': {'urls': 0, 'wire_bytes': 0},
|
||||
'revisit': {'urls': 0, 'wire_bytes': 0},
|
||||
'bucket': 'test_limits_bucket'
|
||||
"stats": {
|
||||
"test_limits_bucket": {
|
||||
"total": {"urls": 0, "wire_bytes": 0},
|
||||
"new": {"urls": 0, "wire_bytes": 0},
|
||||
"revisit": {"urls": 0, "wire_bytes": 0},
|
||||
"bucket": "test_limits_bucket",
|
||||
}
|
||||
},
|
||||
'reached-limit': {'test_limits_bucket/total/urls': 0}
|
||||
"reached-limit": {"test_limits_bucket/total/urls": 0},
|
||||
}
|
||||
|
||||
@pytest.fixture(scope='module')
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def httpd(request):
|
||||
class RequestHandler(http.server.SimpleHTTPRequestHandler):
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.extensions_map['.mpd'] = 'video/vnd.mpeg.dash.mpd'
|
||||
self.extensions_map[".mpd"] = "video/vnd.mpeg.dash.mpd"
|
||||
http.server.SimpleHTTPRequestHandler.__init__(self, *args, **kwargs)
|
||||
|
||||
def do_GET(self):
|
||||
if self.path == '/420':
|
||||
self.send_response(420, 'Reached limit')
|
||||
self.send_header('Connection', 'close')
|
||||
self.send_header('Warcprox-Meta', json.dumps(WARCPROX_META_420))
|
||||
payload = b'request rejected by warcprox: reached limit test_limits_bucket/total/urls=0\n'
|
||||
self.send_header('Content-Type', 'text/plain;charset=utf-8')
|
||||
self.send_header('Content-Length', len(payload))
|
||||
if self.path == "/420":
|
||||
self.send_response(420, "Reached limit")
|
||||
self.send_header("Connection", "close")
|
||||
self.send_header("Warcprox-Meta", json.dumps(WARCPROX_META_420))
|
||||
payload = b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=0\n"
|
||||
self.send_header("Content-Type", "text/plain;charset=utf-8")
|
||||
self.send_header("Content-Length", len(payload))
|
||||
self.end_headers()
|
||||
self.wfile.write(payload)
|
||||
elif self.path == '/401':
|
||||
elif self.path == "/401":
|
||||
self.send_response(401)
|
||||
self.send_header('WWW-Authenticate', 'Basic realm=\"Test\"')
|
||||
self.send_header('Content-type', 'text/html')
|
||||
self.send_header("WWW-Authenticate", 'Basic realm="Test"')
|
||||
self.send_header("Content-type", "text/html")
|
||||
self.end_headers()
|
||||
self.wfile.write(self.headers.get('Authorization', b''))
|
||||
self.wfile.write(b'not authenticated')
|
||||
self.wfile.write(self.headers.get("Authorization", b""))
|
||||
self.wfile.write(b"not authenticated")
|
||||
else:
|
||||
super().do_GET()
|
||||
|
||||
def do_POST(self):
|
||||
if self.path == '/login-action':
|
||||
if self.path == "/login-action":
|
||||
self.send_response(200)
|
||||
payload = b'login successful\n'
|
||||
self.send_header('Content-Type', 'text/plain;charset=utf-8')
|
||||
self.send_header('Content-Length', len(payload))
|
||||
payload = b"login successful\n"
|
||||
self.send_header("Content-Type", "text/plain;charset=utf-8")
|
||||
self.send_header("Content-Length", len(payload))
|
||||
self.end_headers()
|
||||
self.wfile.write(payload)
|
||||
else:
|
||||
super().do_POST()
|
||||
|
||||
|
||||
# SimpleHTTPRequestHandler always uses CWD so we have to chdir
|
||||
os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
|
||||
os.chdir(os.path.join(os.path.dirname(__file__), "htdocs"))
|
||||
|
||||
httpd = http.server.HTTPServer(('localhost', 0), RequestHandler)
|
||||
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
|
||||
httpd = http.server.HTTPServer(("localhost", 0), RequestHandler)
|
||||
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
|
||||
httpd_thread.start()
|
||||
|
||||
def fin():
|
||||
httpd.shutdown()
|
||||
httpd.server_close()
|
||||
httpd_thread.join()
|
||||
|
||||
request.addfinalizer(fin)
|
||||
|
||||
return httpd
|
||||
|
||||
|
||||
def test_httpd(httpd):
|
||||
'''
|
||||
"""
|
||||
Tests that our http server is working as expected, and that two fetches
|
||||
of the same url return the same payload, proving it can be used to test
|
||||
deduplication.
|
||||
'''
|
||||
"""
|
||||
payload1 = content2 = None
|
||||
url = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
|
||||
url = "http://localhost:%s/site1/file1.txt" % httpd.server_port
|
||||
with urllib.request.urlopen(url) as response:
|
||||
assert response.status == 200
|
||||
payload1 = response.read()
|
||||
@@ -119,123 +121,136 @@ def test_httpd(httpd):
|
||||
|
||||
assert payload1 == payload2
|
||||
|
||||
url = 'http://localhost:%s/420' % httpd.server_port
|
||||
url = "http://localhost:%s/420" % httpd.server_port
|
||||
with pytest.raises(urllib.error.HTTPError) as excinfo:
|
||||
urllib.request.urlopen(url)
|
||||
assert excinfo.value.getcode() == 420
|
||||
|
||||
|
||||
def test_aw_snap_hes_dead_jim():
|
||||
chrome_exe = brozzler.suggest_default_chrome_exe()
|
||||
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
||||
with pytest.raises(brozzler.BrowsingException):
|
||||
browser.browse_page('chrome://crash')
|
||||
browser.browse_page("chrome://crash")
|
||||
|
||||
|
||||
# chromium's 401 handling changed???
|
||||
@pytest.mark.xfail
|
||||
def test_page_interstitial_exception(httpd):
|
||||
chrome_exe = brozzler.suggest_default_chrome_exe()
|
||||
url = 'http://localhost:%s/401' % httpd.server_port
|
||||
url = "http://localhost:%s/401" % httpd.server_port
|
||||
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
||||
with pytest.raises(brozzler.PageInterstitialShown):
|
||||
browser.browse_page(url)
|
||||
|
||||
|
||||
def test_on_response(httpd):
|
||||
response_urls = []
|
||||
|
||||
def on_response(msg):
|
||||
response_urls.append(msg['params']['response']['url'])
|
||||
response_urls.append(msg["params"]["response"]["url"])
|
||||
|
||||
chrome_exe = brozzler.suggest_default_chrome_exe()
|
||||
url = 'http://localhost:%s/site3/page.html' % httpd.server_port
|
||||
url = "http://localhost:%s/site3/page.html" % httpd.server_port
|
||||
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
||||
browser.browse_page(url, on_response=on_response)
|
||||
assert response_urls[0] == 'http://localhost:%s/site3/page.html' % httpd.server_port
|
||||
assert response_urls[1] == 'http://localhost:%s/site3/brozzler.svg' % httpd.server_port
|
||||
assert response_urls[2] == 'http://localhost:%s/favicon.ico' % httpd.server_port
|
||||
assert response_urls[0] == "http://localhost:%s/site3/page.html" % httpd.server_port
|
||||
assert (
|
||||
response_urls[1] == "http://localhost:%s/site3/brozzler.svg" % httpd.server_port
|
||||
)
|
||||
assert response_urls[2] == "http://localhost:%s/favicon.ico" % httpd.server_port
|
||||
|
||||
|
||||
def test_420(httpd):
|
||||
chrome_exe = brozzler.suggest_default_chrome_exe()
|
||||
url = 'http://localhost:%s/420' % httpd.server_port
|
||||
url = "http://localhost:%s/420" % httpd.server_port
|
||||
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
||||
with pytest.raises(brozzler.ReachedLimit) as excinfo:
|
||||
browser.browse_page(url)
|
||||
assert excinfo.value.warcprox_meta == WARCPROX_META_420
|
||||
|
||||
|
||||
def test_js_dialogs(httpd):
|
||||
chrome_exe = brozzler.suggest_default_chrome_exe()
|
||||
url = 'http://localhost:%s/site4/alert.html' % httpd.server_port
|
||||
url = "http://localhost:%s/site4/alert.html" % httpd.server_port
|
||||
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
||||
# before commit d2ed6b97a24 these would hang and eventually raise
|
||||
# brozzler.browser.BrowsingTimeout, which would cause this test to fail
|
||||
browser.browse_page("http://localhost:%s/site4/alert.html" % httpd.server_port)
|
||||
browser.browse_page(
|
||||
'http://localhost:%s/site4/alert.html' % httpd.server_port)
|
||||
browser.browse_page(
|
||||
'http://localhost:%s/site4/confirm.html' % httpd.server_port)
|
||||
browser.browse_page(
|
||||
'http://localhost:%s/site4/prompt.html' % httpd.server_port)
|
||||
"http://localhost:%s/site4/confirm.html" % httpd.server_port
|
||||
)
|
||||
browser.browse_page("http://localhost:%s/site4/prompt.html" % httpd.server_port)
|
||||
# XXX print dialog unresolved
|
||||
# browser.browse_page(
|
||||
# 'http://localhost:%s/site4/print.html' % httpd.server_port)
|
||||
|
||||
|
||||
def test_page_videos(httpd):
|
||||
# test depends on behavior of youtube-dl and chromium, could fail and need
|
||||
# to be adjusted on youtube-dl or chromium updates
|
||||
chrome_exe = brozzler.suggest_default_chrome_exe()
|
||||
worker = brozzler.BrozzlerWorker(None)
|
||||
site = brozzler.Site(None, {})
|
||||
page = brozzler.Page(None, {
|
||||
'url':'http://localhost:%s/site6/' % httpd.server_port})
|
||||
page = brozzler.Page(
|
||||
None, {"url": "http://localhost:%s/site6/" % httpd.server_port}
|
||||
)
|
||||
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
||||
worker.brozzle_page(browser, site, page)
|
||||
assert page.videos
|
||||
assert len(page.videos) == 4
|
||||
assert page.videos[0] == {
|
||||
'blame': 'youtube-dl',
|
||||
'response_code': 200,
|
||||
'content-length': 383631,
|
||||
'content-type': 'video/mp4',
|
||||
'url': 'http://localhost:%s/site6/small.mp4' % httpd.server_port,
|
||||
"blame": "youtube-dl",
|
||||
"response_code": 200,
|
||||
"content-length": 383631,
|
||||
"content-type": "video/mp4",
|
||||
"url": "http://localhost:%s/site6/small.mp4" % httpd.server_port,
|
||||
}
|
||||
assert page.videos[1] == {
|
||||
'blame': 'youtube-dl',
|
||||
'content-length': 92728,
|
||||
'content-type': 'video/webm',
|
||||
'response_code': 200,
|
||||
'url': 'http://localhost:%s/site6/small-video_280x160_100k.webm' % httpd.server_port
|
||||
"blame": "youtube-dl",
|
||||
"content-length": 92728,
|
||||
"content-type": "video/webm",
|
||||
"response_code": 200,
|
||||
"url": "http://localhost:%s/site6/small-video_280x160_100k.webm"
|
||||
% httpd.server_port,
|
||||
}
|
||||
assert page.videos[2] == {
|
||||
'blame': 'youtube-dl',
|
||||
'content-length': 101114,
|
||||
'content-type': 'video/webm',
|
||||
'response_code': 200,
|
||||
'url': 'http://localhost:%s/site6/small-audio.webm' % httpd.server_port
|
||||
"blame": "youtube-dl",
|
||||
"content-length": 101114,
|
||||
"content-type": "video/webm",
|
||||
"response_code": 200,
|
||||
"url": "http://localhost:%s/site6/small-audio.webm" % httpd.server_port,
|
||||
}
|
||||
assert page.videos[3] == {
|
||||
'blame': 'browser',
|
||||
"blame": "browser",
|
||||
# 'response_code': 206,
|
||||
# 'content-range': 'bytes 0-229454/229455',
|
||||
'response_code': 200,
|
||||
'content-length': 229455,
|
||||
'content-type': 'video/webm',
|
||||
'url': 'http://localhost:%s/site6/small.webm' % httpd.server_port,
|
||||
"response_code": 200,
|
||||
"content-length": 229455,
|
||||
"content-type": "video/webm",
|
||||
"url": "http://localhost:%s/site6/small.webm" % httpd.server_port,
|
||||
}
|
||||
|
||||
|
||||
def test_extract_outlinks(httpd):
|
||||
chrome_exe = brozzler.suggest_default_chrome_exe()
|
||||
worker = brozzler.BrozzlerWorker(None)
|
||||
site = brozzler.Site(None, {})
|
||||
page = brozzler.Page(None, {
|
||||
'url':'http://localhost:%s/site8/' % httpd.server_port})
|
||||
page = brozzler.Page(
|
||||
None, {"url": "http://localhost:%s/site8/" % httpd.server_port}
|
||||
)
|
||||
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
||||
outlinks = worker.brozzle_page(browser, site, page)
|
||||
assert outlinks == {
|
||||
'http://example.com/offsite',
|
||||
'http://localhost:%s/site8/baz/zuh' % httpd.server_port,
|
||||
'http://localhost:%s/site8/fdjisapofdjisap#1' % httpd.server_port,
|
||||
'http://localhost:%s/site8/fdjisapofdjisap#2' % httpd.server_port
|
||||
"http://example.com/offsite",
|
||||
"http://localhost:%s/site8/baz/zuh" % httpd.server_port,
|
||||
"http://localhost:%s/site8/fdjisapofdjisap#1" % httpd.server_port,
|
||||
"http://localhost:%s/site8/fdjisapofdjisap#2" % httpd.server_port,
|
||||
}
|
||||
|
||||
|
||||
def test_proxy_down():
|
||||
'''
|
||||
"""
|
||||
Test that browsing raises `brozzler.ProxyError` when proxy is down.
|
||||
|
||||
See also `test_proxy_down` in test_units.py.
|
||||
@@ -243,40 +258,41 @@ def test_proxy_down():
|
||||
Tests two different kinds of connection error:
|
||||
- nothing listening the port (nobody listens on on port 4 :))
|
||||
- port bound but not accepting connections
|
||||
'''
|
||||
"""
|
||||
sock = socket.socket()
|
||||
sock.bind(('127.0.0.1', 0))
|
||||
for not_listening_proxy in (
|
||||
'127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]):
|
||||
site = brozzler.Site(None, {'seed':'http://example.com/'})
|
||||
page = brozzler.Page(None, {'url': 'http://example.com/'})
|
||||
sock.bind(("127.0.0.1", 0))
|
||||
for not_listening_proxy in ("127.0.0.1:4", "127.0.0.1:%s" % sock.getsockname()[1]):
|
||||
site = brozzler.Site(None, {"seed": "http://example.com/"})
|
||||
page = brozzler.Page(None, {"url": "http://example.com/"})
|
||||
|
||||
worker = brozzler.BrozzlerWorker(
|
||||
frontier=None, proxy=not_listening_proxy)
|
||||
worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
|
||||
chrome_exe = brozzler.suggest_default_chrome_exe()
|
||||
|
||||
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
||||
with pytest.raises(brozzler.ProxyError):
|
||||
worker.brozzle_page(browser, site, page)
|
||||
|
||||
|
||||
def test_try_login(httpd):
|
||||
"""Test try_login behavior.
|
||||
"""
|
||||
"""Test try_login behavior."""
|
||||
response_urls = []
|
||||
|
||||
def on_response(msg):
|
||||
response_urls.append(msg['params']['response']['url'])
|
||||
response_urls.append(msg["params"]["response"]["url"])
|
||||
|
||||
chrome_exe = brozzler.suggest_default_chrome_exe()
|
||||
form_url = 'http://localhost:%s/site11/form1.html' % httpd.server_port
|
||||
form_url_other = 'http://localhost:%s/site11/form2.html' % httpd.server_port
|
||||
favicon_url = 'http://localhost:%s/favicon.ico' % httpd.server_port
|
||||
login_url = 'http://localhost:%s/login-action' % httpd.server_port
|
||||
form_url = "http://localhost:%s/site11/form1.html" % httpd.server_port
|
||||
form_url_other = "http://localhost:%s/site11/form2.html" % httpd.server_port
|
||||
favicon_url = "http://localhost:%s/favicon.ico" % httpd.server_port
|
||||
login_url = "http://localhost:%s/login-action" % httpd.server_port
|
||||
# When username and password are defined and initial page has login form,
|
||||
# detect login form, submit login, and then return to the initial page.
|
||||
username = 'user1'
|
||||
password = 'pass1'
|
||||
username = "user1"
|
||||
password = "pass1"
|
||||
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
||||
browser.browse_page(form_url, username=username, password=password,
|
||||
on_response=on_response)
|
||||
browser.browse_page(
|
||||
form_url, username=username, password=password, on_response=on_response
|
||||
)
|
||||
assert len(response_urls) == 4
|
||||
assert response_urls[0] == form_url
|
||||
assert response_urls[1] == favicon_url
|
||||
@@ -285,11 +301,15 @@ def test_try_login(httpd):
|
||||
|
||||
# We are now supporting a different type of form, we'll test that here.
|
||||
response_urls = []
|
||||
username = 'user1'
|
||||
password = 'pass1'
|
||||
username = "user1"
|
||||
password = "pass1"
|
||||
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
||||
browser.browse_page(form_url_other, username=username, password=password,
|
||||
on_response=on_response)
|
||||
browser.browse_page(
|
||||
form_url_other,
|
||||
username=username,
|
||||
password=password,
|
||||
on_response=on_response,
|
||||
)
|
||||
assert len(response_urls) == 4
|
||||
assert response_urls[0] == form_url_other
|
||||
assert response_urls[1] == favicon_url
|
||||
@@ -306,10 +326,16 @@ def test_try_login(httpd):
|
||||
|
||||
# when the page doesn't have a form with username/password, don't submit it
|
||||
response_urls = []
|
||||
form_without_login_url = 'http://localhost:%s/site11/form-no-login.html' % httpd.server_port
|
||||
form_without_login_url = (
|
||||
"http://localhost:%s/site11/form-no-login.html" % httpd.server_port
|
||||
)
|
||||
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
||||
browser.browse_page(form_without_login_url, username=username,
|
||||
password=password, on_response=on_response)
|
||||
browser.browse_page(
|
||||
form_without_login_url,
|
||||
username=username,
|
||||
password=password,
|
||||
on_response=on_response,
|
||||
)
|
||||
assert len(response_urls) == 2
|
||||
assert response_urls[0] == form_without_login_url
|
||||
assert response_urls[1] == favicon_url
|
||||
|
@@ -1,5 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
'''
|
||||
"""
|
||||
test_cli.py - test brozzler commands
|
||||
|
||||
Copyright (C) 2017 Internet Archive
|
||||
@@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
'''
|
||||
"""
|
||||
|
||||
import brozzler.cli
|
||||
import pkg_resources
|
||||
@@ -23,59 +23,62 @@ import pytest
|
||||
import subprocess
|
||||
import doublethink
|
||||
|
||||
|
||||
def cli_commands():
|
||||
commands = set(pkg_resources.get_entry_map(
|
||||
'brozzler')['console_scripts'].keys())
|
||||
commands.remove('brozzler-wayback')
|
||||
commands = set(pkg_resources.get_entry_map("brozzler")["console_scripts"].keys())
|
||||
commands.remove("brozzler-wayback")
|
||||
try:
|
||||
import gunicorn
|
||||
except ImportError:
|
||||
commands.remove('brozzler-dashboard')
|
||||
commands.remove("brozzler-dashboard")
|
||||
try:
|
||||
import pywb
|
||||
except ImportError:
|
||||
commands.remove('brozzler-easy')
|
||||
commands.remove("brozzler-easy")
|
||||
return commands
|
||||
|
||||
@pytest.mark.parametrize('cmd', cli_commands())
|
||||
|
||||
@pytest.mark.parametrize("cmd", cli_commands())
|
||||
def test_call_entrypoint(capsys, cmd):
|
||||
entrypoint = pkg_resources.get_entry_map(
|
||||
'brozzler')['console_scripts'][cmd]
|
||||
entrypoint = pkg_resources.get_entry_map("brozzler")["console_scripts"][cmd]
|
||||
callable = entrypoint.resolve()
|
||||
with pytest.raises(SystemExit):
|
||||
callable(['/whatever/bin/%s' % cmd, '--version'])
|
||||
callable(["/whatever/bin/%s" % cmd, "--version"])
|
||||
out, err = capsys.readouterr()
|
||||
assert out == 'brozzler %s - %s\n' % (brozzler.__version__, cmd)
|
||||
assert err == ''
|
||||
assert out == "brozzler %s - %s\n" % (brozzler.__version__, cmd)
|
||||
assert err == ""
|
||||
|
||||
@pytest.mark.parametrize('cmd', cli_commands())
|
||||
|
||||
@pytest.mark.parametrize("cmd", cli_commands())
|
||||
def test_run_command(capsys, cmd):
|
||||
proc = subprocess.Popen(
|
||||
[cmd, '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
[cmd, "--version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
||||
)
|
||||
out, err = proc.communicate()
|
||||
assert err == b''
|
||||
assert out == ('brozzler %s - %s\n' % (
|
||||
brozzler.__version__, cmd)).encode('ascii')
|
||||
assert err == b""
|
||||
assert out == ("brozzler %s - %s\n" % (brozzler.__version__, cmd)).encode("ascii")
|
||||
|
||||
|
||||
def test_rethinkdb_up():
|
||||
'''Check that rethinkdb is up and running.'''
|
||||
"""Check that rethinkdb is up and running."""
|
||||
# check that rethinkdb is listening and looks sane
|
||||
rr = doublethink.Rethinker(db='rethinkdb') # built-in db
|
||||
rr = doublethink.Rethinker(db="rethinkdb") # built-in db
|
||||
tbls = rr.table_list().run()
|
||||
assert len(tbls) > 10
|
||||
|
||||
|
||||
# XXX don't know why this test is failing in travis-ci and vagrant while
|
||||
# test_call_entrypoint tests pass :( (also fails with capfd)
|
||||
@pytest.mark.xfail
|
||||
def test_stop_nonexistent_crawl(capsys):
|
||||
with pytest.raises(SystemExit):
|
||||
brozzler.cli.brozzler_stop_crawl(['brozzler-stop-crawl', '--site=123'])
|
||||
brozzler.cli.brozzler_stop_crawl(["brozzler-stop-crawl", "--site=123"])
|
||||
out, err = capsys.readouterr()
|
||||
assert err.endswith('site not found with id=123\n')
|
||||
assert out == ''
|
||||
assert err.endswith("site not found with id=123\n")
|
||||
assert out == ""
|
||||
|
||||
with pytest.raises(SystemExit):
|
||||
brozzler.cli.brozzler_stop_crawl(['brozzler-stop-crawl', '--job=abc'])
|
||||
brozzler.cli.brozzler_stop_crawl(["brozzler-stop-crawl", "--job=abc"])
|
||||
out, err = capsys.readouterr()
|
||||
assert err.endswith('''job not found with id='abc'\n''')
|
||||
assert out == ''
|
||||
assert err.endswith("""job not found with id='abc'\n""")
|
||||
assert out == ""
|
||||
|
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -1,5 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
'''
|
||||
"""
|
||||
test_units.py - some unit tests for parts of brozzler amenable to that
|
||||
|
||||
Copyright (C) 2016-2017 Internet Archive
|
||||
@@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
'''
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import http.server
|
||||
@@ -37,99 +37,131 @@ import threading
|
||||
from unittest import mock
|
||||
|
||||
logging.basicConfig(
|
||||
stream=sys.stderr, level=logging.INFO, format=(
|
||||
'%(asctime)s %(process)d %(levelname)s %(threadName)s '
|
||||
'%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s'))
|
||||
stream=sys.stderr,
|
||||
level=logging.INFO,
|
||||
format=(
|
||||
"%(asctime)s %(process)d %(levelname)s %(threadName)s "
|
||||
"%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s"
|
||||
),
|
||||
)
|
||||
|
||||
@pytest.fixture(scope='module')
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def httpd(request):
|
||||
# SimpleHTTPRequestHandler always uses CWD so we have to chdir
|
||||
os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
|
||||
os.chdir(os.path.join(os.path.dirname(__file__), "htdocs"))
|
||||
|
||||
httpd = http.server.HTTPServer(
|
||||
('localhost', 0), http.server.SimpleHTTPRequestHandler)
|
||||
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
|
||||
("localhost", 0), http.server.SimpleHTTPRequestHandler
|
||||
)
|
||||
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
|
||||
httpd_thread.start()
|
||||
|
||||
def fin():
|
||||
httpd.shutdown()
|
||||
httpd.server_close()
|
||||
httpd_thread.join()
|
||||
|
||||
request.addfinalizer(fin)
|
||||
|
||||
return httpd
|
||||
|
||||
|
||||
def test_robots(httpd):
|
||||
'''
|
||||
"""
|
||||
Basic test of robots.txt user-agent substring matching.
|
||||
'''
|
||||
url = 'http://localhost:%s/' % httpd.server_port
|
||||
site = brozzler.Site(None, {'seed':url,'user_agent':'im/a/GoOdbot/yep'})
|
||||
"""
|
||||
url = "http://localhost:%s/" % httpd.server_port
|
||||
site = brozzler.Site(None, {"seed": url, "user_agent": "im/a/GoOdbot/yep"})
|
||||
assert brozzler.is_permitted_by_robots(site, url)
|
||||
|
||||
site = brozzler.Site(None, {'seed':url,'user_agent':'im/a bAdBOt/uh huh'})
|
||||
site = brozzler.Site(None, {"seed": url, "user_agent": "im/a bAdBOt/uh huh"})
|
||||
assert not brozzler.is_permitted_by_robots(site, url)
|
||||
|
||||
|
||||
def test_robots_http_statuses():
|
||||
for status in (
|
||||
200, 204, 400, 401, 402, 403, 404, 405,
|
||||
500, 501, 502, 503, 504, 505):
|
||||
200,
|
||||
204,
|
||||
400,
|
||||
401,
|
||||
402,
|
||||
403,
|
||||
404,
|
||||
405,
|
||||
500,
|
||||
501,
|
||||
502,
|
||||
503,
|
||||
504,
|
||||
505,
|
||||
):
|
||||
|
||||
class Handler(http.server.BaseHTTPRequestHandler):
|
||||
def do_GET(self):
|
||||
response = (('HTTP/1.1 %s Meaningless message\r\n'
|
||||
+ 'Content-length: 0\r\n'
|
||||
+ '\r\n') % status).encode('utf-8')
|
||||
response = (
|
||||
(
|
||||
"HTTP/1.1 %s Meaningless message\r\n"
|
||||
+ "Content-length: 0\r\n"
|
||||
+ "\r\n"
|
||||
)
|
||||
% status
|
||||
).encode("utf-8")
|
||||
self.connection.sendall(response)
|
||||
# self.send_response(status)
|
||||
# self.end_headers()
|
||||
httpd = http.server.HTTPServer(('localhost', 0), Handler)
|
||||
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
|
||||
|
||||
httpd = http.server.HTTPServer(("localhost", 0), Handler)
|
||||
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
|
||||
httpd_thread.start()
|
||||
|
||||
try:
|
||||
url = 'http://localhost:%s/' % httpd.server_port
|
||||
site = brozzler.Site(None, {'seed': url})
|
||||
url = "http://localhost:%s/" % httpd.server_port
|
||||
site = brozzler.Site(None, {"seed": url})
|
||||
assert brozzler.is_permitted_by_robots(site, url)
|
||||
finally:
|
||||
httpd.shutdown()
|
||||
httpd.server_close()
|
||||
httpd_thread.join()
|
||||
|
||||
|
||||
def test_robots_empty_response():
|
||||
class Handler(http.server.BaseHTTPRequestHandler):
|
||||
def do_GET(self):
|
||||
self.connection.shutdown(socket.SHUT_RDWR)
|
||||
self.connection.close()
|
||||
httpd = http.server.HTTPServer(('localhost', 0), Handler)
|
||||
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
|
||||
|
||||
httpd = http.server.HTTPServer(("localhost", 0), Handler)
|
||||
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
|
||||
httpd_thread.start()
|
||||
|
||||
try:
|
||||
url = 'http://localhost:%s/' % httpd.server_port
|
||||
site = brozzler.Site(None, {'seed': url})
|
||||
url = "http://localhost:%s/" % httpd.server_port
|
||||
site = brozzler.Site(None, {"seed": url})
|
||||
assert brozzler.is_permitted_by_robots(site, url)
|
||||
finally:
|
||||
httpd.shutdown()
|
||||
httpd.server_close()
|
||||
httpd_thread.join()
|
||||
|
||||
|
||||
def test_robots_socket_timeout():
|
||||
stop_hanging = threading.Event()
|
||||
|
||||
class Handler(http.server.BaseHTTPRequestHandler):
|
||||
def do_GET(self):
|
||||
stop_hanging.wait(60)
|
||||
self.connection.sendall(
|
||||
b'HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n')
|
||||
self.connection.sendall(b"HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n")
|
||||
|
||||
orig_timeout = brozzler.robots._SessionRaiseOn420.timeout
|
||||
|
||||
httpd = http.server.HTTPServer(('localhost', 0), Handler)
|
||||
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
|
||||
httpd = http.server.HTTPServer(("localhost", 0), Handler)
|
||||
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
|
||||
httpd_thread.start()
|
||||
|
||||
try:
|
||||
url = 'http://localhost:%s/' % httpd.server_port
|
||||
site = brozzler.Site(None, {'seed': url})
|
||||
url = "http://localhost:%s/" % httpd.server_port
|
||||
site = brozzler.Site(None, {"seed": url})
|
||||
brozzler.robots._SessionRaiseOn420.timeout = 2
|
||||
assert brozzler.is_permitted_by_robots(site, url)
|
||||
finally:
|
||||
@@ -139,20 +171,24 @@ def test_robots_socket_timeout():
|
||||
httpd.server_close()
|
||||
httpd_thread.join()
|
||||
|
||||
|
||||
def test_robots_dns_failure():
|
||||
# .invalid. is guaranteed nonexistent per rfc 6761
|
||||
url = 'http://whatever.invalid./'
|
||||
site = brozzler.Site(None, {'seed': url})
|
||||
url = "http://whatever.invalid./"
|
||||
site = brozzler.Site(None, {"seed": url})
|
||||
assert brozzler.is_permitted_by_robots(site, url)
|
||||
|
||||
|
||||
def test_robots_connection_failure():
|
||||
# .invalid. is guaranteed nonexistent per rfc 6761
|
||||
url = 'http://localhost:4/' # nobody listens on port 4
|
||||
site = brozzler.Site(None, {'seed': url})
|
||||
url = "http://localhost:4/" # nobody listens on port 4
|
||||
site = brozzler.Site(None, {"seed": url})
|
||||
assert brozzler.is_permitted_by_robots(site, url)
|
||||
|
||||
|
||||
def test_scoping():
|
||||
test_scope = yaml.safe_load('''
|
||||
test_scope = yaml.safe_load(
|
||||
"""
|
||||
max_hops: 100
|
||||
accepts:
|
||||
- url_match: REGEX_MATCH
|
||||
@@ -169,40 +205,73 @@ blocks:
|
||||
- domain: twitter.com
|
||||
url_match: REGEX_MATCH
|
||||
value: ^.*lang=(?!en).*$
|
||||
''')
|
||||
"""
|
||||
)
|
||||
|
||||
site = brozzler.Site(None, {
|
||||
'id': 1, 'seed': 'http://example.com/foo/bar?baz=quux#monkey',
|
||||
'scope': test_scope})
|
||||
page = brozzler.Page(None, {
|
||||
'url': 'http://example.com/foo/bar?baz=quux#monkey',
|
||||
'site_id': site.id})
|
||||
site = brozzler.Site(
|
||||
None,
|
||||
{
|
||||
"id": 1,
|
||||
"seed": "http://example.com/foo/bar?baz=quux#monkey",
|
||||
"scope": test_scope,
|
||||
},
|
||||
)
|
||||
page = brozzler.Page(
|
||||
None, {"url": "http://example.com/foo/bar?baz=quux#monkey", "site_id": site.id}
|
||||
)
|
||||
|
||||
assert site.accept_reject_or_neither('http://example.com/foo/bar', page) is True
|
||||
assert site.accept_reject_or_neither('http://example.com/foo/baz', page) is None
|
||||
assert site.accept_reject_or_neither("http://example.com/foo/bar", page) is True
|
||||
assert site.accept_reject_or_neither("http://example.com/foo/baz", page) is None
|
||||
|
||||
assert site.accept_reject_or_neither('http://foo.com/some.mp3', page) is None
|
||||
assert site.accept_reject_or_neither('http://foo.com/blah/audio_file/some.mp3', page) is True
|
||||
assert site.accept_reject_or_neither("http://foo.com/some.mp3", page) is None
|
||||
assert (
|
||||
site.accept_reject_or_neither("http://foo.com/blah/audio_file/some.mp3", page)
|
||||
is True
|
||||
)
|
||||
|
||||
assert site.accept_reject_or_neither('http://a.b.vimeocdn.com/blahblah', page) is True
|
||||
assert site.accept_reject_or_neither('https://a.b.vimeocdn.com/blahblah', page) is None
|
||||
assert (
|
||||
site.accept_reject_or_neither("http://a.b.vimeocdn.com/blahblah", page) is True
|
||||
)
|
||||
assert (
|
||||
site.accept_reject_or_neither("https://a.b.vimeocdn.com/blahblah", page) is None
|
||||
)
|
||||
|
||||
assert site.accept_reject_or_neither('https://twitter.com/twit', page) is True
|
||||
assert site.accept_reject_or_neither('https://twitter.com/twit?lang=en', page) is True
|
||||
assert site.accept_reject_or_neither('https://twitter.com/twit?lang=es', page) is False
|
||||
assert site.accept_reject_or_neither("https://twitter.com/twit", page) is True
|
||||
assert (
|
||||
site.accept_reject_or_neither("https://twitter.com/twit?lang=en", page) is True
|
||||
)
|
||||
assert (
|
||||
site.accept_reject_or_neither("https://twitter.com/twit?lang=es", page) is False
|
||||
)
|
||||
|
||||
assert site.accept_reject_or_neither('https://www.facebook.com/whatevz', page) is True
|
||||
assert (
|
||||
site.accept_reject_or_neither("https://www.facebook.com/whatevz", page) is True
|
||||
)
|
||||
|
||||
assert (
|
||||
site.accept_reject_or_neither(
|
||||
"https://www.youtube.com/watch?v=dUIn5OAPS5s", page
|
||||
)
|
||||
is None
|
||||
)
|
||||
yt_user_page = brozzler.Page(
|
||||
None,
|
||||
{
|
||||
"url": "https://www.youtube.com/user/SonoraSantaneraVEVO",
|
||||
"site_id": site.id,
|
||||
"hops_from_seed": 10,
|
||||
},
|
||||
)
|
||||
assert (
|
||||
site.accept_reject_or_neither(
|
||||
"https://www.youtube.com/watch?v=dUIn5OAPS5s", yt_user_page
|
||||
)
|
||||
is True
|
||||
)
|
||||
|
||||
assert site.accept_reject_or_neither(
|
||||
'https://www.youtube.com/watch?v=dUIn5OAPS5s', page) is None
|
||||
yt_user_page = brozzler.Page(None, {
|
||||
'url': 'https://www.youtube.com/user/SonoraSantaneraVEVO',
|
||||
'site_id': site.id, 'hops_from_seed': 10})
|
||||
assert site.accept_reject_or_neither(
|
||||
'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page) is True
|
||||
|
||||
def test_proxy_down():
|
||||
'''
|
||||
"""
|
||||
Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.
|
||||
|
||||
This test needs to cover every possible fetch through the proxy other than
|
||||
@@ -211,24 +280,24 @@ def test_proxy_down():
|
||||
Tests two different kinds of connection error:
|
||||
- nothing listening the port (nobody listens on on port 4 :))
|
||||
- port bound but not accepting connections
|
||||
'''
|
||||
"""
|
||||
sock = socket.socket()
|
||||
sock.bind(('127.0.0.1', 0))
|
||||
for not_listening_proxy in (
|
||||
'127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]):
|
||||
worker = brozzler.BrozzlerWorker(
|
||||
frontier=None, proxy=not_listening_proxy)
|
||||
site = brozzler.Site(None, {
|
||||
'id': str(uuid.uuid4()), 'seed': 'http://example.com/'})
|
||||
page = brozzler.Page(None, {'url': 'http://example.com/'})
|
||||
sock.bind(("127.0.0.1", 0))
|
||||
for not_listening_proxy in ("127.0.0.1:4", "127.0.0.1:%s" % sock.getsockname()[1]):
|
||||
worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
|
||||
site = brozzler.Site(
|
||||
None, {"id": str(uuid.uuid4()), "seed": "http://example.com/"}
|
||||
)
|
||||
page = brozzler.Page(None, {"url": "http://example.com/"})
|
||||
|
||||
# robots.txt fetch
|
||||
with pytest.raises(brozzler.ProxyError):
|
||||
brozzler.is_permitted_by_robots(
|
||||
site, 'http://example.com/', proxy=not_listening_proxy)
|
||||
site, "http://example.com/", proxy=not_listening_proxy
|
||||
)
|
||||
|
||||
# youtube-dl fetch
|
||||
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
|
||||
with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
|
||||
with pytest.raises(brozzler.ProxyError):
|
||||
brozzler.ydl.do_youtube_dl(worker, site, page)
|
||||
|
||||
@@ -239,47 +308,58 @@ def test_proxy_down():
|
||||
# WARCPROX_WRITE_RECORD
|
||||
with pytest.raises(brozzler.ProxyError):
|
||||
worker._warcprox_write_record(
|
||||
warcprox_address=not_listening_proxy,
|
||||
url='test://proxy_down/warcprox_write_record',
|
||||
warc_type='metadata',
|
||||
content_type='text/plain',
|
||||
payload=b'''payload doesn't matter here''')
|
||||
warcprox_address=not_listening_proxy,
|
||||
url="test://proxy_down/warcprox_write_record",
|
||||
warc_type="metadata",
|
||||
content_type="text/plain",
|
||||
payload=b"""payload doesn't matter here""",
|
||||
)
|
||||
|
||||
|
||||
def test_start_stop_backwards_compat():
|
||||
site = brozzler.Site(None, {'seed': 'http://example.com/'})
|
||||
site = brozzler.Site(None, {"seed": "http://example.com/"})
|
||||
assert len(site.starts_and_stops) == 1
|
||||
assert site.starts_and_stops[0]['start']
|
||||
assert site.starts_and_stops[0]['stop'] is None
|
||||
assert not 'start_time' in site
|
||||
assert site.starts_and_stops[0]["start"]
|
||||
assert site.starts_and_stops[0]["stop"] is None
|
||||
assert not "start_time" in site
|
||||
|
||||
site = brozzler.Site(None, {
|
||||
'seed': 'http://example.com/',
|
||||
'start_time': datetime.datetime(2017,1,1)})
|
||||
site = brozzler.Site(
|
||||
None,
|
||||
{"seed": "http://example.com/", "start_time": datetime.datetime(2017, 1, 1)},
|
||||
)
|
||||
assert len(site.starts_and_stops) == 1
|
||||
assert site.starts_and_stops[0]['start'] == datetime.datetime(2017, 1, 1)
|
||||
assert site.starts_and_stops[0]['stop'] is None
|
||||
assert not 'start_time' in site
|
||||
assert site.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
|
||||
assert site.starts_and_stops[0]["stop"] is None
|
||||
assert not "start_time" in site
|
||||
|
||||
job = brozzler.Job(None, {'seeds': [{'url':'https://example.com/'}]})
|
||||
assert job.starts_and_stops[0]['start']
|
||||
assert job.starts_and_stops[0]['stop'] is None
|
||||
assert not 'started' in job
|
||||
assert not 'finished' in job
|
||||
job = brozzler.Job(None, {"seeds": [{"url": "https://example.com/"}]})
|
||||
assert job.starts_and_stops[0]["start"]
|
||||
assert job.starts_and_stops[0]["stop"] is None
|
||||
assert not "started" in job
|
||||
assert not "finished" in job
|
||||
|
||||
job = brozzler.Job(
|
||||
None,
|
||||
{
|
||||
"seeds": [{"url": "https://example.com/"}],
|
||||
"started": datetime.datetime(2017, 1, 1),
|
||||
"finished": datetime.datetime(2017, 1, 2),
|
||||
},
|
||||
)
|
||||
assert job.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
|
||||
assert job.starts_and_stops[0]["stop"] == datetime.datetime(2017, 1, 2)
|
||||
assert not "started" in job
|
||||
assert not "finished" in job
|
||||
|
||||
job = brozzler.Job(None, {
|
||||
'seeds': [{'url':'https://example.com/'}],
|
||||
'started': datetime.datetime(2017, 1, 1),
|
||||
'finished': datetime.datetime(2017, 1, 2)})
|
||||
assert job.starts_and_stops[0]['start'] == datetime.datetime(2017, 1, 1)
|
||||
assert job.starts_and_stops[0]['stop'] == datetime.datetime(2017, 1, 2)
|
||||
assert not 'started' in job
|
||||
assert not 'finished' in job
|
||||
|
||||
class Exception1(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class Exception2(Exception):
|
||||
pass
|
||||
|
||||
|
||||
def test_thread_raise_not_accept():
|
||||
def never_accept():
|
||||
try:
|
||||
@@ -297,6 +377,7 @@ def test_thread_raise_not_accept():
|
||||
th.join()
|
||||
assert thread_caught_exception is None
|
||||
|
||||
|
||||
def test_thread_raise_immediate():
|
||||
def accept_immediately():
|
||||
try:
|
||||
@@ -317,13 +398,17 @@ def test_thread_raise_immediate():
|
||||
assert isinstance(thread_caught_exception, Exception1)
|
||||
assert time.time() - start < 1.0
|
||||
|
||||
|
||||
def test_thread_raise_safe_exit():
|
||||
def delay_context_exit():
|
||||
gate = brozzler.thread_accept_exceptions()
|
||||
orig_exit = type(gate).__exit__
|
||||
try:
|
||||
type(gate).__exit__ = lambda self, et, ev, t: (
|
||||
brozzler.sleep(2), orig_exit(self, et, ev, t), False)[-1]
|
||||
brozzler.sleep(2),
|
||||
orig_exit(self, et, ev, t),
|
||||
False,
|
||||
)[-1]
|
||||
with brozzler.thread_accept_exceptions() as gate:
|
||||
brozzler.sleep(2)
|
||||
except Exception as e:
|
||||
@@ -345,6 +430,7 @@ def test_thread_raise_safe_exit():
|
||||
assert thread_caught_exception
|
||||
assert isinstance(thread_caught_exception, Exception1)
|
||||
|
||||
|
||||
def test_thread_raise_pending_exception():
|
||||
def accept_eventually():
|
||||
try:
|
||||
@@ -365,16 +451,17 @@ def test_thread_raise_pending_exception():
|
||||
assert isinstance(thread_caught_exception, Exception1)
|
||||
assert time.time() - start > 1.0
|
||||
|
||||
|
||||
def test_thread_raise_second_with_block():
|
||||
def two_with_blocks():
|
||||
try:
|
||||
with brozzler.thread_accept_exceptions():
|
||||
time.sleep(2)
|
||||
return # test fails
|
||||
return # test fails
|
||||
except Exception1 as e:
|
||||
pass
|
||||
except:
|
||||
return # fail test
|
||||
return # fail test
|
||||
|
||||
try:
|
||||
with brozzler.thread_accept_exceptions():
|
||||
@@ -393,52 +480,79 @@ def test_thread_raise_second_with_block():
|
||||
th.join()
|
||||
assert isinstance(thread_caught_exception, Exception2)
|
||||
|
||||
|
||||
def test_needs_browsing():
|
||||
# only one test case here right now, which exposed a bug
|
||||
|
||||
class ConvenientHeaders(http.client.HTTPMessage):
|
||||
def __init__(self, headers):
|
||||
http.client.HTTPMessage.__init__(self)
|
||||
for (k, v) in headers.items():
|
||||
for k, v in headers.items():
|
||||
self.add_header(k, v)
|
||||
|
||||
page = brozzler.Page(None, {
|
||||
'url':'http://example.com/a'})
|
||||
page = brozzler.Page(None, {"url": "http://example.com/a"})
|
||||
|
||||
spy = brozzler.ydl.YoutubeDLSpy()
|
||||
spy.fetches.append({
|
||||
'url': 'http://example.com/a',
|
||||
'method': 'HEAD',
|
||||
'response_code': 301,
|
||||
'response_headers': ConvenientHeaders({'Location': '/b'})})
|
||||
spy.fetches.append({
|
||||
'url': 'http://example.com/b',
|
||||
'method': 'GET',
|
||||
'response_code': 200,
|
||||
'response_headers': ConvenientHeaders({
|
||||
'Content-Type': 'application/pdf'})})
|
||||
spy.fetches.append(
|
||||
{
|
||||
"url": "http://example.com/a",
|
||||
"method": "HEAD",
|
||||
"response_code": 301,
|
||||
"response_headers": ConvenientHeaders({"Location": "/b"}),
|
||||
}
|
||||
)
|
||||
spy.fetches.append(
|
||||
{
|
||||
"url": "http://example.com/b",
|
||||
"method": "GET",
|
||||
"response_code": 200,
|
||||
"response_headers": ConvenientHeaders({"Content-Type": "application/pdf"}),
|
||||
}
|
||||
)
|
||||
|
||||
assert not brozzler.worker.BrozzlerWorker._needs_browsing(None, page, spy.fetches)
|
||||
|
||||
assert not brozzler.worker.BrozzlerWorker._needs_browsing(
|
||||
None, page, spy.fetches)
|
||||
|
||||
def test_seed_redirect():
|
||||
site = brozzler.Site(None, {'seed': 'http://foo.com/'})
|
||||
site.note_seed_redirect('https://foo.com/a/b/c')
|
||||
assert site.scope == {'accepts': [
|
||||
{'ssurt': 'com,foo,//http:/',},
|
||||
{'ssurt': 'com,foo,//https:/',}]}
|
||||
site = brozzler.Site(None, {"seed": "http://foo.com/"})
|
||||
site.note_seed_redirect("https://foo.com/a/b/c")
|
||||
assert site.scope == {
|
||||
"accepts": [
|
||||
{
|
||||
"ssurt": "com,foo,//http:/",
|
||||
},
|
||||
{
|
||||
"ssurt": "com,foo,//https:/",
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
site = brozzler.Site(None, {'seed': 'https://foo.com/'})
|
||||
site.note_seed_redirect('http://foo.com/a/b/c')
|
||||
assert site.scope == {'accepts': [
|
||||
{'ssurt': 'com,foo,//https:/',},
|
||||
{'ssurt': 'com,foo,//http:/',}]}
|
||||
site = brozzler.Site(None, {"seed": "https://foo.com/"})
|
||||
site.note_seed_redirect("http://foo.com/a/b/c")
|
||||
assert site.scope == {
|
||||
"accepts": [
|
||||
{
|
||||
"ssurt": "com,foo,//https:/",
|
||||
},
|
||||
{
|
||||
"ssurt": "com,foo,//http:/",
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
site = brozzler.Site(None, {"seed": "http://foo.com/"})
|
||||
site.note_seed_redirect("https://bar.com/a/b/c")
|
||||
assert site.scope == {
|
||||
"accepts": [
|
||||
{
|
||||
"ssurt": "com,foo,//http:/",
|
||||
},
|
||||
{
|
||||
"ssurt": "com,bar,//https:/a/b/c",
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
site = brozzler.Site(None, {'seed': 'http://foo.com/'})
|
||||
site.note_seed_redirect('https://bar.com/a/b/c')
|
||||
assert site.scope == {'accepts': [
|
||||
{'ssurt': 'com,foo,//http:/',},
|
||||
{'ssurt': 'com,bar,//https:/a/b/c',}]}
|
||||
|
||||
def test_limit_failures():
|
||||
page = mock.Mock()
|
||||
@@ -446,9 +560,9 @@ def test_limit_failures():
|
||||
page.brozzle_count = 0
|
||||
|
||||
site = mock.Mock()
|
||||
site.status = 'ACTIVE'
|
||||
site.status = "ACTIVE"
|
||||
site.active_brozzling_time = 0
|
||||
site.starts_and_stops = [{'start':datetime.datetime.utcnow()}]
|
||||
site.starts_and_stops = [{"start": datetime.datetime.utcnow()}]
|
||||
|
||||
rr = mock.Mock()
|
||||
rr.servers = [mock.Mock()]
|
||||
@@ -456,11 +570,12 @@ def test_limit_failures():
|
||||
rr.db_list = mock.Mock(return_value=rethink_query)
|
||||
rr.table_list = mock.Mock(return_value=rethink_query)
|
||||
rr.table = mock.Mock(
|
||||
return_value=mock.Mock(
|
||||
between=mock.Mock(
|
||||
return_value=mock.Mock(
|
||||
limit=mock.Mock(
|
||||
return_value=rethink_query)))))
|
||||
return_value=mock.Mock(
|
||||
between=mock.Mock(
|
||||
return_value=mock.Mock(limit=mock.Mock(return_value=rethink_query))
|
||||
)
|
||||
)
|
||||
)
|
||||
assert rr.table().between().limit().run() == []
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
frontier.enforce_time_limit = mock.Mock()
|
||||
@@ -475,20 +590,19 @@ def test_limit_failures():
|
||||
|
||||
assert page.failed_attempts is None
|
||||
assert page.brozzle_count == 0
|
||||
assert site.status == 'ACTIVE'
|
||||
assert site.status == "ACTIVE"
|
||||
|
||||
worker.brozzle_site(browser, site)
|
||||
assert page.failed_attempts == 1
|
||||
assert page.brozzle_count == 0
|
||||
assert site.status == 'ACTIVE'
|
||||
assert site.status == "ACTIVE"
|
||||
|
||||
worker.brozzle_site(browser, site)
|
||||
assert page.failed_attempts == 2
|
||||
assert page.brozzle_count == 0
|
||||
assert site.status == 'ACTIVE'
|
||||
assert site.status == "ACTIVE"
|
||||
|
||||
worker.brozzle_site(browser, site)
|
||||
assert page.failed_attempts == 3
|
||||
assert page.brozzle_count == 1
|
||||
assert site.status == 'FINISHED'
|
||||
|
||||
assert site.status == "FINISHED"
|
||||
|
@@ -1,5 +1,5 @@
#!/usr/bin/env python
'''
"""
vagrant-brozzler-new-job.py - runs brozzler-new-job inside the vagrant vm to
queue a job for your vagrant brozzler deployment.

@@ -20,30 +20,39 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""

import sys
import os
import argparse
import subprocess


def main(argv=[]):
arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0]))
arg_parser.add_argument(
'job_conf_file', metavar='JOB_CONF_FILE',
help='brozzler job configuration file in yaml')
"job_conf_file",
metavar="JOB_CONF_FILE",
help="brozzler job configuration file in yaml",
)
args = arg_parser.parse_args(args=argv[1:])

# cd to path with Vagrantfile so "vagrant ssh" knows what to do
os.chdir(os.path.dirname(__file__))

with open(args.job_conf_file, 'rb') as f:
subprocess.call([
'vagrant', 'ssh', '--',
'f=`mktemp` && cat > $f && '
'/home/vagrant/brozzler-ve3/bin/python '
'/home/vagrant/brozzler-ve3/bin/brozzler-new-job $f'],
stdin=f)
with open(args.job_conf_file, "rb") as f:
subprocess.call(
[
"vagrant",
"ssh",
"--",
"f=`mktemp` && cat > $f && "
"/home/vagrant/brozzler-ve3/bin/python "
"/home/vagrant/brozzler-ve3/bin/brozzler-new-job $f",
],
stdin=f,
)

if __name__ == '__main__':

if __name__ == "__main__":
main(sys.argv)
||||
|
@@ -1,5 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
'''
|
||||
"""
|
||||
vagrant-brozzler-new-site.py - runs brozzler-new-site inside the vagrant vm to
|
||||
queue a site for your vagrant brozzler deployment.
|
||||
|
||||
@@ -23,61 +23,69 @@ distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
'''
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import argparse
|
||||
import subprocess
|
||||
|
||||
try:
|
||||
from shlex import quote
|
||||
except:
|
||||
from pipes import quote
|
||||
|
||||
|
||||
def main(argv=[]):
|
||||
arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0]))
|
||||
arg_parser.add_argument('seed', metavar='SEED', help='seed url')
|
||||
arg_parser.add_argument("seed", metavar="SEED", help="seed url")
|
||||
arg_parser.add_argument(
|
||||
'--time-limit', dest='time_limit', default=None,
|
||||
help='time limit in seconds for this site')
|
||||
"--time-limit",
|
||||
dest="time_limit",
|
||||
default=None,
|
||||
help="time limit in seconds for this site",
|
||||
)
|
||||
arg_parser.add_argument(
|
||||
'--ignore-robots', dest='ignore_robots', action='store_true',
|
||||
help='ignore robots.txt for this site')
|
||||
"--ignore-robots",
|
||||
dest="ignore_robots",
|
||||
action="store_true",
|
||||
help="ignore robots.txt for this site",
|
||||
)
|
||||
arg_parser.add_argument(
|
||||
'--warcprox-meta', dest='warcprox_meta',
|
||||
help=(
|
||||
'Warcprox-Meta http request header to send with each request; '
|
||||
'must be a json blob, ignored unless warcprox features are '
|
||||
'enabled'))
|
||||
arg_parser.add_argument(
|
||||
'-q', '--quiet', dest='quiet', action='store_true')
|
||||
arg_parser.add_argument(
|
||||
'-v', '--verbose', dest='verbose', action='store_true')
|
||||
"--warcprox-meta",
|
||||
dest="warcprox_meta",
|
||||
help=(
|
||||
"Warcprox-Meta http request header to send with each request; "
|
||||
"must be a json blob, ignored unless warcprox features are "
|
||||
"enabled"
|
||||
),
|
||||
)
|
||||
arg_parser.add_argument("-q", "--quiet", dest="quiet", action="store_true")
|
||||
arg_parser.add_argument("-v", "--verbose", dest="verbose", action="store_true")
|
||||
|
||||
args = arg_parser.parse_args(args=argv[1:])
|
||||
|
||||
options = []
|
||||
if args.time_limit:
|
||||
options.append('--time-limit=%s' % args.time_limit)
|
||||
options.append("--time-limit=%s" % args.time_limit)
|
||||
if args.ignore_robots:
|
||||
options.append('--ignore-robots')
|
||||
options.append("--ignore-robots")
|
||||
if args.warcprox_meta:
|
||||
# I think this shell escaping is correct?
|
||||
options.append(
|
||||
'--warcprox-meta=%s' % quote(args.warcprox_meta))
|
||||
options.append("--warcprox-meta=%s" % quote(args.warcprox_meta))
|
||||
if args.quiet:
|
||||
options.append('--quiet')
|
||||
options.append("--quiet")
|
||||
if args.verbose:
|
||||
options.append('--verbose')
|
||||
options.append("--verbose")
|
||||
|
||||
# cd to path with Vagrantfile so "vagrant ssh" knows what to do
|
||||
os.chdir(os.path.dirname(__file__))
|
||||
|
||||
cmd = (
|
||||
'/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site '
|
||||
'%s %s') % (' '.join(options), args.seed)
|
||||
subprocess.call(['vagrant', 'ssh', '--', cmd])
|
||||
"/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site " "%s %s"
|
||||
) % (" ".join(options), args.seed)
|
||||
subprocess.call(["vagrant", "ssh", "--", cmd])
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
if __name__ == "__main__":
|
||||
main(sys.argv)