mirror of https://github.com/internetarchive/brozzler.git
synced 2025-07-21 06:00:45 -04:00

commit 8b23430a87 (parent c4620c3018)
Use black, enforce with GitHub Actions

23 changed files with 4048 additions and 2797 deletions
.github/workflows/python-formatting.yml (new file, 31 additions)
@@ -0,0 +1,31 @@
+name: Python Formatting Check
+
+on:
+  push:
+    branches:
+      - main
+      - master
+  pull_request:
+    branches:
+      - main
+      - master
+
+jobs:
+  formatting:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python 3.8
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.8'
+      - name: Create virtual environment
+        run: python -m venv venv
+
+      - name: Install black
+        run: |
+          ./venv/bin/pip install --upgrade pip
+          ./venv/bin/pip install black
+
+      - name: Run formatting check
+        run: make ck-format
.gitignore (2 additions)
@@ -2,3 +2,5 @@
 *.diff
 .*.sw*
 /brozzler.egg-info/
+venv
+.idea
Makefile (new file, 7 additions)
@@ -0,0 +1,7 @@
+.PHONY: format
+format:
+	venv/bin/black -t py35 -t py36 -t py37 -t py38 -t py39 -t py310 -t py311 -t py312 .
+
+.PHONY: ck-format
+ck-format:
+	venv/bin/black --check .
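Note: the workflow and the Makefile above both shell out to the black executable. For reference, the same check can be reproduced from Python with black's own API — a minimal sketch, not part of this commit; black.format_file_contents() raises black.NothingChanged when the input is already formatted:

import black

src = "x = {'a': 1}\n"
try:
    # fast=False also runs black's AST equivalence safety check
    formatted = black.format_file_contents(src, fast=False, mode=black.Mode())
    print("black would reformat this to:")
    print(formatted)
except black.NothingChanged:
    print("already black-formatted")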
brozzler/__init__.py

@@ -19,33 +19,41 @@ limitations under the License.
 
 import logging
 from pkg_resources import get_distribution as _get_distribution
-__version__ = _get_distribution('brozzler').version
+
+__version__ = _get_distribution("brozzler").version
+
 
 class ShutdownRequested(Exception):
     pass
 
+
 class NothingToClaim(Exception):
     pass
 
+
 class CrawlStopped(Exception):
     pass
 
+
 class PageInterstitialShown(Exception):
     pass
 
+
 class ProxyError(Exception):
     pass
 
+
 class ReachedTimeLimit(Exception):
     pass
 
+
 class ReachedLimit(Exception):
     def __init__(self, http_error=None, warcprox_meta=None, http_payload=None):
         import json
+
         if http_error:
             if "warcprox-meta" in http_error.headers:
-                self.warcprox_meta = json.loads(
-                        http_error.headers["warcprox-meta"])
+                self.warcprox_meta = json.loads(http_error.headers["warcprox-meta"])
             else:
                 self.warcprox_meta = None
             self.http_payload = http_error.read()

@@ -55,28 +63,39 @@ class ReachedLimit(Exception):
 
     def __repr__(self):
         return "ReachedLimit(warcprox_meta=%r,http_payload=%r)" % (
-                self.warcprox_meta if hasattr(self, 'warcprox_meta') else None,
-                self.http_payload if hasattr(self, 'http_payload') else None)
+            self.warcprox_meta if hasattr(self, "warcprox_meta") else None,
+            self.http_payload if hasattr(self, "http_payload") else None,
+        )
 
     def __str__(self):
         return self.__repr__()
 
+
 # monkey-patch log levels TRACE and NOTICE
 logging.TRACE = (logging.NOTSET + logging.DEBUG) // 2
+
+
 def _logger_trace(self, msg, *args, **kwargs):
     if self.isEnabledFor(logging.TRACE):
         self._log(logging.TRACE, msg, args, **kwargs)
+
+
 logging.Logger.trace = _logger_trace
 logging.trace = logging.root.trace
-logging.addLevelName(logging.TRACE, 'TRACE')
+logging.addLevelName(logging.TRACE, "TRACE")
 
 logging.NOTICE = (logging.INFO + logging.WARN) // 2
+
+
 def _logger_notice(self, msg, *args, **kwargs):
     if self.isEnabledFor(logging.NOTICE):
         self._log(logging.NOTICE, msg, args, **kwargs)
+
+
 logging.Logger.notice = _logger_notice
 logging.notice = logging.root.notice
-logging.addLevelName(logging.NOTICE, 'NOTICE')
+logging.addLevelName(logging.NOTICE, "NOTICE")
 
 
 # see https://github.com/internetarchive/brozzler/issues/91
 def _logging_handler_handle(self, record):

@@ -91,9 +110,13 @@ def _logging_handler_handle(self, record):
             except:
                 pass
     return rv
+
+
 logging.Handler.handle = _logging_handler_handle
 
 _behaviors = None
+
+
 def behaviors(behaviors_dir=None):
     """Return list of JS behaviors loaded from YAML file.
 

@@ -101,35 +124,43 @@ def behaviors(behaviors_dir=None):
     `js-templates/`. Defaults to brozzler dir.
     """
     import os, yaml, string
 
     global _behaviors
     if _behaviors is None:
         d = behaviors_dir or os.path.dirname(__file__)
-        behaviors_yaml = os.path.join(d, 'behaviors.yaml')
+        behaviors_yaml = os.path.join(d, "behaviors.yaml")
         with open(behaviors_yaml) as fin:
             _behaviors = yaml.safe_load(fin)
     return _behaviors
 
+
 def behavior_script(url, template_parameters=None, behaviors_dir=None):
-    '''
+    """
     Returns the javascript behavior string populated with template_parameters.
-    '''
+    """
     import re, logging, json
 
     for behavior in behaviors(behaviors_dir=behaviors_dir):
-        if re.match(behavior['url_regex'], url):
+        if re.match(behavior["url_regex"], url):
             parameters = dict()
-            if 'default_parameters' in behavior:
-                parameters.update(behavior['default_parameters'])
+            if "default_parameters" in behavior:
+                parameters.update(behavior["default_parameters"])
             if template_parameters:
                 parameters.update(template_parameters)
             template = jinja2_environment(behaviors_dir).get_template(
-                    behavior['behavior_js_template'])
+                behavior["behavior_js_template"]
+            )
             script = template.render(parameters)
             logging.info(
-                    'using template=%r populated with parameters=%r for %r',
-                    behavior['behavior_js_template'], json.dumps(parameters), url)
+                "using template=%r populated with parameters=%r for %r",
+                behavior["behavior_js_template"],
+                json.dumps(parameters),
+                url,
+            )
             return script
     return None
 
+
 class ThreadExceptionGate:
     logger = logging.getLogger(__module__ + "." + __qualname__)
 
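The two functions above are the entry points for brozzler's page-interaction behaviors: behaviors() lazily loads behaviors.yaml, and behavior_script() renders the jinja2 template of the first entry whose url_regex matches the page URL, overlaying its default_parameters with any caller-supplied template_parameters. A minimal usage sketch, not part of this commit — the YAML entry in the comment is hypothetical, for illustration only:

import brozzler

# A hypothetical behaviors.yaml entry, for illustration:
#
#   - url_regex: '^https?://(?:www\.)?example\.com/.*$'
#     behavior_js_template: umbraBehavior.js.j2
#     default_parameters:
#       actionInterval: 2.5
#
# behavior_script() returns the rendered javascript for the first entry
# whose url_regex matches, or None if nothing matches.
script = brozzler.behavior_script(
    "https://www.example.com/some/page",
    template_parameters={"actionInterval": 5},  # overrides default_parameters
)
if script is None:
    print("no behavior matched")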
@@ -142,8 +173,7 @@ class ThreadExceptionGate:
     def __enter__(self):
         assert self.thread == threading.current_thread()
         if self.pending_exception:
-            self.logger.info(
-                    'raising pending exception %s', self.pending_exception)
+            self.logger.info("raising pending exception %s", self.pending_exception)
             tmp = self.pending_exception
             self.pending_exception = None
             raise tmp

@@ -160,19 +190,26 @@ class ThreadExceptionGate:
         with self.lock:
             if self.pending_exception:
                 self.logger.warning(
-                        '%r already pending for thread %r, discarding %r',
-                        self.pending_exception, self.thread, e)
+                    "%r already pending for thread %r, discarding %r",
+                    self.pending_exception,
+                    self.thread,
+                    e,
+                )
             else:
                 self.pending_exception = e
 
     def __repr__(self):
-        return '<ThreadExceptionGate(%s)>' % self.thread
+        return "<ThreadExceptionGate(%s)>" % self.thread
 
+
 import threading
 
 _thread_exception_gates = {}
 _thread_exception_gates_lock = threading.Lock()
+
+
 def thread_exception_gate(thread=None):
-    '''
+    """
     Returns a `ThreadExceptionGate` for `thread` (current thread by default).
 
     `ThreadExceptionGate` is a context manager which allows exceptions to be

@@ -191,7 +228,7 @@ def thread_exception_gate(thread=None):
     is queued, and raised immediately if and when the thread enters the
     context. Only one exception will be queued this way at a time, others are
     discarded.
-    '''
+    """
     if not thread:
         thread = threading.current_thread()
 

@@ -201,10 +238,12 @@ def thread_exception_gate(thread=None):
 
     return _thread_exception_gates[thread]
 
+
 thread_accept_exceptions = thread_exception_gate
+
+
 def thread_raise(thread, exctype):
-    '''
+    """
     Raises or queues the exception `exctype` for the thread `thread`.
 
     See the documentation on the function `thread_exception_gate()` for more

@@ -218,40 +257,43 @@ def thread_raise(thread, exctype):
     Raises:
         TypeError if `exctype` is not a class
         ValueError, SystemError in case of unexpected problems
-    '''
+    """
     import ctypes, inspect, threading, logging
 
     if not inspect.isclass(exctype):
         raise TypeError(
-                'cannot raise %s, only exception types can be raised (not '
-                'instances)' % exctype)
+            "cannot raise %s, only exception types can be raised (not "
+            "instances)" % exctype
+        )
 
     gate = thread_exception_gate(thread)
     with gate.lock:
         if gate.ok_to_raise.is_set() and thread.is_alive():
             gate.ok_to_raise.clear()
-            logging.info('raising %s in thread %s', exctype, thread)
+            logging.info("raising %s in thread %s", exctype, thread)
             res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
-                    ctypes.c_long(thread.ident), ctypes.py_object(exctype))
+                ctypes.c_long(thread.ident), ctypes.py_object(exctype)
+            )
             if res == 0:
-                raise ValueError(
-                        'invalid thread id? thread.ident=%s' % thread.ident)
+                raise ValueError("invalid thread id? thread.ident=%s" % thread.ident)
             elif res != 1:
                 # if it returns a number greater than one, you're in trouble,
                 # and you should call it again with exc=NULL to revert the effect
                 ctypes.pythonapi.PyThreadState_SetAsyncExc(thread.ident, 0)
-                raise SystemError('PyThreadState_SetAsyncExc failed')
+                raise SystemError("PyThreadState_SetAsyncExc failed")
         else:
-            logging.info('queueing %s for thread %s', exctype, thread)
+            logging.info("queueing %s for thread %s", exctype, thread)
             gate.queue_exception(exctype)
 
+
 def sleep(duration):
-    '''
+    """
     Sleeps for duration seconds in increments of 0.5 seconds.
 
     Use this so that the sleep can be interrupted by thread_raise().
-    '''
+    """
     import time
 
     start = time.time()
     while True:
         elapsed = time.time() - start

@@ -259,32 +301,41 @@ def sleep(duration):
             break
         time.sleep(min(duration - elapsed, 0.5))
 
+
 _jinja2_env = None
+
+
 def jinja2_environment(behaviors_dir=None):
     global _jinja2_env
     if not _jinja2_env:
         import os, jinja2, json
 
         if behaviors_dir:
-            _loader = jinja2.FileSystemLoader(os.path.join(behaviors_dir,
-                'js-templates'))
+            _loader = jinja2.FileSystemLoader(
+                os.path.join(behaviors_dir, "js-templates")
+            )
         else:
-            _loader=jinja2.PackageLoader('brozzler', 'js-templates')
+            _loader = jinja2.PackageLoader("brozzler", "js-templates")
         _jinja2_env = jinja2.Environment(loader=_loader, auto_reload=False)
-        _jinja2_env.filters['json'] = json.dumps
+        _jinja2_env.filters["json"] = json.dumps
     return _jinja2_env
 
+
 import urlcanon
+
+
 def _remove_query(url):
-    url.question_mark = b''
-    url.query = b''
+    url.question_mark = b""
+    url.query = b""
+
+
 # XXX chop off path after last slash??
-site_surt_canon = urlcanon.Canonicalizer(
-        urlcanon.semantic.steps + [_remove_query])
+site_surt_canon = urlcanon.Canonicalizer(urlcanon.semantic.steps + [_remove_query])
 
 import doublethink
 import datetime
-EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(
-        tzinfo=doublethink.UTC)
+
+EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=doublethink.UTC)
 
 # we could make this configurable if there's a good reason
 MAX_PAGE_FAILURES = 3
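thread_accept_exceptions() (an alias of thread_exception_gate()), thread_raise(), and the interruptible sleep() above together form brozzler's cross-thread interruption machinery. A minimal sketch of the intended usage, inferred from the docstrings above rather than taken from this diff (it assumes the gate's __enter__ sets ok_to_raise, as the checks in thread_raise() imply):

import threading
import time

import brozzler

def worker():
    try:
        # inside the gate, thread_raise() may interrupt this thread
        # asynchronously; brozzler.sleep() wakes every 0.5s so the
        # interruption lands promptly
        with brozzler.thread_accept_exceptions():
            brozzler.sleep(300)
    except brozzler.ShutdownRequested:
        print("worker: got ShutdownRequested, exiting cleanly")

th = threading.Thread(target=worker)
th.start()
time.sleep(1)
# raises ShutdownRequested inside `th` if it is in the gate, else queues it
brozzler.thread_raise(th, brozzler.ShutdownRequested)
th.join()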
@@ -294,10 +345,31 @@ from brozzler.robots import is_permitted_by_robots
 from brozzler.frontier import RethinkDbFrontier
 from brozzler.browser import Browser, BrowserPool, BrowsingException
 from brozzler.model import (
-        new_job, new_job_file, new_site, Job, Page, Site, InvalidJobConf)
+    new_job,
+    new_job_file,
+    new_site,
+    Job,
+    Page,
+    Site,
+    InvalidJobConf,
+)
 from brozzler.cli import suggest_default_chrome_exe
 
-__all__ = ['Page', 'Site', 'BrozzlerWorker', 'is_permitted_by_robots',
-        'RethinkDbFrontier', 'Browser', 'BrowserPool', 'BrowsingException',
-        'new_job', 'new_site', 'Job', 'new_job_file', 'InvalidJobConf',
-        'sleep', 'thread_accept_exceptions', 'thread_raise']
+__all__ = [
+    "Page",
+    "Site",
+    "BrozzlerWorker",
+    "is_permitted_by_robots",
+    "RethinkDbFrontier",
+    "Browser",
+    "BrowserPool",
+    "BrowsingException",
+    "new_job",
+    "new_site",
+    "Job",
+    "new_job_file",
+    "InvalidJobConf",
+    "sleep",
+    "thread_accept_exceptions",
+    "thread_raise",
+]
brozzler/browser.py

@@ -1,4 +1,4 @@
-'''
+"""
 brozzler/browser.py - manages the browsers for brozzler
 
 Copyright (C) 2014-2023 Internet Archive

@@ -14,7 +14,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-'''
+"""
 
 import logging
 import time

@@ -33,30 +33,35 @@ from brozzler.chrome import Chrome
 import socket
 import urlcanon
 
+
 class BrowsingException(Exception):
     pass
 
+
 class NoBrowsersAvailable(Exception):
     pass
 
+
 class BrowsingTimeout(BrowsingException):
     pass
 
+
 class BrowserPool:
-    '''
+    """
     Manages pool of browsers. Automatically chooses available port for the
     debugging protocol.
-    '''
-    logger = logging.getLogger(__module__ + '.' + __qualname__)
+    """
+
+    logger = logging.getLogger(__module__ + "." + __qualname__)
 
     def __init__(self, size=3, **kwargs):
-        '''
+        """
         Initializes the pool.
 
         Args:
             size: size of pool (default 3)
             **kwargs: arguments for Browser(...)
-        '''
+        """
         self.size = size
         self.kwargs = kwargs
         self._in_use = set()

@@ -65,7 +70,7 @@ class BrowserPool:
     def _fresh_browser(self):
         # choose available port
         sock = socket.socket()
-        sock.bind(('0.0.0.0', 0))
+        sock.bind(("0.0.0.0", 0))
         port = sock.getsockname()[1]
         sock.close()
 

@@ -73,12 +78,12 @@ class BrowserPool:
         return browser
 
     def acquire_multi(self, n=1):
-        '''
+        """
         Returns a list of up to `n` browsers.
 
         Raises:
             NoBrowsersAvailable if none available
-        '''
+        """
         browsers = []
         with self._lock:
             if len(self._in_use) >= self.size:

@@ -90,7 +95,7 @@ class BrowserPool:
         return browsers
 
     def acquire(self):
-        '''
+        """
         Returns an available instance.
 
         Returns:

@@ -98,7 +103,7 @@ class BrowserPool:
 
         Raises:
             NoBrowsersAvailable if none available
-        '''
+        """
         with self._lock:
             if len(self._in_use) >= self.size:
                 raise NoBrowsersAvailable

@@ -120,8 +125,8 @@ class BrowserPool:
 
     def shutdown_now(self):
         self.logger.info(
-                'shutting down browser pool (%s browsers in use)',
-                len(self._in_use))
+            "shutting down browser pool (%s browsers in use)", len(self._in_use)
+        )
         with self._lock:
             for browser in self._in_use:
                 browser.stop()
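A minimal usage sketch for the pool API above — not part of this diff; kwargs are passed through Browser(...) to Chrome(...), and pool.release() is assumed from elsewhere in this file:

import brozzler

pool = brozzler.BrowserPool(size=2, chrome_exe="chromium-browser")
try:
    browser = pool.acquire()  # raises NoBrowsersAvailable if all are in use
    try:
        browser.start()
        final_url, outlinks = browser.browse_page("https://example.com/")
        print(final_url, len(outlinks))
    finally:
        browser.stop()
        pool.release(browser)  # release() is assumed; it is not shown in this diff
finally:
    pool.shutdown_now()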
@@ -132,8 +137,9 @@ class BrowserPool:
     def num_in_use(self):
         return len(self._in_use)
 
+
 class WebsockReceiverThread(threading.Thread):
-    logger = logging.getLogger(__module__ + '.' + __qualname__)
+    logger = logging.getLogger(__module__ + "." + __qualname__)
 
     def __init__(self, websock, name=None, daemon=True):
         super().__init__(name=name, daemon=daemon)

@@ -175,50 +181,54 @@ class WebsockReceiverThread(threading.Thread):
         self.is_open = True
 
     def _on_error(self, websock, e):
-        '''
+        """
         Raises BrowsingException in the thread that created this instance.
-        '''
-        if isinstance(e, (
-                websocket.WebSocketConnectionClosedException,
-                ConnectionResetError)):
-            self.logger.error('websocket closed, did chrome die?')
+        """
+        if isinstance(
+            e, (websocket.WebSocketConnectionClosedException, ConnectionResetError)
+        ):
+            self.logger.error("websocket closed, did chrome die?")
         else:
-            self.logger.error(
-                    'exception from websocket receiver thread',
-                    exc_info=1)
+            self.logger.error("exception from websocket receiver thread", exc_info=1)
         brozzler.thread_raise(self.calling_thread, BrowsingException)
 
     def run(self):
         # ping_timeout is used as the timeout for the call to select.select()
         # in addition to its documented purpose, and must have a value to avoid
         # hangs in certain situations
-        self.websock.run_forever(sockopt=((socket.IPPROTO_TCP, socket.TCP_NODELAY, 1),),
-            ping_timeout=0.5)
+        self.websock.run_forever(
+            sockopt=((socket.IPPROTO_TCP, socket.TCP_NODELAY, 1),), ping_timeout=0.5
+        )
 
     def _on_message(self, websock, message):
         try:
             self._handle_message(websock, message)
         except:
             self.logger.error(
-                    'uncaught exception in _handle_message message=%s',
-                    message, exc_info=True)
+                "uncaught exception in _handle_message message=%s",
+                message,
+                exc_info=True,
+            )
 
     def _network_response_received(self, message):
-        status = message['params']['response'].get('status')
-        if (status == 420 and 'Warcprox-Meta' in CaseInsensitiveDict(
-                message['params']['response']['headers'])):
+        status = message["params"]["response"].get("status")
+        if status == 420 and "Warcprox-Meta" in CaseInsensitiveDict(
+            message["params"]["response"]["headers"]
+        ):
             if not self.reached_limit:
-                warcprox_meta = json.loads(CaseInsensitiveDict(
-                        message['params']['response']['headers'])['Warcprox-Meta'])
-                self.reached_limit = brozzler.ReachedLimit(
-                        warcprox_meta=warcprox_meta)
-                self.logger.info('reached limit %s', self.reached_limit)
-                brozzler.thread_raise(
-                        self.calling_thread, brozzler.ReachedLimit)
+                warcprox_meta = json.loads(
+                    CaseInsensitiveDict(message["params"]["response"]["headers"])[
+                        "Warcprox-Meta"
+                    ]
+                )
+                self.reached_limit = brozzler.ReachedLimit(warcprox_meta=warcprox_meta)
+                self.logger.info("reached limit %s", self.reached_limit)
+                brozzler.thread_raise(self.calling_thread, brozzler.ReachedLimit)
             else:
                 self.logger.info(
-                        'reached limit but self.reached_limit is already set, '
-                        'assuming the calling thread is already handling this')
+                    "reached limit but self.reached_limit is already set, "
+                    "assuming the calling thread is already handling this"
+                )
         if self.on_response:
             self.on_response(message)
 

@@ -226,75 +236,92 @@ class WebsockReceiverThread(threading.Thread):
         self.page_status = status
 
     def _javascript_dialog_opening(self, message):
-        self.logger.info('javascript dialog opened: %s', message)
-        if message['params']['type'] == 'alert':
+        self.logger.info("javascript dialog opened: %s", message)
+        if message["params"]["type"] == "alert":
             accept = True
         else:
             accept = False
         self.websock.send(
-                json.dumps(dict(
-                    id=0, method='Page.handleJavaScriptDialog',
-                    params={'accept': accept}), separators=',:'))
+            json.dumps(
+                dict(
+                    id=0,
+                    method="Page.handleJavaScriptDialog",
+                    params={"accept": accept},
+                ),
+                separators=",:",
+            )
+        )
 
     def _handle_message(self, websock, json_message):
         message = json.loads(json_message)
-        if 'method' in message:
-            if message['method'] == 'Page.loadEventFired':
+        if "method" in message:
+            if message["method"] == "Page.loadEventFired":
                 self.got_page_load_event = datetime.datetime.utcnow()
-            elif message['method'] == 'Network.responseReceived':
+            elif message["method"] == "Network.responseReceived":
                 self._network_response_received(message)
-            elif message['method'] == 'Network.requestWillBeSent':
+            elif message["method"] == "Network.requestWillBeSent":
                 if self.on_request:
                     self.on_request(message)
-            elif message['method'] == 'Page.interstitialShown':
+            elif message["method"] == "Page.interstitialShown":
                 # AITFIVE-1529: handle http auth
                 # we should kill the browser when we receive Page.interstitialShown and
                 # consider the page finished, until this is fixed:
                 # https://bugs.chromium.org/p/chromium/issues/detail?id=764505
-                self.logger.info('Page.interstialShown (likely unsupported http auth request)')
-                brozzler.thread_raise(self.calling_thread, brozzler.PageInterstitialShown)
-            elif message['method'] == 'Inspector.targetCrashed':
-                self.logger.error(
-                        '''chrome tab went "aw snap" or "he's dead jim"!''')
+                self.logger.info(
+                    "Page.interstialShown (likely unsupported http auth request)"
+                )
+                brozzler.thread_raise(
+                    self.calling_thread, brozzler.PageInterstitialShown
+                )
+            elif message["method"] == "Inspector.targetCrashed":
+                self.logger.error("""chrome tab went "aw snap" or "he's dead jim"!""")
                 brozzler.thread_raise(self.calling_thread, BrowsingException)
-            elif message['method'] == 'Console.messageAdded':
+            elif message["method"] == "Console.messageAdded":
                 self.logger.debug(
-                        'console.%s %s', message['params']['message']['level'],
-                        message['params']['message']['text'])
-            elif message['method'] == 'Runtime.exceptionThrown':
-                self.logger.debug('uncaught exception: %s', message)
-            elif message['method'] == 'Page.javascriptDialogOpening':
+                    "console.%s %s",
+                    message["params"]["message"]["level"],
+                    message["params"]["message"]["text"],
+                )
+            elif message["method"] == "Runtime.exceptionThrown":
+                self.logger.debug("uncaught exception: %s", message)
+            elif message["method"] == "Page.javascriptDialogOpening":
                 self._javascript_dialog_opening(message)
-            elif (message['method'] == 'Network.loadingFailed'
-                    and 'params' in message and 'errorText' in message['params']
-                    and message['params']['errorText'] == 'net::ERR_PROXY_CONNECTION_FAILED'):
+            elif (
+                message["method"] == "Network.loadingFailed"
+                and "params" in message
+                and "errorText" in message["params"]
+                and message["params"]["errorText"] == "net::ERR_PROXY_CONNECTION_FAILED"
+            ):
                 brozzler.thread_raise(self.calling_thread, brozzler.ProxyError)
-            elif message['method'] == 'ServiceWorker.workerVersionUpdated':
+            elif message["method"] == "ServiceWorker.workerVersionUpdated":
                 if self.on_service_worker_version_updated:
                     self.on_service_worker_version_updated(message)
             # else:
             #     self.logger.debug("%s %s", message["method"], json_message)
-        elif 'result' in message:
-            if message['id'] in self._result_messages:
-                self._result_messages[message['id']] = message
+        elif "result" in message:
+            if message["id"] in self._result_messages:
+                self._result_messages[message["id"]] = message
            # else:
            #     self.logger.debug("%s", json_message)
        # else:
        #     self.logger.debug("%s", json_message)
 
 
 class Browser:
-    '''
+    """
     Manages an instance of Chrome for browsing pages.
-    '''
-    logger = logging.getLogger(__module__ + '.' + __qualname__)
+    """
+
+    logger = logging.getLogger(__module__ + "." + __qualname__)
 
     def __init__(self, **kwargs):
-        '''
+        """
         Initializes the Browser.
 
         Args:
             **kwargs: arguments for Chrome(...)
-        '''
+        """
         self.chrome = Chrome(**kwargs)
         self.websock_url = None
         self.websock = None
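For reference, _handle_message() above dispatches on raw Chrome DevTools Protocol events. A Network.responseReceived event, for example, arrives as a JSON string of roughly this shape (illustrative and abridged, not taken from this diff):

# the kind of dict _handle_message() sees after json.loads(json_message);
# _network_response_received() inspects params.response.status and headers
event = {
    "method": "Network.responseReceived",
    "params": {
        "requestId": "1000.1",
        "response": {
            "url": "https://example.com/",
            "status": 200,
            "headers": {"Content-Type": "text/html"},
        },
    },
}
# a status of 420 with a Warcprox-Meta response header is warcprox's signal
# that a crawl limit was reached; _network_response_received() converts that
# into brozzler.ReachedLimit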
@@ -311,9 +338,9 @@ class Browser:
         self.stop()
 
     def _wait_for(self, callback, timeout=None):
-        '''
+        """
         Spins until callback() returns truthy.
-        '''
+        """
         start = time.time()
         while True:
             if callback():

@@ -321,112 +348,140 @@ class Browser:
             elapsed = time.time() - start
             if timeout and elapsed > timeout:
                 raise BrowsingTimeout(
-                        'timed out after %.1fs waiting for: %s' % (
-                            elapsed, callback))
+                    "timed out after %.1fs waiting for: %s" % (elapsed, callback)
+                )
             brozzler.sleep(self._wait_interval)
 
     def send_to_chrome(self, suppress_logging=False, **kwargs):
         msg_id = next(self._command_id)
-        kwargs['id'] = msg_id
-        msg = json.dumps(kwargs, separators=',:')
+        kwargs["id"] = msg_id
+        msg = json.dumps(kwargs, separators=",:")
         logging.log(
-                logging.TRACE if suppress_logging else logging.DEBUG,
-                'sending message to %s: %s', self.websock, msg)
+            logging.TRACE if suppress_logging else logging.DEBUG,
+            "sending message to %s: %s",
+            self.websock,
+            msg,
+        )
         self.websock.send(msg)
         return msg_id
 
     def start(self, **kwargs):
-        '''
+        """
         Starts chrome if it's not running.
 
         Args:
             **kwargs: arguments for self.chrome.start(...)
-        '''
+        """
         if not self.is_running():
             self.websock_url = self.chrome.start(**kwargs)
             self.websock = websocket.WebSocketApp(self.websock_url)
             self.websock_thread = WebsockReceiverThread(
-                    self.websock, name='WebsockThread:%s' % self.chrome.port)
+                self.websock, name="WebsockThread:%s" % self.chrome.port
+            )
             self.websock_thread.start()
 
             self._wait_for(lambda: self.websock_thread.is_open, timeout=30)
 
             # tell browser to send us messages we're interested in
-            self.send_to_chrome(method='Network.enable')
-            self.send_to_chrome(method='Page.enable')
+            self.send_to_chrome(method="Network.enable")
+            self.send_to_chrome(method="Page.enable")
             # Enable Console & Runtime output only when debugging.
             # After all, we just print these events with debug(), we don't use
             # them in Brozzler logic.
             if self.logger.isEnabledFor(logging.DEBUG):
-                self.send_to_chrome(method='Console.enable')
-                self.send_to_chrome(method='Runtime.enable')
-            self.send_to_chrome(method='ServiceWorker.enable')
-            self.send_to_chrome(method='ServiceWorker.setForceUpdateOnPageLoad')
+                self.send_to_chrome(method="Console.enable")
+                self.send_to_chrome(method="Runtime.enable")
+            self.send_to_chrome(method="ServiceWorker.enable")
+            self.send_to_chrome(method="ServiceWorker.setForceUpdateOnPageLoad")
 
             # disable google analytics and amp analytics
             self.send_to_chrome(
-                    method='Network.setBlockedURLs',
-                    params={'urls': ['*google-analytics.com/analytics.js*',
-                                     '*google-analytics.com/ga.js*',
-                                     '*google-analytics.com/ga_exp.js*',
-                                     '*google-analytics.com/urchin.js*',
-                                     '*google-analytics.com/collect*',
-                                     '*google-analytics.com/r/collect*',
-                                     '*google-analytics.com/__utm.gif*',
-                                     '*google-analytics.com/gtm/js?*',
-                                     '*google-analytics.com/cx/api.js*',
-                                     '*cdn.ampproject.org/*/amp-analytics*.js']})
+                method="Network.setBlockedURLs",
+                params={
+                    "urls": [
+                        "*google-analytics.com/analytics.js*",
+                        "*google-analytics.com/ga.js*",
+                        "*google-analytics.com/ga_exp.js*",
+                        "*google-analytics.com/urchin.js*",
+                        "*google-analytics.com/collect*",
+                        "*google-analytics.com/r/collect*",
+                        "*google-analytics.com/__utm.gif*",
+                        "*google-analytics.com/gtm/js?*",
+                        "*google-analytics.com/cx/api.js*",
+                        "*cdn.ampproject.org/*/amp-analytics*.js",
+                    ]
+                },
+            )
 
     def stop(self):
-        '''
+        """
         Stops chrome if it's running.
-        '''
+        """
         try:
-            if (self.websock and self.websock.sock
-                    and self.websock.sock.connected):
-                self.logger.info('shutting down websocket connection')
+            if self.websock and self.websock.sock and self.websock.sock.connected:
+                self.logger.info("shutting down websocket connection")
                 try:
                     self.websock.close()
                 except BaseException as e:
                     self.logger.error(
-                            'exception closing websocket %s - %s',
-                            self.websock, e)
+                        "exception closing websocket %s - %s", self.websock, e
+                    )
 
             self.chrome.stop()
 
             if self.websock_thread and (
-                    self.websock_thread != threading.current_thread()):
+                self.websock_thread != threading.current_thread()
+            ):
                 self.websock_thread.join(timeout=30)
                 if self.websock_thread.is_alive():
                     self.logger.error(
-                            '%s still alive 30 seconds after closing %s, will '
-                            'forcefully nudge it again', self.websock_thread,
-                            self.websock)
+                        "%s still alive 30 seconds after closing %s, will "
+                        "forcefully nudge it again",
+                        self.websock_thread,
+                        self.websock,
+                    )
                     self.websock.keep_running = False
                     self.websock_thread.join(timeout=30)
                     if self.websock_thread.is_alive():
                         self.logger.critical(
-                                '%s still alive 60 seconds after closing %s',
-                                self.websock_thread, self.websock)
+                            "%s still alive 60 seconds after closing %s",
+                            self.websock_thread,
+                            self.websock,
+                        )
 
             self.websock_url = None
         except:
-            self.logger.error('problem stopping', exc_info=True)
+            self.logger.error("problem stopping", exc_info=True)
 
     def is_running(self):
         return self.websock_url is not None
 
     def browse_page(
-            self, page_url, extra_headers=None,
-            user_agent=None, behavior_parameters=None, behaviors_dir=None,
-            on_request=None, on_response=None,
-            on_service_worker_version_updated=None, on_screenshot=None,
-            username=None, password=None, hashtags=None,
-            screenshot_full_page=False, skip_extract_outlinks=False,
-            skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False,
-            page_timeout=300, behavior_timeout=900,
-            extract_outlinks_timeout=60, download_throughput=-1, stealth=False):
-        '''
+        self,
+        page_url,
+        extra_headers=None,
+        user_agent=None,
+        behavior_parameters=None,
+        behaviors_dir=None,
+        on_request=None,
+        on_response=None,
+        on_service_worker_version_updated=None,
+        on_screenshot=None,
+        username=None,
+        password=None,
+        hashtags=None,
+        screenshot_full_page=False,
+        skip_extract_outlinks=False,
+        skip_visit_hashtags=False,
+        skip_youtube_dl=False,
+        simpler404=False,
+        page_timeout=300,
+        behavior_timeout=900,
+        extract_outlinks_timeout=60,
+        download_throughput=-1,
+        stealth=False,
+    ):
+        """
         Browses page in browser.
 
         Browser should already be running, i.e. start() should have been
|
@ -473,54 +528,60 @@ class Browser:
|
||||||
Raises:
|
Raises:
|
||||||
brozzler.ProxyError: in case of proxy connection error
|
brozzler.ProxyError: in case of proxy connection error
|
||||||
BrowsingException: if browsing the page fails in some other way
|
BrowsingException: if browsing the page fails in some other way
|
||||||
'''
|
"""
|
||||||
if not self.is_running():
|
if not self.is_running():
|
||||||
raise BrowsingException('browser has not been started')
|
raise BrowsingException("browser has not been started")
|
||||||
if self.is_browsing:
|
if self.is_browsing:
|
||||||
raise BrowsingException('browser is already busy browsing a page')
|
raise BrowsingException("browser is already busy browsing a page")
|
||||||
self.is_browsing = True
|
self.is_browsing = True
|
||||||
if on_request:
|
if on_request:
|
||||||
self.websock_thread.on_request = on_request
|
self.websock_thread.on_request = on_request
|
||||||
if on_response:
|
if on_response:
|
||||||
self.websock_thread.on_response = on_response
|
self.websock_thread.on_response = on_response
|
||||||
if on_service_worker_version_updated:
|
if on_service_worker_version_updated:
|
||||||
self.websock_thread.on_service_worker_version_updated = \
|
self.websock_thread.on_service_worker_version_updated = (
|
||||||
on_service_worker_version_updated
|
on_service_worker_version_updated
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
with brozzler.thread_accept_exceptions():
|
with brozzler.thread_accept_exceptions():
|
||||||
self.configure_browser(
|
self.configure_browser(
|
||||||
extra_headers=extra_headers,
|
extra_headers=extra_headers,
|
||||||
user_agent=user_agent,
|
user_agent=user_agent,
|
||||||
download_throughput=download_throughput,
|
download_throughput=download_throughput,
|
||||||
stealth=stealth)
|
stealth=stealth,
|
||||||
|
)
|
||||||
self.navigate_to_page(page_url, timeout=page_timeout)
|
self.navigate_to_page(page_url, timeout=page_timeout)
|
||||||
if password:
|
if password:
|
||||||
self.try_login(username, password, timeout=page_timeout)
|
self.try_login(username, password, timeout=page_timeout)
|
||||||
# if login redirected us, return to page_url
|
# if login redirected us, return to page_url
|
||||||
if page_url != self.url().split('#')[0]:
|
if page_url != self.url().split("#")[0]:
|
||||||
self.logger.debug(
|
self.logger.debug(
|
||||||
'login navigated away from %s; returning!',
|
"login navigated away from %s; returning!", page_url
|
||||||
page_url)
|
)
|
||||||
self.navigate_to_page(page_url, timeout=page_timeout)
|
self.navigate_to_page(page_url, timeout=page_timeout)
|
||||||
# If the target page HTTP status is 4xx/5xx, there is no point
|
# If the target page HTTP status is 4xx/5xx, there is no point
|
||||||
# in running behaviors, screenshot, outlink and hashtag
|
# in running behaviors, screenshot, outlink and hashtag
|
||||||
# extraction as we didn't get a valid page.
|
# extraction as we didn't get a valid page.
|
||||||
# This is only enabled with option `simpler404`.
|
# This is only enabled with option `simpler404`.
|
||||||
run_behaviors = True
|
run_behaviors = True
|
||||||
if simpler404 and (self.websock_thread.page_status is None or
|
if simpler404 and (
|
||||||
self.websock_thread.page_status >= 400):
|
self.websock_thread.page_status is None
|
||||||
|
or self.websock_thread.page_status >= 400
|
||||||
|
):
|
||||||
run_behaviors = False
|
run_behaviors = False
|
||||||
|
|
||||||
if run_behaviors and behavior_timeout > 0:
|
if run_behaviors and behavior_timeout > 0:
|
||||||
behavior_script = brozzler.behavior_script(
|
behavior_script = brozzler.behavior_script(
|
||||||
page_url, behavior_parameters,
|
page_url, behavior_parameters, behaviors_dir=behaviors_dir
|
||||||
behaviors_dir=behaviors_dir)
|
)
|
||||||
self.run_behavior(behavior_script, timeout=behavior_timeout)
|
self.run_behavior(behavior_script, timeout=behavior_timeout)
|
||||||
final_page_url = self.url()
|
final_page_url = self.url()
|
||||||
if on_screenshot:
|
if on_screenshot:
|
||||||
if simpler404:
|
if simpler404:
|
||||||
if self.websock_thread.page_status and \
|
if (
|
||||||
self.websock_thread.page_status < 400:
|
self.websock_thread.page_status
|
||||||
|
and self.websock_thread.page_status < 400
|
||||||
|
):
|
||||||
self._try_screenshot(on_screenshot, screenshot_full_page)
|
self._try_screenshot(on_screenshot, screenshot_full_page)
|
||||||
else:
|
else:
|
||||||
self._try_screenshot(on_screenshot, screenshot_full_page)
|
self._try_screenshot(on_screenshot, screenshot_full_page)
|
||||||
|
@ -528,9 +589,7 @@ class Browser:
|
||||||
if not run_behaviors or skip_extract_outlinks:
|
if not run_behaviors or skip_extract_outlinks:
|
||||||
outlinks = []
|
outlinks = []
|
||||||
else:
|
else:
|
||||||
outlinks = self.extract_outlinks(
|
outlinks = self.extract_outlinks(timeout=extract_outlinks_timeout)
|
||||||
timeout=extract_outlinks_timeout
|
|
||||||
)
|
|
||||||
if run_behaviors and not skip_visit_hashtags:
|
if run_behaviors and not skip_visit_hashtags:
|
||||||
self.visit_hashtags(final_page_url, hashtags, outlinks)
|
self.visit_hashtags(final_page_url, hashtags, outlinks)
|
||||||
return final_page_url, outlinks
|
return final_page_url, outlinks
|
||||||
|
@ -539,7 +598,7 @@ class Browser:
|
||||||
# more information, raise that one
|
# more information, raise that one
|
||||||
raise self.websock_thread.reached_limit
|
raise self.websock_thread.reached_limit
|
||||||
except websocket.WebSocketConnectionClosedException as e:
|
except websocket.WebSocketConnectionClosedException as e:
|
||||||
self.logger.error('websocket closed, did chrome die?')
|
self.logger.error("websocket closed, did chrome die?")
|
||||||
raise BrowsingException(e)
|
raise BrowsingException(e)
|
||||||
finally:
|
finally:
|
||||||
self.is_browsing = False
|
self.is_browsing = False
|
||||||
|
@ -550,21 +609,24 @@ class Browser:
|
||||||
"""The browser instance must be scrolled to the top of the page before
|
"""The browser instance must be scrolled to the top of the page before
|
||||||
trying to get a screenshot.
|
trying to get a screenshot.
|
||||||
"""
|
"""
|
||||||
self.send_to_chrome(method='Runtime.evaluate', suppress_logging=True,
|
self.send_to_chrome(
|
||||||
params={'expression': 'window.scroll(0,0)'})
|
method="Runtime.evaluate",
|
||||||
|
suppress_logging=True,
|
||||||
|
params={"expression": "window.scroll(0,0)"},
|
||||||
|
)
|
||||||
for i in range(3):
|
for i in range(3):
|
||||||
try:
|
try:
|
||||||
jpeg_bytes = self.screenshot(full_page)
|
jpeg_bytes = self.screenshot(full_page)
|
||||||
on_screenshot(jpeg_bytes)
|
on_screenshot(jpeg_bytes)
|
||||||
return
|
return
|
||||||
except BrowsingTimeout as e:
|
except BrowsingTimeout as e:
|
||||||
logging.error('attempt %s/3: %s', i+1, e)
|
logging.error("attempt %s/3: %s", i + 1, e)
|
||||||
|
|
||||||
def visit_hashtags(self, page_url, hashtags, outlinks):
|
def visit_hashtags(self, page_url, hashtags, outlinks):
|
||||||
_hashtags = set(hashtags or [])
|
_hashtags = set(hashtags or [])
|
||||||
for outlink in outlinks:
|
for outlink in outlinks:
|
||||||
url = urlcanon.whatwg(outlink)
|
url = urlcanon.whatwg(outlink)
|
||||||
hashtag = (url.hash_sign + url.fragment).decode('utf-8')
|
hashtag = (url.hash_sign + url.fragment).decode("utf-8")
|
||||||
urlcanon.canon.remove_fragment(url)
|
urlcanon.canon.remove_fragment(url)
|
||||||
if hashtag and str(url) == page_url:
|
if hashtag and str(url) == page_url:
|
||||||
_hashtags.add(hashtag)
|
_hashtags.add(hashtag)
|
||||||
|
@ -572,84 +634,85 @@ class Browser:
|
||||||
# out which hashtags were visited already and skip those
|
# out which hashtags were visited already and skip those
|
||||||
for hashtag in _hashtags:
|
for hashtag in _hashtags:
|
||||||
# navigate_to_hashtag (nothing to wait for so no timeout?)
|
# navigate_to_hashtag (nothing to wait for so no timeout?)
|
||||||
self.logger.debug('navigating to hashtag %s', hashtag)
|
self.logger.debug("navigating to hashtag %s", hashtag)
|
||||||
url = urlcanon.whatwg(page_url)
|
url = urlcanon.whatwg(page_url)
|
||||||
url.hash_sign = b'#'
|
url.hash_sign = b"#"
|
||||||
url.fragment = hashtag[1:].encode('utf-8')
|
url.fragment = hashtag[1:].encode("utf-8")
|
||||||
self.send_to_chrome(
|
self.send_to_chrome(method="Page.navigate", params={"url": str(url)})
|
||||||
method='Page.navigate', params={'url': str(url)})
|
|
||||||
time.sleep(5) # um.. wait for idleness or something?
|
time.sleep(5) # um.. wait for idleness or something?
|
||||||
# take another screenshot?
|
# take another screenshot?
|
||||||
# run behavior again with short timeout?
|
# run behavior again with short timeout?
|
||||||
# retrieve outlinks again and append to list?
|
# retrieve outlinks again and append to list?
|
||||||
|
|
||||||
def configure_browser(self, extra_headers=None, user_agent=None,
|
def configure_browser(
|
||||||
download_throughput=-1, stealth=False):
|
self, extra_headers=None, user_agent=None, download_throughput=-1, stealth=False
|
+    ):
         headers = extra_headers or {}
-        headers['Accept-Encoding'] = 'gzip' # avoid encodings br, sdch
+        headers["Accept-Encoding"] = "gzip"  # avoid encodings br, sdch
         self.websock_thread.expect_result(self._command_id.peek())
         msg_id = self.send_to_chrome(
-                method='Network.setExtraHTTPHeaders',
-                params={'headers': headers})
-        self._wait_for(
-                lambda: self.websock_thread.received_result(msg_id),
-                timeout=10)
+            method="Network.setExtraHTTPHeaders", params={"headers": headers}
+        )
+        self._wait_for(lambda: self.websock_thread.received_result(msg_id), timeout=10)
         if user_agent:
             msg_id = self.send_to_chrome(
-                    method='Network.setUserAgentOverride',
-                    params={'userAgent': user_agent})
+                method="Network.setUserAgentOverride", params={"userAgent": user_agent}
+            )
         if download_throughput > -1:
             # traffic shaping already used by SPN2 to aid warcprox resilience
             # parameter value as bytes/second, or -1 to disable (default)
-            msg_id = self.send_to_chrome(method='Network.emulateNetworkConditions',
-                    params={'downloadThroughput': download_throughput})
+            msg_id = self.send_to_chrome(
+                method="Network.emulateNetworkConditions",
+                params={"downloadThroughput": download_throughput},
+            )
         if stealth:
             self.websock_thread.expect_result(self._command_id.peek())
-            js = brozzler.jinja2_environment().get_template('stealth.js').render()
+            js = brozzler.jinja2_environment().get_template("stealth.js").render()
             msg_id = self.send_to_chrome(
-                    method='Page.addScriptToEvaluateOnNewDocument',
-                    params={'source': js})
+                method="Page.addScriptToEvaluateOnNewDocument", params={"source": js}
+            )
             self._wait_for(
-                    lambda: self.websock_thread.received_result(msg_id),
-                    timeout=10)
+                lambda: self.websock_thread.received_result(msg_id), timeout=10
+            )

     def navigate_to_page(self, page_url, timeout=300):
-        self.logger.info('navigating to page %s', page_url)
+        self.logger.info("navigating to page %s", page_url)
         self.websock_thread.got_page_load_event = None
         self.websock_thread.page_status = None
-        self.send_to_chrome(method='Page.navigate', params={'url': page_url})
+        self.send_to_chrome(method="Page.navigate", params={"url": page_url})
-        self._wait_for(
-                lambda: self.websock_thread.got_page_load_event,
-                timeout=timeout)
+        self._wait_for(lambda: self.websock_thread.got_page_load_event, timeout=timeout)

     def extract_outlinks(self, timeout=60):
-        self.logger.info('extracting outlinks')
+        self.logger.info("extracting outlinks")
         self.websock_thread.expect_result(self._command_id.peek())
-        js = brozzler.jinja2_environment().get_template(
-                'extract-outlinks.js').render()
+        js = brozzler.jinja2_environment().get_template("extract-outlinks.js").render()
         msg_id = self.send_to_chrome(
-                method='Runtime.evaluate', params={'expression': js})
+            method="Runtime.evaluate", params={"expression": js}
+        )
         self._wait_for(
-                lambda: self.websock_thread.received_result(msg_id),
-                timeout=timeout)
+            lambda: self.websock_thread.received_result(msg_id), timeout=timeout
+        )
         message = self.websock_thread.pop_result(msg_id)
-        if ('result' in message and 'result' in message['result']
-                and 'value' in message['result']['result']):
-            if message['result']['result']['value']:
+        if (
+            "result" in message
+            and "result" in message["result"]
+            and "value" in message["result"]["result"]
+        ):
+            if message["result"]["result"]["value"]:
                 out = []
-                for link in message['result']['result']['value'].split('\n'):
+                for link in message["result"]["result"]["value"].split("\n"):
                     try:
                         out.append(str(urlcanon.whatwg(link)))
                     except AddressValueError:
-                        self.logger.warning('skip invalid outlink: %s', link)
+                        self.logger.warning("skip invalid outlink: %s", link)
                 return frozenset(out)
             else:
                 # no links found
                 return frozenset()
         else:
             self.logger.error(
-                    'problem extracting outlinks, result message: %s', message)
+                "problem extracting outlinks, result message: %s", message
+            )
             return frozenset()

     def screenshot(self, full_page=False, timeout=45):
@@ -657,121 +720,141 @@ class Browser:
         inspiration:
         https://github.com/GoogleChrome/puppeteer/blob/master/lib/Page.js#L898
         """
-        self.logger.info('taking screenshot')
+        self.logger.info("taking screenshot")
         if full_page:
             self.websock_thread.expect_result(self._command_id.peek())
-            msg_id = self.send_to_chrome(method='Page.getLayoutMetrics')
+            msg_id = self.send_to_chrome(method="Page.getLayoutMetrics")
             self._wait_for(
-                    lambda: self.websock_thread.received_result(msg_id),
-                    timeout=timeout)
+                lambda: self.websock_thread.received_result(msg_id), timeout=timeout
+            )
             message = self.websock_thread.pop_result(msg_id)
-            width = message['result']['contentSize']['width']
-            height = message['result']['contentSize']['height']
+            width = message["result"]["contentSize"]["width"]
+            height = message["result"]["contentSize"]["height"]
             clip = dict(x=0, y=0, width=width, height=height, scale=1)
             deviceScaleFactor = 1
-            screenOrientation = {'angle': 0, 'type': 'portraitPrimary'}
+            screenOrientation = {"angle": 0, "type": "portraitPrimary"}
             self.send_to_chrome(
-                    method='Emulation.setDeviceMetricsOverride',
-                    params=dict(mobile=False, width=width, height=height,
+                method="Emulation.setDeviceMetricsOverride",
+                params=dict(
+                    mobile=False,
+                    width=width,
+                    height=height,
                     deviceScaleFactor=deviceScaleFactor,
-                        screenOrientation=screenOrientation)
+                    screenOrientation=screenOrientation,
+                ),
             )
-            capture_params = {'format': 'jpeg', 'quality': 95, 'clip': clip}
+            capture_params = {"format": "jpeg", "quality": 95, "clip": clip}
         else:
-            capture_params = {'format': 'jpeg', 'quality': 95}
+            capture_params = {"format": "jpeg", "quality": 95}
         self.websock_thread.expect_result(self._command_id.peek())
-        msg_id = self.send_to_chrome(method='Page.captureScreenshot',
-                params=capture_params)
+        msg_id = self.send_to_chrome(
+            method="Page.captureScreenshot", params=capture_params
+        )
         self._wait_for(
-                lambda: self.websock_thread.received_result(msg_id),
-                timeout=timeout)
+            lambda: self.websock_thread.received_result(msg_id), timeout=timeout
+        )
         message = self.websock_thread.pop_result(msg_id)
-        jpeg_bytes = base64.b64decode(message['result']['data'])
+        jpeg_bytes = base64.b64decode(message["result"]["data"])
         return jpeg_bytes

     def url(self, timeout=30):
-        '''
+        """
         Returns value of document.URL from the browser.
-        '''
+        """
         self.websock_thread.expect_result(self._command_id.peek())
         msg_id = self.send_to_chrome(
-                method='Runtime.evaluate',
-                params={'expression': 'document.URL'})
+            method="Runtime.evaluate", params={"expression": "document.URL"}
+        )
         self._wait_for(
-                lambda: self.websock_thread.received_result(msg_id),
-                timeout=timeout)
+            lambda: self.websock_thread.received_result(msg_id), timeout=timeout
+        )
         message = self.websock_thread.pop_result(msg_id)
-        return message['result']['result']['value']
+        return message["result"]["result"]["value"]

     def run_behavior(self, behavior_script, timeout=900):
         self.send_to_chrome(
-                method='Runtime.evaluate', suppress_logging=True,
-                params={'expression': behavior_script})
+            method="Runtime.evaluate",
+            suppress_logging=True,
+            params={"expression": behavior_script},
+        )

         check_interval = min(timeout, 7)
         start = time.time()
         while True:
             elapsed = time.time() - start
             if elapsed > timeout:
-                logging.info(
-                        'behavior reached hard timeout after %.1fs', elapsed)
+                logging.info("behavior reached hard timeout after %.1fs", elapsed)
                 return

             brozzler.sleep(check_interval)

             self.websock_thread.expect_result(self._command_id.peek())
             msg_id = self.send_to_chrome(
-                    method='Runtime.evaluate', suppress_logging=True,
-                    params={'expression': 'umbraBehaviorFinished()'})
+                method="Runtime.evaluate",
+                suppress_logging=True,
+                params={"expression": "umbraBehaviorFinished()"},
+            )
             try:
                 self._wait_for(
-                        lambda: self.websock_thread.received_result(msg_id),
-                        timeout=5)
+                    lambda: self.websock_thread.received_result(msg_id), timeout=5
+                )
                 msg = self.websock_thread.pop_result(msg_id)
-                if (msg and 'result' in msg
-                        and not ('exceptionDetails' in msg['result'])
-                        and not ('wasThrown' in msg['result']
-                            and msg['result']['wasThrown'])
-                        and 'result' in msg['result']
-                        and type(msg['result']['result']['value']) == bool
-                        and msg['result']['result']['value']):
-                    self.logger.info('behavior decided it has finished')
+                if (
+                    msg
+                    and "result" in msg
+                    and not ("exceptionDetails" in msg["result"])
+                    and not (
+                        "wasThrown" in msg["result"] and msg["result"]["wasThrown"]
+                    )
+                    and "result" in msg["result"]
+                    and type(msg["result"]["result"]["value"]) == bool
+                    and msg["result"]["result"]["value"]
+                ):
+                    self.logger.info("behavior decided it has finished")
                     return
             except BrowsingTimeout:
                 pass

     def try_login(self, username, password, timeout=300):
-        try_login_js = brozzler.jinja2_environment().get_template(
-                'try-login.js.j2').render(username=username, password=password)
+        try_login_js = (
+            brozzler.jinja2_environment()
+            .get_template("try-login.js.j2")
+            .render(username=username, password=password)
+        )

         self.websock_thread.got_page_load_event = None
         self.send_to_chrome(
-                method='Runtime.evaluate', suppress_logging=True,
-                params={'expression': try_login_js})
+            method="Runtime.evaluate",
+            suppress_logging=True,
+            params={"expression": try_login_js},
+        )

         # wait for tryLogin to finish trying (should be very very quick)
         start = time.time()
         while True:
             self.websock_thread.expect_result(self._command_id.peek())
             msg_id = self.send_to_chrome(
-                    method='Runtime.evaluate',
-                    params={'expression': 'try { __brzl_tryLoginState } catch (e) { "maybe-submitted-form" }'})
+                method="Runtime.evaluate",
+                params={
+                    "expression": 'try { __brzl_tryLoginState } catch (e) { "maybe-submitted-form" }'
+                },
+            )
             try:
                 self._wait_for(
-                        lambda: self.websock_thread.received_result(msg_id),
-                        timeout=5)
+                    lambda: self.websock_thread.received_result(msg_id), timeout=5
+                )
                 msg = self.websock_thread.pop_result(msg_id)
-                if (msg and 'result' in msg
-                        and 'result' in msg['result']):
-                    result = msg['result']['result']['value']
-                    if result == 'login-form-not-found':
+                if msg and "result" in msg and "result" in msg["result"]:
+                    result = msg["result"]["result"]["value"]
+                    if result == "login-form-not-found":
                         # we're done
                         return
-                    elif result in ('submitted-form', 'maybe-submitted-form'):
+                    elif result in ("submitted-form", "maybe-submitted-form"):
                         # wait for page load event below
                         self.logger.info(
-                                'submitted a login form, waiting for another '
-                                'page load event')
+                            "submitted a login form, waiting for another "
+                            "page load event"
+                        )
                         break
                     # else try again to get __brzl_tryLoginState

@@ -780,23 +863,23 @@ class Browser:

             if time.time() - start > 30:
                 raise BrowsingException(
-                        'timed out trying to check if tryLogin finished')
+                    "timed out trying to check if tryLogin finished"
+                )

         # if we get here, we submitted a form, now we wait for another page
         # load event
-        self._wait_for(
-                lambda: self.websock_thread.got_page_load_event,
-                timeout=timeout)
+        self._wait_for(lambda: self.websock_thread.got_page_load_event, timeout=timeout)


 class Counter:
     def __init__(self):
         self.next_value = 0

     def __next__(self):
         try:
             return self.next_value
         finally:
             self.next_value += 1

     def peek(self):
         return self.next_value
brozzler/chrome.py
@@ -1,4 +1,4 @@
-'''
+"""
 brozzler/chrome.py - manages the chrome/chromium browser for brozzler

 Copyright (C) 2014-2023 Internet Archive
@@ -14,7 +14,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-'''
+"""

 import logging
 import urllib.request
@@ -31,12 +31,13 @@ import json
 import tempfile
 import sys

+
 def check_version(chrome_exe):
-    '''
+    """
     Raises SystemExit if `chrome_exe` is not a supported browser version.

     Must run in the main thread to have the desired effect.
-    '''
+    """
     # mac$ /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --version
     # Google Chrome 64.0.3282.140
     # mac$ /Applications/Google\ Chrome\ Canary.app/Contents/MacOS/Google\ Chrome\ Canary --version
@@ -45,25 +46,28 @@ def check_version(chrome_exe):
     # Using PPAPI flash.
     # --ppapi-flash-path=/usr/lib/adobe-flashplugin/libpepflashplayer.so --ppapi-flash-version=
     # Chromium 61.0.3163.100 Built on Ubuntu , running on Ubuntu 16.04
-    cmd = [chrome_exe, '--version']
+    cmd = [chrome_exe, "--version"]
     out = subprocess.check_output(cmd, timeout=60)
-    m = re.search(br'(Chromium|Google Chrome) ([\d.]+)', out)
+    m = re.search(rb"(Chromium|Google Chrome) ([\d.]+)", out)
     if not m:
         sys.exit(
-                'unable to parse browser version from output of '
-                '%r: %r' % (subprocess.list2cmdline(cmd), out))
+            "unable to parse browser version from output of "
+            "%r: %r" % (subprocess.list2cmdline(cmd), out)
+        )
     version_str = m.group(2).decode()
-    major_version = int(version_str.split('.')[0])
+    major_version = int(version_str.split(".")[0])
     if major_version < 64:
-        sys.exit('brozzler requires chrome/chromium version 64 or '
-                'later but %s reports version %s' % (
-                    chrome_exe, version_str))
+        sys.exit(
+            "brozzler requires chrome/chromium version 64 or "
+            "later but %s reports version %s" % (chrome_exe, version_str)
+        )

+
 class Chrome:
-    logger = logging.getLogger(__module__ + '.' + __qualname__)
+    logger = logging.getLogger(__module__ + "." + __qualname__)

     def __init__(self, chrome_exe, port=9222, ignore_cert_errors=False):
-        '''
+        """
         Initializes instance of this class.

         Doesn't start the browser, start() does that.
@@ -73,7 +77,7 @@ class Chrome:
             port: chrome debugging protocol port (default 9222)
             ignore_cert_errors: configure chrome to accept all certs (default
                 False)
-        '''
+        """
         self.port = port
         self.chrome_exe = chrome_exe
         self.ignore_cert_errors = ignore_cert_errors
@@ -81,63 +85,72 @@ class Chrome:
         self.chrome_process = None

     def __enter__(self):
-        '''
+        """
         Returns websocket url to chrome window with about:blank loaded.
-        '''
+        """
         return self.start()

     def __exit__(self, *args):
         self.stop()

     def _init_cookie_db(self, cookie_db):
-        cookie_dir = os.path.join(self._chrome_user_data_dir, 'Default')
-        cookie_location = os.path.join(cookie_dir, 'Cookies')
-        self.logger.debug('cookie DB provided, writing to %s', cookie_location)
+        cookie_dir = os.path.join(self._chrome_user_data_dir, "Default")
+        cookie_location = os.path.join(cookie_dir, "Cookies")
+        self.logger.debug("cookie DB provided, writing to %s", cookie_location)
         os.makedirs(cookie_dir, exist_ok=True)

         try:
-            with open(cookie_location, 'wb') as cookie_file:
+            with open(cookie_location, "wb") as cookie_file:
                 cookie_file.write(cookie_db)
         except OSError:
             self.logger.error(
-                    'exception writing cookie file at %s',
-                    cookie_location, exc_info=True)
+                "exception writing cookie file at %s", cookie_location, exc_info=True
+            )

     def persist_and_read_cookie_db(self):
-        cookie_location = os.path.join(
-                self._chrome_user_data_dir, 'Default', 'Cookies')
+        cookie_location = os.path.join(self._chrome_user_data_dir, "Default", "Cookies")
         self.logger.debug(
-                'marking cookies persistent then reading file into memory: %s',
-                cookie_location)
+            "marking cookies persistent then reading file into memory: %s",
+            cookie_location,
+        )
         try:
             with sqlite3.connect(cookie_location) as conn:
                 cur = conn.cursor()
-                cur.execute('UPDATE cookies SET is_persistent = 1')
+                cur.execute("UPDATE cookies SET is_persistent = 1")
         except sqlite3.Error:
             try:
                 # db schema changed around version 66, this is the old schema
                 with sqlite3.connect(cookie_location) as conn:
                     cur = conn.cursor()
-                    cur.execute('UPDATE cookies SET persistent = 1')
+                    cur.execute("UPDATE cookies SET persistent = 1")
             except sqlite3.Error:
                 self.logger.error(
-                        'exception updating cookie DB %s', cookie_location,
-                        exc_info=True)
+                    "exception updating cookie DB %s", cookie_location, exc_info=True
+                )

         cookie_db = None
         try:
-            with open(cookie_location, 'rb') as cookie_file:
+            with open(cookie_location, "rb") as cookie_file:
                 cookie_db = cookie_file.read()
         except OSError:
             self.logger.error(
-                    'exception reading from cookie DB file %s',
-                    cookie_location, exc_info=True)
+                "exception reading from cookie DB file %s",
+                cookie_location,
+                exc_info=True,
+            )
         return cookie_db

-    def start(self, proxy=None, cookie_db=None, disk_cache_dir=None,
-            disk_cache_size=None, websocket_timeout=60,
-            window_height=900, window_width=1400):
-        '''
+    def start(
+        self,
+        proxy=None,
+        cookie_db=None,
+        disk_cache_dir=None,
+        disk_cache_size=None,
+        websocket_timeout=60,
+        window_height=900,
+        window_width=1400,
+    ):
+        """
         Starts chrome/chromium process.

         Args:
@@ -154,103 +167,126 @@ class Chrome:
             window_height, window_width: window height and width, in pixels
         Returns:
             websocket url to chrome window with about:blank loaded
-        '''
+        """
         # these can raise exceptions
         self._home_tmpdir = tempfile.TemporaryDirectory()
         self._chrome_user_data_dir = os.path.join(
-                self._home_tmpdir.name, 'chrome-user-data')
+            self._home_tmpdir.name, "chrome-user-data"
+        )
         if cookie_db:
             self._init_cookie_db(cookie_db)
         self._shutdown.clear()

         new_env = os.environ.copy()
-        new_env['HOME'] = self._home_tmpdir.name
+        new_env["HOME"] = self._home_tmpdir.name
         chrome_args = [
             self.chrome_exe,
-            '-v',
-            '--headless',
-            '--remote-debugging-port=%s' % self.port,
-            '--use-mock-keychain', # mac thing
-            '--user-data-dir=%s' % self._chrome_user_data_dir,
-            '--disable-background-networking', '--disable-breakpad',
-            '--disable-renderer-backgrounding', '--disable-hang-monitor',
-            '--disable-background-timer-throttling', '--mute-audio',
-            '--disable-web-sockets',
-            f'--window-size={window_width},{window_height}',
-            '--no-default-browser-check',
-            '--disable-first-run-ui', '--no-first-run',
-            '--homepage=about:blank', '--disable-direct-npapi-requests',
-            '--disable-web-security', '--disable-notifications',
-            '--disable-extensions', '--disable-save-password-bubble',
-            '--disable-sync']
+            "-v",
+            "--headless",
+            "--remote-debugging-port=%s" % self.port,
+            "--use-mock-keychain",  # mac thing
+            "--user-data-dir=%s" % self._chrome_user_data_dir,
+            "--disable-background-networking",
+            "--disable-breakpad",
+            "--disable-renderer-backgrounding",
+            "--disable-hang-monitor",
+            "--disable-background-timer-throttling",
+            "--mute-audio",
+            "--disable-web-sockets",
+            f"--window-size={window_width},{window_height}",
+            "--no-default-browser-check",
+            "--disable-first-run-ui",
+            "--no-first-run",
+            "--homepage=about:blank",
+            "--disable-direct-npapi-requests",
+            "--disable-web-security",
+            "--disable-notifications",
+            "--disable-extensions",
+            "--disable-save-password-bubble",
+            "--disable-sync",
+        ]

-        extra_chrome_args = os.environ.get('BROZZLER_EXTRA_CHROME_ARGS')
+        extra_chrome_args = os.environ.get("BROZZLER_EXTRA_CHROME_ARGS")
         if extra_chrome_args:
             chrome_args.extend(extra_chrome_args.split())
         if disk_cache_dir:
-            chrome_args.append('--disk-cache-dir=%s' % disk_cache_dir)
+            chrome_args.append("--disk-cache-dir=%s" % disk_cache_dir)
         if disk_cache_size:
-            chrome_args.append('--disk-cache-size=%s' % disk_cache_size)
+            chrome_args.append("--disk-cache-size=%s" % disk_cache_size)
         if self.ignore_cert_errors:
-            chrome_args.append('--ignore-certificate-errors')
+            chrome_args.append("--ignore-certificate-errors")
         if proxy:
-            chrome_args.append('--proxy-server=%s' % proxy)
-        chrome_args.append('about:blank')
-        self.logger.info('running: %r', subprocess.list2cmdline(chrome_args))
+            chrome_args.append("--proxy-server=%s" % proxy)
+        chrome_args.append("about:blank")
+        self.logger.info("running: %r", subprocess.list2cmdline(chrome_args))
         # start_new_session - new process group so we can kill the whole group
         self.chrome_process = subprocess.Popen(
-                chrome_args, env=new_env, start_new_session=True,
-                stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0)
+            chrome_args,
+            env=new_env,
+            start_new_session=True,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            bufsize=0,
+        )
         self._out_reader_thread = threading.Thread(
-                target=self._read_stderr_stdout,
-                name='ChromeOutReaderThread:%s' % self.port, daemon=True)
+            target=self._read_stderr_stdout,
+            name="ChromeOutReaderThread:%s" % self.port,
+            daemon=True,
+        )
         self._out_reader_thread.start()
-        self.logger.info('chrome running, pid %s' % self.chrome_process.pid)
+        self.logger.info("chrome running, pid %s" % self.chrome_process.pid)

         return self._websocket_url(timeout_sec=websocket_timeout)

     def _websocket_url(self, timeout_sec=60):
-        json_url = 'http://localhost:%s/json' % self.port
+        json_url = "http://localhost:%s/json" % self.port
         # make this a member variable so that kill -QUIT reports it
         self._start = time.time()
         self._last_warning = self._start
         while True:
             try:
                 raw_json = urllib.request.urlopen(json_url, timeout=30).read()
-                all_debug_info = json.loads(raw_json.decode('utf-8'))
-                debug_info = [x for x in all_debug_info
-                        if x['url'] == 'about:blank']
+                all_debug_info = json.loads(raw_json.decode("utf-8"))
+                debug_info = [x for x in all_debug_info if x["url"] == "about:blank"]

-                if debug_info and 'webSocketDebuggerUrl' in debug_info[0]:
-                    self.logger.debug('%s returned %s', json_url, raw_json)
-                    url = debug_info[0]['webSocketDebuggerUrl']
+                if debug_info and "webSocketDebuggerUrl" in debug_info[0]:
+                    self.logger.debug("%s returned %s", json_url, raw_json)
+                    url = debug_info[0]["webSocketDebuggerUrl"]
                     self.logger.info(
-                            'got chrome window websocket debug url %s from %s',
-                            url, json_url)
+                        "got chrome window websocket debug url %s from %s",
+                        url,
+                        json_url,
+                    )
                     return url
             except brozzler.ShutdownRequested:
                 raise
             except Exception as e:
                 if time.time() - self._last_warning > 30:
                     self.logger.warning(
-                            'problem with %s (will keep trying until timeout '
-                            'of %d seconds): %s', json_url, timeout_sec, e)
+                        "problem with %s (will keep trying until timeout "
+                        "of %d seconds): %s",
+                        json_url,
+                        timeout_sec,
+                        e,
+                    )
                     self._last_warning = time.time()
             finally:
                 e = None
                 if self.chrome_process:
                     if time.time() - self._start > timeout_sec:
                         e = Exception(
-                                'killing chrome, failed to retrieve %s after '
-                                '%s seconds' % (
-                                    json_url, time.time() - self._start))
+                            "killing chrome, failed to retrieve %s after "
+                            "%s seconds" % (json_url, time.time() - self._start)
+                        )
                     elif self.chrome_process.poll() is not None:
                         e = Exception(
-                                'chrome process died with status %s' % self.chrome_process.poll())
+                            "chrome process died with status %s"
+                            % self.chrome_process.poll()
+                        )
                     else:
                         time.sleep(0.5)
                 else:
-                    e = Exception('??? self.chrome_process is not set ???')
+                    e = Exception("??? self.chrome_process is not set ???")
                 if e:
                     self.stop()
                     raise e
@@ -258,11 +294,13 @@ class Chrome:
     def _read_stderr_stdout(self):
         # XXX select doesn't work on windows
         def readline_nonblock(f):
-            buf = b''
+            buf = b""
             try:
-                while not self._shutdown.is_set() and (
-                        len(buf) == 0 or buf[-1] != 0xa) and select.select(
-                            [f],[],[],0.5)[0]:
+                while (
+                    not self._shutdown.is_set()
+                    and (len(buf) == 0 or buf[-1] != 0xA)
+                    and select.select([f], [], [], 0.5)[0]
+                ):
                     buf += f.read(1)
             except (ValueError, OSError):
                 # When the chrome process crashes, stdout & stderr are closed
@@ -276,16 +314,16 @@ class Chrome:
                 buf = readline_nonblock(self.chrome_process.stdout)
                 if buf:
                     self.logger.trace(
-                            'chrome pid %s STDOUT %s',
-                            self.chrome_process.pid, buf)
+                        "chrome pid %s STDOUT %s", self.chrome_process.pid, buf
+                    )

                 buf = readline_nonblock(self.chrome_process.stderr)
                 if buf:
                     self.logger.trace(
-                            'chrome pid %s STDERR %s',
-                            self.chrome_process.pid, buf)
+                        "chrome pid %s STDERR %s", self.chrome_process.pid, buf
+                    )
         except:
-            self.logger.error('unexpected exception', exc_info=True)
+            self.logger.error("unexpected exception", exc_info=True)

     def stop(self):
         if not self.chrome_process or self._shutdown.is_set():
@@ -294,8 +332,7 @@ class Chrome:

         timeout_sec = 300
         if self.chrome_process.poll() is None:
-            self.logger.info(
-                    'terminating chrome pgid %s', self.chrome_process.pid)
+            self.logger.info("terminating chrome pgid %s", self.chrome_process.pid)

             os.killpg(self.chrome_process.pid, signal.SIGTERM)
             t0 = time.time()
@@ -306,12 +343,14 @@ class Chrome:
                 if status is not None:
                     if status == 0:
                         self.logger.info(
-                                'chrome pid %s exited normally',
-                                self.chrome_process.pid)
+                            "chrome pid %s exited normally", self.chrome_process.pid
+                        )
                     else:
                         self.logger.warning(
-                                'chrome pid %s exited with nonzero status %s',
-                                self.chrome_process.pid, status)
+                            "chrome pid %s exited with nonzero status %s",
+                            self.chrome_process.pid,
+                            status,
+                        )

                     # XXX I would like to forcefully kill the process group
                     # here to guarantee no orphaned chromium subprocesses hang
@@ -321,14 +360,18 @@ class Chrome:
                 time.sleep(0.5)

             self.logger.warning(
-                    'chrome pid %s still alive %.1f seconds after sending '
-                    'SIGTERM, sending SIGKILL', self.chrome_process.pid,
-                    time.time() - t0)
+                "chrome pid %s still alive %.1f seconds after sending "
+                "SIGTERM, sending SIGKILL",
+                self.chrome_process.pid,
+                time.time() - t0,
+            )
             os.killpg(self.chrome_process.pid, signal.SIGKILL)
             status = self.chrome_process.wait()
             self.logger.warning(
-                    'chrome pid %s reaped (status=%s) after killing with '
-                    'SIGKILL', self.chrome_process.pid, status)
+                "chrome pid %s reaped (status=%s) after killing with " "SIGKILL",
+                self.chrome_process.pid,
+                status,
+            )

         finally:
             self.chrome_process.stdout.close()
@@ -337,8 +380,7 @@ class Chrome:
                 self._home_tmpdir.cleanup()
             except:
                 self.logger.error(
-                        'exception deleting %s', self._home_tmpdir,
-                        exc_info=True)
+                    "exception deleting %s", self._home_tmpdir, exc_info=True
+                )
             self._out_reader_thread.join()
             self.chrome_process = None
900 brozzler/cli.py
File diff suppressed because it is too large
brozzler/dashboard/__init__.py
@@ -1,4 +1,4 @@
-'''
+"""
 brozzler/dashboard/__init__.py - flask app for brozzler dashboard, defines api
 endspoints etc

@@ -15,17 +15,20 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-'''
+"""

 import logging
 import sys

 try:
     import flask
 except ImportError as e:
     logging.critical(
         '%s: %s\n\nYou might need to run "pip install '
         'brozzler[dashboard]".\nSee README.rst for more information.',
-        type(e).__name__, e)
+        type(e).__name__,
+        e,
+    )
     sys.exit(1)
 import doublethink
 import json
@@ -41,33 +44,44 @@ app = flask.Flask(__name__)

 # configure with environment variables
 SETTINGS = {
-    'RETHINKDB_SERVERS': os.environ.get(
-        'BROZZLER_RETHINKDB_SERVERS', 'localhost').split(','),
-    'RETHINKDB_DB': os.environ.get('BROZZLER_RETHINKDB_DB', 'brozzler'),
-    'WAYBACK_BASEURL': os.environ.get(
-        'WAYBACK_BASEURL', 'http://localhost:8880/brozzler'),
-    'DASHBOARD_PORT': os.environ.get('DASHBOARD_PORT', '8000'),
-    'DASHBOARD_INTERFACE': os.environ.get('DASHBOARD_INTERFACE', 'localhost')
+    "RETHINKDB_SERVERS": os.environ.get(
+        "BROZZLER_RETHINKDB_SERVERS", "localhost"
+    ).split(","),
+    "RETHINKDB_DB": os.environ.get("BROZZLER_RETHINKDB_DB", "brozzler"),
+    "WAYBACK_BASEURL": os.environ.get(
+        "WAYBACK_BASEURL", "http://localhost:8880/brozzler"
+    ),
+    "DASHBOARD_PORT": os.environ.get("DASHBOARD_PORT", "8000"),
+    "DASHBOARD_INTERFACE": os.environ.get("DASHBOARD_INTERFACE", "localhost"),
 }
-rr = doublethink.Rethinker(
-    SETTINGS['RETHINKDB_SERVERS'], db=SETTINGS['RETHINKDB_DB'])
+rr = doublethink.Rethinker(SETTINGS["RETHINKDB_SERVERS"], db=SETTINGS["RETHINKDB_DB"])
 _svc_reg = None

+
 def service_registry():
     global _svc_reg
     if not _svc_reg:
         _svc_reg = doublethink.ServiceRegistry(rr)
     return _svc_reg

+
 @app.route("/api/sites/<site_id>/queued_count")
 @app.route("/api/site/<site_id>/queued_count")
 def queued_count(site_id):
-    reql = rr.table("pages").between(
-        [site_id, 0, False, r.minval], [site_id, 0, False, r.maxval],
-        index="priority_by_site").count()
+    reql = (
+        rr.table("pages")
+        .between(
+            [site_id, 0, False, r.minval],
+            [site_id, 0, False, r.maxval],
+            index="priority_by_site",
+        )
+        .count()
+    )
     logging.debug("querying rethinkdb: %s", reql)
     count = reql.run()
     return flask.jsonify(count=count)

+
 @app.route("/api/sites/<site_id>/queue")
 @app.route("/api/site/<site_id>/queue")
 def queue(site_id):
@@ -75,38 +89,52 @@ def queue(site_id):
     start = flask.request.args.get("start", 0)
     end = flask.request.args.get("end", start + 90)
     reql = rr.table("pages").between(
-        [site_id, 0, False, r.minval], [site_id, 0, False, r.maxval],
-        index="priority_by_site")[start:end]
+        [site_id, 0, False, r.minval],
+        [site_id, 0, False, r.maxval],
+        index="priority_by_site",
+    )[start:end]
     logging.debug("querying rethinkdb: %s", reql)
     queue_ = reql.run()
     return flask.jsonify(queue_=list(queue_))

+
 @app.route("/api/sites/<site_id>/pages_count")
 @app.route("/api/site/<site_id>/pages_count")
 @app.route("/api/sites/<site_id>/page_count")
 @app.route("/api/site/<site_id>/page_count")
 def page_count(site_id):
-    reql = rr.table("pages").between(
+    reql = (
+        rr.table("pages")
+        .between(
             [site_id, 1, False, r.minval],
             [site_id, r.maxval, False, r.maxval],
-        index="priority_by_site").count()
+            index="priority_by_site",
+        )
+        .count()
+    )
     logging.debug("querying rethinkdb: %s", reql)
     count = reql.run()
     return flask.jsonify(count=count)

+
 @app.route("/api/sites/<site_id>/pages")
 @app.route("/api/site/<site_id>/pages")
 def pages(site_id):
     """Pages already crawled."""
     start = int(flask.request.args.get("start", 0))
     end = int(flask.request.args.get("end", start + 90))
-    reql = rr.table("pages").between(
-        [site_id, 1, r.minval], [site_id, r.maxval, r.maxval],
-        index="least_hops").order_by(index="least_hops")[start:end]
+    reql = (
+        rr.table("pages")
+        .between(
+            [site_id, 1, r.minval], [site_id, r.maxval, r.maxval], index="least_hops"
+        )
+        .order_by(index="least_hops")[start:end]
+    )
     logging.debug("querying rethinkdb: %s", reql)
     pages_ = reql.run()
     return flask.jsonify(pages=list(pages_))

+
 @app.route("/api/pages/<page_id>")
 @app.route("/api/page/<page_id>")
 def page(page_id):
@@ -115,6 +143,7 @@ def page(page_id):
     page_ = reql.run()
     return flask.jsonify(page_)

+
 @app.route("/api/pages/<page_id>/yaml")
 @app.route("/api/page/<page_id>/yaml")
 def page_yaml(page_id):
@@ -122,8 +151,9 @@ def page_yaml(page_id):
     logging.debug("querying rethinkdb: %s", reql)
     page_ = reql.run()
     return app.response_class(
-        yaml.dump(page_, default_flow_style=False),
-        mimetype="application/yaml")
+        yaml.dump(page_, default_flow_style=False), mimetype="application/yaml"
+    )

+
 @app.route("/api/sites/<site_id>")
 @app.route("/api/site/<site_id>")
@@ -135,6 +165,7 @@ def site(site_id):
         s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
     return flask.jsonify(s)

+
 @app.route("/api/sites/<site_id>/yaml")
 @app.route("/api/site/<site_id>/yaml")
 def site_yaml(site_id):
@@ -142,8 +173,9 @@ def site_yaml(site_id):
     logging.debug("querying rethinkdb: %s", reql)
     site_ = reql.run()
     return app.response_class(
-        yaml.dump(site_, default_flow_style=False),
-        mimetype="application/yaml")
+        yaml.dump(site_, default_flow_style=False), mimetype="application/yaml"
+    )

+
 @app.route("/api/stats/<bucket>")
 def stats(bucket):
@@ -152,6 +184,7 @@ def stats(bucket):
     stats_ = reql.run()
     return flask.jsonify(stats_)

+
 @app.route("/api/jobs/<job_id>/sites")
 @app.route("/api/job/<job_id>/sites")
 def sites(job_id):
@@ -168,6 +201,7 @@ def sites(job_id):
         s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
     return flask.jsonify(sites=sites_)

+
 @app.route("/api/jobless-sites")
 def jobless_sites():
     # XXX inefficient (unindexed) query
@@ -180,6 +214,7 @@ def jobless_sites():
         s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
     return flask.jsonify(sites=sites_)

+
 @app.route("/api/jobs/<job_id>")
 @app.route("/api/job/<job_id>")
 def job(job_id):
@@ -192,6 +227,7 @@ def job(job_id):
     job_ = reql.run()
     return flask.jsonify(job_)

+
 @app.route("/api/jobs/<job_id>/yaml")
 @app.route("/api/job/<job_id>/yaml")
 def job_yaml(job_id):
@@ -203,19 +239,22 @@ def job_yaml(job_id):
     logging.debug("querying rethinkdb: %s", reql)
     job_ = reql.run()
     return app.response_class(
-        yaml.dump(job_, default_flow_style=False),
-        mimetype="application/yaml")
+        yaml.dump(job_, default_flow_style=False), mimetype="application/yaml"
+    )

+
 @app.route("/api/workers")
 def workers():
     workers_ = service_registry().available_services("brozzler-worker")
     return flask.jsonify(workers=list(workers_))

+
 @app.route("/api/services")
 def services():
     services_ = service_registry().available_services()
     return flask.jsonify(services=list(services_))

+
 @app.route("/api/jobs")
 def jobs():
     reql = rr.table("jobs").order_by(r.desc("id"))
@@ -223,20 +262,24 @@ def jobs():
     jobs_ = list(reql.run())
     return flask.jsonify(jobs=jobs_)

+
 @app.route("/api/config")
 def config():
     return flask.jsonify(config=SETTINGS)

+
 @app.route("/api/<path:path>")
 @app.route("/api", defaults={"path": ""})
 def api404(path):
     flask.abort(404)

+
 @app.route("/", defaults={"path": ""})
 @app.route("/<path:path>")
 def root(path):
     return flask.render_template("index.html")

+
 try:
     import gunicorn.app.base
     from gunicorn.six import iteritems
@@ -255,8 +298,12 @@ try:

         def load_config(self):
             config = dict(
-                [(key, value) for key, value in iteritems(self.options)
-                    if key in self.cfg.settings and value is not None])
+                [
+                    (key, value)
+                    for key, value in iteritems(self.options)
+                    if key in self.cfg.settings and value is not None
+                ]
+            )
             for key, value in iteritems(config):
                 self.cfg.set(key.lower(), value)
             self.cfg.set("logger_class", BypassGunicornLogging)
@@ -270,37 +317,42 @@ try:
         GunicornBrozzlerDashboard(app, options).run()

 except ImportError:
+
     def run():
         logging.info("running brozzler-dashboard using simple flask app.run")
-        app.run(host=SETTINGS['DASHBOARD_INTERFACE'], port=SETTINGS['DASHBOARD_PORT'])
+        app.run(host=SETTINGS["DASHBOARD_INTERFACE"], port=SETTINGS["DASHBOARD_PORT"])

+
 def main(argv=None):
     import argparse
     import brozzler.cli

     argv = argv or sys.argv
     arg_parser = argparse.ArgumentParser(
         prog=os.path.basename(argv[0]),
         formatter_class=argparse.RawDescriptionHelpFormatter,
         description=(
-            'brozzler-dashboard - web application for viewing brozzler '
-            'crawl status'),
+            "brozzler-dashboard - web application for viewing brozzler " "crawl status"
+        ),
         epilog=(
-            'brozzler-dashboard has no command line options, but can be '
-            'configured using the following environment variables:\n\n'
-            ' BROZZLER_RETHINKDB_SERVERS rethinkdb servers, e.g. '
-            'db0.foo.org,db0.foo.org:38015,db1.foo.org (default: '
-            'localhost)\n'
-            ' BROZZLER_RETHINKDB_DB rethinkdb database name '
-            '(default: brozzler)\n'
-            ' WAYBACK_BASEURL base url for constructing wayback '
-            'links (default http://localhost:8880/brozzler)'
-            ' DASHBOARD_PORT brozzler-dashboard listening port (default: 8000)\n'
-            ' DASHBOARD_INTERFACE brozzler-dashboard network interface binding (default: localhost)'))
+            "brozzler-dashboard has no command line options, but can be "
+            "configured using the following environment variables:\n\n"
+            " BROZZLER_RETHINKDB_SERVERS rethinkdb servers, e.g. "
+            "db0.foo.org,db0.foo.org:38015,db1.foo.org (default: "
+            "localhost)\n"
+            " BROZZLER_RETHINKDB_DB rethinkdb database name "
+            "(default: brozzler)\n"
+            " WAYBACK_BASEURL base url for constructing wayback "
+            "links (default http://localhost:8880/brozzler)"
+            " DASHBOARD_PORT brozzler-dashboard listening port (default: 8000)\n"
+            " DASHBOARD_INTERFACE brozzler-dashboard network interface binding (default: localhost)"
+        ),
+    )
     brozzler.cli.add_common_options(arg_parser, argv)
     args = arg_parser.parse_args(args=argv[1:])
     brozzler.cli.configure_logging(args)
     run()

+
 if __name__ == "__main__":
     main()
211 brozzler/easy.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-'''
+"""
 brozzler-easy - brozzler-worker, warcprox, pywb, and brozzler-dashboard all
 working together in a single process

@@ -16,10 +16,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-'''
+"""

 import sys
 import logging

 try:
     import warcprox
     import warcprox.main
@@ -32,7 +33,9 @@ except ImportError as e:
     logging.critical(
         '%s: %s\n\nYou might need to run "pip install '
         'brozzler[easy]".\nSee README.rst for more information.',
-        type(e).__name__, e)
+        type(e).__name__,
+        e,
+    )
     sys.exit(1)
 import argparse
 import brozzler
@@ -46,76 +49,112 @@ import doublethink
 import traceback
 import socketserver

+
 def _build_arg_parser(argv=None):
     argv = argv or sys.argv
     arg_parser = argparse.ArgumentParser(
         formatter_class=brozzler.cli.BetterArgumentDefaultsHelpFormatter,
-        prog=os.path.basename(argv[0]), description=(
-            'brozzler-easy - easy deployment of brozzler, with '
-            'brozzler-worker, warcprox, pywb, and brozzler-dashboard all '
-            'running in a single process'))
+        prog=os.path.basename(argv[0]),
+        description=(
+            "brozzler-easy - easy deployment of brozzler, with "
+            "brozzler-worker, warcprox, pywb, and brozzler-dashboard all "
+            "running in a single process"
+        ),
+    )

     # common args
     brozzler.cli.add_rethinkdb_options(arg_parser)
     arg_parser.add_argument(
-        '-d', '--warcs-dir', dest='warcs_dir', default='./warcs',
-        help='where to write warcs')
+        "-d",
+        "--warcs-dir",
+        dest="warcs_dir",
+        default="./warcs",
+        help="where to write warcs",
+    )

     # warcprox args
     arg_parser.add_argument(
-        '-c', '--cacert', dest='cacert',
-        default='./%s-warcprox-ca.pem' % socket.gethostname(),
+        "-c",
+        "--cacert",
+        dest="cacert",
+        default="./%s-warcprox-ca.pem" % socket.gethostname(),
         help=(
-            'warcprox CA certificate file; if file does not exist, it '
-            'will be created'))
+            "warcprox CA certificate file; if file does not exist, it "
+            "will be created"
+        ),
+    )
     arg_parser.add_argument(
-        '--certs-dir', dest='certs_dir',
-        default='./%s-warcprox-ca' % socket.gethostname(),
-        help='where warcprox will store and load generated certificates')
+        "--certs-dir",
+        dest="certs_dir",
+        default="./%s-warcprox-ca" % socket.gethostname(),
+        help="where warcprox will store and load generated certificates",
+    )
     arg_parser.add_argument(
-        '--onion-tor-socks-proxy', dest='onion_tor_socks_proxy',
-        default=None, help=(
-            'host:port of tor socks proxy, used only to connect to '
-            '.onion sites'))
+        "--onion-tor-socks-proxy",
+        dest="onion_tor_socks_proxy",
+        default=None,
+        help=("host:port of tor socks proxy, used only to connect to " ".onion sites"),
+    )

     # brozzler-worker args
     arg_parser.add_argument(
-        '-e', '--chrome-exe', dest='chrome_exe',
+        "-e",
+        "--chrome-exe",
+        dest="chrome_exe",
         default=brozzler.cli.suggest_default_chrome_exe(),
-        help='executable to use to invoke chrome')
+        help="executable to use to invoke chrome",
+    )
     arg_parser.add_argument(
-        '-n', '--max-browsers', dest='max_browsers',
-        type=int, default=1, help=(
-            'max number of chrome instances simultaneously '
-            'browsing pages'))
+        "-n",
+        "--max-browsers",
+        dest="max_browsers",
+        type=int,
+        default=1,
+        help=("max number of chrome instances simultaneously " "browsing pages"),
+    )

     # pywb args
     arg_parser.add_argument(
-        '--pywb-address', dest='pywb_address',
-        default='0.0.0.0',
-        help='pywb wayback address to listen on')
+        "--pywb-address",
+        dest="pywb_address",
+        default="0.0.0.0",
+        help="pywb wayback address to listen on",
+    )
     arg_parser.add_argument(
-        '--pywb-port', dest='pywb_port', type=int,
-        default=8880, help='pywb wayback port')
+        "--pywb-port",
+        dest="pywb_port",
+        type=int,
+        default=8880,
+        help="pywb wayback port",
+    )

     # dashboard args
     arg_parser.add_argument(
-        '--dashboard-address', dest='dashboard_address',
-        default='localhost',
-        help='brozzler dashboard address to listen on')
+        "--dashboard-address",
+        dest="dashboard_address",
+        default="localhost",
+        help="brozzler dashboard address to listen on",
+    )
     arg_parser.add_argument(
-        '--dashboard-port', dest='dashboard_port',
-        type=int, default=8881, help='brozzler dashboard port')
+        "--dashboard-port",
+        dest="dashboard_port",
+        type=int,
+        default=8881,
+        help="brozzler dashboard port",
+    )

     # common at the bottom args
     brozzler.cli.add_common_options(arg_parser, argv)

     return arg_parser

+
 class ThreadingWSGIServer(
-        socketserver.ThreadingMixIn, wsgiref.simple_server.WSGIServer):
+    socketserver.ThreadingMixIn, wsgiref.simple_server.WSGIServer
+):
     pass

+
 class BrozzlerEasyController:
|
class BrozzlerEasyController:
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||||
|
|
||||||
|
@ -123,25 +162,31 @@ class BrozzlerEasyController:
|
||||||
self.stop = threading.Event()
|
self.stop = threading.Event()
|
||||||
self.args = args
|
self.args = args
|
||||||
self.warcprox_controller = warcprox.controller.WarcproxController(
|
self.warcprox_controller = warcprox.controller.WarcproxController(
|
||||||
self._warcprox_opts(args))
|
self._warcprox_opts(args)
|
||||||
|
)
|
||||||
self.brozzler_worker = self._init_brozzler_worker(args)
|
self.brozzler_worker = self._init_brozzler_worker(args)
|
||||||
self.pywb_httpd = self._init_pywb(args)
|
self.pywb_httpd = self._init_pywb(args)
|
||||||
self.dashboard_httpd = self._init_brozzler_dashboard(args)
|
self.dashboard_httpd = self._init_brozzler_dashboard(args)
|
||||||
|
|
||||||
def _init_brozzler_dashboard(self, args):
|
def _init_brozzler_dashboard(self, args):
|
||||||
return wsgiref.simple_server.make_server(
|
return wsgiref.simple_server.make_server(
|
||||||
args.dashboard_address, args.dashboard_port,
|
args.dashboard_address,
|
||||||
brozzler.dashboard.app, ThreadingWSGIServer)
|
args.dashboard_port,
|
||||||
|
brozzler.dashboard.app,
|
||||||
|
ThreadingWSGIServer,
|
||||||
|
)
|
||||||
|
|
||||||
def _init_brozzler_worker(self, args):
|
def _init_brozzler_worker(self, args):
|
||||||
rr = doublethink.Rethinker(
|
rr = doublethink.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db)
|
||||||
args.rethinkdb_servers.split(","), args.rethinkdb_db)
|
|
||||||
frontier = brozzler.RethinkDbFrontier(rr)
|
frontier = brozzler.RethinkDbFrontier(rr)
|
||||||
service_registry = doublethink.ServiceRegistry(rr)
|
service_registry = doublethink.ServiceRegistry(rr)
|
||||||
worker = brozzler.worker.BrozzlerWorker(
|
worker = brozzler.worker.BrozzlerWorker(
|
||||||
frontier, service_registry, chrome_exe=args.chrome_exe,
|
frontier,
|
||||||
proxy='%s:%s' % self.warcprox_controller.proxy.server_address,
|
service_registry,
|
||||||
max_browsers=args.max_browsers)
|
chrome_exe=args.chrome_exe,
|
||||||
|
proxy="%s:%s" % self.warcprox_controller.proxy.server_address,
|
||||||
|
max_browsers=args.max_browsers,
|
||||||
|
)
|
||||||
return worker
|
return worker
|
||||||
|
|
||||||
def _init_pywb(self, args):
|
def _init_pywb(self, args):
|
||||||
|
@ -152,66 +197,67 @@ class BrozzlerEasyController:
|
||||||
brozzler.pywb.monkey_patch_fuzzy_query()
|
brozzler.pywb.monkey_patch_fuzzy_query()
|
||||||
brozzler.pywb.monkey_patch_calc_search_range()
|
brozzler.pywb.monkey_patch_calc_search_range()
|
||||||
|
|
||||||
if args.warcs_dir.endswith('/'):
|
if args.warcs_dir.endswith("/"):
|
||||||
warcs_dir = args.warcs_dir
|
warcs_dir = args.warcs_dir
|
||||||
else:
|
else:
|
||||||
warcs_dir = args.warcs_dir + '/'
|
warcs_dir = args.warcs_dir + "/"
|
||||||
|
|
||||||
conf = {
|
conf = {
|
||||||
'collections': {
|
"collections": {
|
||||||
'brozzler': {
|
"brozzler": {
|
||||||
'index_paths': brozzler.pywb.RethinkCDXSource(
|
"index_paths": brozzler.pywb.RethinkCDXSource(
|
||||||
servers=args.rethinkdb_servers.split(","),
|
servers=args.rethinkdb_servers.split(","),
|
||||||
db=args.rethinkdb_db, table='captures')
|
db=args.rethinkdb_db,
|
||||||
|
table="captures",
|
||||||
|
)
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
# 'enable_http_proxy': True,
|
# 'enable_http_proxy': True,
|
||||||
# 'enable_memento': True,
|
# 'enable_memento': True,
|
||||||
'archive_paths': warcs_dir,
|
"archive_paths": warcs_dir,
|
||||||
'enable_cdx_api': True,
|
"enable_cdx_api": True,
|
||||||
'framed_replay': True,
|
"framed_replay": True,
|
||||||
'port': args.pywb_port,
|
"port": args.pywb_port,
|
||||||
'enable_auto_colls': False,
|
"enable_auto_colls": False,
|
||||||
}
|
}
|
||||||
wsgi_app = pywb.framework.wsgi_wrappers.init_app(
|
wsgi_app = pywb.framework.wsgi_wrappers.init_app(
|
||||||
pywb.webapp.pywb_init.create_wb_router, config=conf,
|
pywb.webapp.pywb_init.create_wb_router, config=conf, load_yaml=False
|
||||||
load_yaml=False)
|
)
|
||||||
|
|
||||||
# disable is_hop_by_hop restrictions
|
# disable is_hop_by_hop restrictions
|
||||||
wsgiref.handlers.is_hop_by_hop = lambda x: False
|
wsgiref.handlers.is_hop_by_hop = lambda x: False
|
||||||
return wsgiref.simple_server.make_server(
|
return wsgiref.simple_server.make_server(
|
||||||
args.pywb_address, args.pywb_port, wsgi_app,
|
args.pywb_address, args.pywb_port, wsgi_app, ThreadingWSGIServer
|
||||||
ThreadingWSGIServer)
|
)
|
||||||
|
|
||||||
def start(self):
|
def start(self):
|
||||||
self.logger.info('starting warcprox')
|
self.logger.info("starting warcprox")
|
||||||
self.warcprox_controller.start()
|
self.warcprox_controller.start()
|
||||||
|
|
||||||
# XXX wait til fully started?
|
# XXX wait til fully started?
|
||||||
self.logger.info('starting brozzler-worker')
|
self.logger.info("starting brozzler-worker")
|
||||||
self.brozzler_worker.start()
|
self.brozzler_worker.start()
|
||||||
|
|
||||||
self.logger.info(
|
self.logger.info("starting pywb at %s:%s", *self.pywb_httpd.server_address)
|
||||||
'starting pywb at %s:%s', *self.pywb_httpd.server_address)
|
|
||||||
threading.Thread(target=self.pywb_httpd.serve_forever).start()
|
threading.Thread(target=self.pywb_httpd.serve_forever).start()
|
||||||
|
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
'starting brozzler-dashboard at %s:%s',
|
"starting brozzler-dashboard at %s:%s", *self.dashboard_httpd.server_address
|
||||||
*self.dashboard_httpd.server_address)
|
)
|
||||||
threading.Thread(target=self.dashboard_httpd.serve_forever).start()
|
threading.Thread(target=self.dashboard_httpd.serve_forever).start()
|
||||||
|
|
||||||
def shutdown(self):
|
def shutdown(self):
|
||||||
self.logger.info('shutting down brozzler-dashboard')
|
self.logger.info("shutting down brozzler-dashboard")
|
||||||
self.dashboard_httpd.shutdown()
|
self.dashboard_httpd.shutdown()
|
||||||
|
|
||||||
self.logger.info('shutting down brozzler-worker')
|
self.logger.info("shutting down brozzler-worker")
|
||||||
self.brozzler_worker.shutdown_now()
|
self.brozzler_worker.shutdown_now()
|
||||||
# brozzler-worker is fully shut down at this point
|
# brozzler-worker is fully shut down at this point
|
||||||
|
|
||||||
self.logger.info('shutting down pywb')
|
self.logger.info("shutting down pywb")
|
||||||
self.pywb_httpd.shutdown()
|
self.pywb_httpd.shutdown()
|
||||||
|
|
||||||
self.logger.info('shutting down warcprox')
|
self.logger.info("shutting down warcprox")
|
||||||
self.warcprox_controller.shutdown()
|
self.warcprox_controller.shutdown()
|
||||||
|
|
||||||
def wait_for_shutdown_request(self):
|
def wait_for_shutdown_request(self):
|
||||||
|
@ -222,14 +268,14 @@ class BrozzlerEasyController:
|
||||||
self.shutdown()
|
self.shutdown()
|
||||||
|
|
||||||
def _warcprox_opts(self, args):
|
def _warcprox_opts(self, args):
|
||||||
'''
|
"""
|
||||||
Takes args as produced by the argument parser built by
|
Takes args as produced by the argument parser built by
|
||||||
_build_arg_parser and builds warcprox arguments object suitable to pass
|
_build_arg_parser and builds warcprox arguments object suitable to pass
|
||||||
to warcprox.main.init_controller. Copies some arguments, renames some,
|
to warcprox.main.init_controller. Copies some arguments, renames some,
|
||||||
populates some with defaults appropriate for brozzler-easy, etc.
|
populates some with defaults appropriate for brozzler-easy, etc.
|
||||||
'''
|
"""
|
||||||
warcprox_opts = warcprox.Options()
|
warcprox_opts = warcprox.Options()
|
||||||
warcprox_opts.address = 'localhost'
|
warcprox_opts.address = "localhost"
|
||||||
# let the OS choose an available port; discover it later using
|
# let the OS choose an available port; discover it later using
|
||||||
# sock.getsockname()[1]
|
# sock.getsockname()[1]
|
||||||
warcprox_opts.port = 0
|
warcprox_opts.port = 0
|
||||||
|
@ -237,17 +283,18 @@ class BrozzlerEasyController:
|
||||||
warcprox_opts.certs_dir = args.certs_dir
|
warcprox_opts.certs_dir = args.certs_dir
|
||||||
warcprox_opts.directory = args.warcs_dir
|
warcprox_opts.directory = args.warcs_dir
|
||||||
warcprox_opts.gzip = True
|
warcprox_opts.gzip = True
|
||||||
warcprox_opts.prefix = 'brozzler'
|
warcprox_opts.prefix = "brozzler"
|
||||||
warcprox_opts.size = 1000 * 1000 * 1000
|
warcprox_opts.size = 1000 * 1000 * 1000
|
||||||
warcprox_opts.rollover_idle_time = 3 * 60
|
warcprox_opts.rollover_idle_time = 3 * 60
|
||||||
warcprox_opts.digest_algorithm = 'sha1'
|
warcprox_opts.digest_algorithm = "sha1"
|
||||||
warcprox_opts.base32 = True
|
warcprox_opts.base32 = True
|
||||||
warcprox_opts.stats_db_file = None
|
warcprox_opts.stats_db_file = None
|
||||||
warcprox_opts.playback_port = None
|
warcprox_opts.playback_port = None
|
||||||
warcprox_opts.playback_index_db_file = None
|
warcprox_opts.playback_index_db_file = None
|
||||||
warcprox_opts.rethinkdb_big_table_url = (
|
warcprox_opts.rethinkdb_big_table_url = "rethinkdb://%s/%s/captures" % (
|
||||||
'rethinkdb://%s/%s/captures' % (
|
args.rethinkdb_servers,
|
||||||
args.rethinkdb_servers, args.rethinkdb_db))
|
args.rethinkdb_db,
|
||||||
|
)
|
||||||
warcprox_opts.queue_size = 500
|
warcprox_opts.queue_size = 500
|
||||||
warcprox_opts.max_threads = None
|
warcprox_opts.max_threads = None
|
||||||
warcprox_opts.profile = False
|
warcprox_opts.profile = False
|
||||||
|
@ -259,9 +306,11 @@ class BrozzlerEasyController:
|
||||||
for th in threading.enumerate():
|
for th in threading.enumerate():
|
||||||
state_strs.append(str(th))
|
state_strs.append(str(th))
|
||||||
stack = traceback.format_stack(sys._current_frames()[th.ident])
|
stack = traceback.format_stack(sys._current_frames()[th.ident])
|
||||||
state_strs.append(''.join(stack))
|
state_strs.append("".join(stack))
|
||||||
logging.warning('dumping state (caught signal {})\n{}'.format(
|
logging.warning(
|
||||||
signum, '\n'.join(state_strs)))
|
"dumping state (caught signal {})\n{}".format(signum, "\n".join(state_strs))
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def main(argv=None):
|
def main(argv=None):
|
||||||
argv = argv or sys.argv
|
argv = argv or sys.argv
|
||||||
|
|
|
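A note on the reflow pattern seen all through brozzler/easy.py above: it is black's line-length rule at work. A call that fits within the default 88-column limit is left (or collapsed) on one line; a longer call is exploded to one argument per line with a trailing comma after the last argument. A minimal illustration, with hypothetical option names that are not taken from this diff:

import argparse

arg_parser = argparse.ArgumentParser()
# fits within 88 columns, so black leaves it on one line
arg_parser.add_argument("--example", default=None, help="short enough to stay put")
# too long for one line, so black gives each argument its own line
# and adds a trailing comma after the last one
arg_parser.add_argument(
    "--example-long-option",
    dest="example_long_option",
    default="./some/default/path",
    help="once a call exceeds the line limit, every argument moves to its own line",
)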
brozzler/frontier.py
@@ -1,4 +1,4 @@
-'''
+"""
 brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages

 Copyright (C) 2014-2018 Internet Archive
@@ -14,7 +14,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-'''
+"""

 import logging
 import brozzler
@@ -27,9 +27,11 @@ import urlcanon

 r = rdb.RethinkDB()


 class UnexpectedDbResult(Exception):
     pass


 class RethinkDbFrontier:
     logger = logging.getLogger(__module__ + "." + __qualname__)

@@ -47,40 +49,49 @@ class RethinkDbFrontier:
         tables = self.rr.table_list().run()
         if not "sites" in tables:
             self.logger.info(
-                "creating rethinkdb table 'sites' in database %r",
-                self.rr.dbname)
+                "creating rethinkdb table 'sites' in database %r", self.rr.dbname
+            )
             self.rr.table_create(
-                "sites", shards=self.shards, replicas=self.replicas).run()
-            self.rr.table("sites").index_create("sites_last_disclaimed", [
-                r.row["status"], r.row["last_disclaimed"]]).run()
+                "sites", shards=self.shards, replicas=self.replicas
+            ).run()
+            self.rr.table("sites").index_create(
+                "sites_last_disclaimed", [r.row["status"], r.row["last_disclaimed"]]
+            ).run()
             self.rr.table("sites").index_create("job_id").run()
         if not "pages" in tables:
             self.logger.info(
-                "creating rethinkdb table 'pages' in database %r",
-                self.rr.dbname)
+                "creating rethinkdb table 'pages' in database %r", self.rr.dbname
+            )
             self.rr.table_create(
-                "pages", shards=self.shards, replicas=self.replicas).run()
-            self.rr.table("pages").index_create("priority_by_site", [
-                r.row["site_id"], r.row["brozzle_count"],
-                r.row["claimed"], r.row["priority"]]).run()
+                "pages", shards=self.shards, replicas=self.replicas
+            ).run()
+            self.rr.table("pages").index_create(
+                "priority_by_site",
+                [
+                    r.row["site_id"],
+                    r.row["brozzle_count"],
+                    r.row["claimed"],
+                    r.row["priority"],
+                ],
+            ).run()
             # this index is for displaying pages in a sensible order in the web
             # console
-            self.rr.table("pages").index_create("least_hops", [
-                r.row["site_id"], r.row["brozzle_count"],
-                r.row["hops_from_seed"]]).run()
+            self.rr.table("pages").index_create(
+                "least_hops",
+                [r.row["site_id"], r.row["brozzle_count"], r.row["hops_from_seed"]],
+            ).run()
         if not "jobs" in tables:
             self.logger.info(
-                "creating rethinkdb table 'jobs' in database %r",
-                self.rr.dbname)
+                "creating rethinkdb table 'jobs' in database %r", self.rr.dbname
+            )
             self.rr.table_create(
-                "jobs", shards=self.shards, replicas=self.replicas).run()
+                "jobs", shards=self.shards, replicas=self.replicas
+            ).run()

     def _vet_result(self, result, **kwargs):
         # self.logger.debug("vetting expected=%s result=%s", kwargs, result)
         # {'replaced': 0, 'errors': 0, 'skipped': 0, 'inserted': 1, 'deleted': 0, 'generated_keys': ['292859c1-4926-4b27-9d87-b2c367667058'], 'unchanged': 0}
-        for k in [
-                "replaced", "errors", "skipped", "inserted", "deleted",
-                "unchanged"]:
+        for k in ["replaced", "errors", "skipped", "inserted", "deleted", "unchanged"]:
             if k in kwargs:
                 expected = kwargs[k]
             else:
@@ -88,55 +99,81 @@ class RethinkDbFrontier:
             if isinstance(expected, list):
                 if result.get(k) not in kwargs[k]:
                     raise UnexpectedDbResult(
-                        "expected %r to be one of %r in %r" % (
-                            k, expected, result))
+                        "expected %r to be one of %r in %r" % (k, expected, result)
+                    )
             else:
                 if result.get(k) != expected:
-                    raise UnexpectedDbResult("expected %r to be %r in %r" % (
-                        k, expected, result))
+                    raise UnexpectedDbResult(
+                        "expected %r to be %r in %r" % (k, expected, result)
+                    )

     def claim_sites(self, n=1):
-        self.logger.trace('claiming up to %s sites to brozzle', n)
+        self.logger.trace("claiming up to %s sites to brozzle", n)
         result = (
-            self.rr.table('sites').get_all(r.args(
-                r.db(self.rr.dbname).table('sites', read_mode='majority')
+            self.rr.table("sites")
+            .get_all(
+                r.args(
+                    r.db(self.rr.dbname)
+                    .table("sites", read_mode="majority")
                     .between(
-                    ['ACTIVE', r.minval], ['ACTIVE', r.maxval],
-                    index='sites_last_disclaimed')
-                .order_by(r.desc('claimed'), 'last_disclaimed')
+                        ["ACTIVE", r.minval],
+                        ["ACTIVE", r.maxval],
+                        index="sites_last_disclaimed",
+                    )
+                    .order_by(r.desc("claimed"), "last_disclaimed")
                     .fold(
-                    {}, lambda acc, site: acc.merge(
+                        {},
+                        lambda acc, site: acc.merge(
                             r.branch(
-                            site.has_fields('job_id'),
+                                site.has_fields("job_id"),
                                 r.object(
-                                site['job_id'].coerce_to('string'),
-                                acc[site['job_id'].coerce_to('string')].default(0).add(1)),
-                            {})),
+                                    site["job_id"].coerce_to("string"),
+                                    acc[site["job_id"].coerce_to("string")]
+                                    .default(0)
+                                    .add(1),
+                                ),
+                                {},
+                            )
+                        ),
                         emit=lambda acc, site, new_acc: r.branch(
                             r.and_(
                                 r.or_(
-                                site['claimed'].not_(),
-                                site['last_claimed'].lt(r.now().sub(60*60))),
+                                    site["claimed"].not_(),
+                                    site["last_claimed"].lt(r.now().sub(60 * 60)),
+                                ),
                                 r.or_(
-                                site.has_fields('max_claimed_sites').not_(),
-                                new_acc[site['job_id'].coerce_to('string')].le(site['max_claimed_sites']))),
-                        [site['id']], []))
-                .limit(n)))
+                                    site.has_fields("max_claimed_sites").not_(),
+                                    new_acc[site["job_id"].coerce_to("string")].le(
+                                        site["max_claimed_sites"]
+                                    ),
+                                ),
+                            ),
+                            [site["id"]],
+                            [],
+                        ),
+                    )
+                    .limit(n)
+                )
+            )
             .update(
                 # try to avoid a race condition resulting in multiple
                 # brozzler-workers claiming the same site
                 # see https://github.com/rethinkdb/rethinkdb/issues/3235#issuecomment-60283038
                 r.branch(
                     r.or_(
-                        r.row['claimed'].not_(),
-                        r.row['last_claimed'].lt(r.now().sub(60*60))),
-                    {'claimed': True, 'last_claimed': r.now()},
-                    {}),
-                return_changes=True)).run()
+                        r.row["claimed"].not_(),
+                        r.row["last_claimed"].lt(r.now().sub(60 * 60)),
+                    ),
+                    {"claimed": True, "last_claimed": r.now()},
+                    {},
+                ),
+                return_changes=True,
+            )
+        ).run()

         self._vet_result(
-            result, replaced=list(range(n+1)),
-            unchanged=list(range(n+1)))
+            result, replaced=list(range(n + 1)), unchanged=list(range(n + 1))
+        )
         sites = []
         for i in range(result["replaced"]):
             if result["changes"][i]["old_val"]["claimed"]:
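The update() above is the claim-race guard the comment refers to: r.branch() only sets claimed/last_claimed when the site is unclaimed or its last claim is more than an hour stale, so a worker that loses the race leaves the row unchanged. A minimal sketch of the same conditional-update idiom, assuming a local RethinkDB and a hypothetical site id (the real query above additionally folds per-job claim counts):

from rethinkdb import RethinkDB

r = RethinkDB()
conn = r.connect("localhost", 28015)

result = (
    r.table("sites")
    .get("some-site-id")  # hypothetical id, for illustration only
    .update(
        r.branch(
            r.or_(
                r.row["claimed"].not_(),
                r.row["last_claimed"].lt(r.now().sub(60 * 60)),
            ),
            {"claimed": True, "last_claimed": r.now()},
            {},  # another worker won the race; change nothing
        ),
        return_changes=True,
    )
    .run(conn)
)
# result["replaced"] == 1 means this worker's claim won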
@@ -145,24 +182,27 @@
                     "because it was last claimed a long time ago "
                     "at %s, and presumably some error stopped it from "
                     "being disclaimed",
-                    result["changes"][i]["old_val"]["last_claimed"])
+                    result["changes"][i]["old_val"]["last_claimed"],
+                )
             site = brozzler.Site(self.rr, result["changes"][i]["new_val"])
             sites.append(site)
-        self.logger.debug('claimed %s sites', len(sites))
+        self.logger.debug("claimed %s sites", len(sites))
         if sites:
             return sites
         else:
             raise brozzler.NothingToClaim

     def enforce_time_limit(self, site):
-        '''
+        """
         Raises `brozzler.ReachedTimeLimit` if appropriate.
-        '''
-        if (site.time_limit and site.time_limit > 0
-                and site.elapsed() > site.time_limit):
+        """
+        if site.time_limit and site.time_limit > 0 and site.elapsed() > site.time_limit:
             self.logger.debug(
-                "site FINISHED_TIME_LIMIT! time_limit=%s "
-                "elapsed=%s %s", site.time_limit, site.elapsed(), site)
+                "site FINISHED_TIME_LIMIT! time_limit=%s " "elapsed=%s %s",
+                site.time_limit,
+                site.elapsed(),
+                site,
+            )
             raise brozzler.ReachedTimeLimit

     def claim_page(self, site, worker_id):
@@ -170,15 +210,20 @@
         # brozzler-worker can be working on a site at a time, and that would
         # have to be the worker calling this method, so if something is claimed
         # already, it must have been left that way because of some error
-        result = self.rr.table("pages").between(
+        result = (
+            self.rr.table("pages")
+            .between(
                 [site.id, 0, r.minval, r.minval],
                 [site.id, 0, r.maxval, r.maxval],
-                index="priority_by_site").order_by(
-                        index=r.desc("priority_by_site")).limit(
-                                1).update({
-                                    "claimed":True,
-                                    "last_claimed_by":worker_id},
-                                    return_changes="always").run()
+                index="priority_by_site",
+            )
+            .order_by(index=r.desc("priority_by_site"))
+            .limit(1)
+            .update(
+                {"claimed": True, "last_claimed_by": worker_id}, return_changes="always"
+            )
+            .run()
+        )
         self._vet_result(result, unchanged=[0, 1], replaced=[0, 1])
         if result["unchanged"] == 0 and result["replaced"] == 0:
             raise brozzler.NothingToClaim
@@ -186,10 +231,16 @@
         return brozzler.Page(self.rr, result["changes"][0]["new_val"])

     def has_outstanding_pages(self, site):
-        results_iter = self.rr.table("pages").between(
+        results_iter = (
+            self.rr.table("pages")
+            .between(
                 [site.id, 0, r.minval, r.minval],
                 [site.id, 0, r.maxval, r.maxval],
-                index="priority_by_site").limit(1).run()
+                index="priority_by_site",
+            )
+            .limit(1)
+            .run()
+        )
         return len(list(results_iter)) > 0

     def completed_page(self, site, page):
@@ -209,15 +260,17 @@
     def honor_stop_request(self, site):
         """Raises brozzler.CrawlStopped if stop has been requested."""
         site.refresh()
-        if (site.stop_requested
-                and site.stop_requested <= doublethink.utcnow()):
+        if site.stop_requested and site.stop_requested <= doublethink.utcnow():
             self.logger.info("stop requested for site %s", site.id)
             raise brozzler.CrawlStopped

         if site.job_id:
             job = brozzler.Job.load(self.rr, site.job_id)
-            if (job and job.stop_requested
-                    and job.stop_requested <= doublethink.utcnow()):
+            if (
+                job
+                and job.stop_requested
+                and job.stop_requested <= doublethink.utcnow()
+            ):
                 self.logger.info("stop requested for job %s", site.job_id)
                 raise brozzler.CrawlStopped

@@ -239,8 +292,7 @@
                     return False
                 n += 1

-        self.logger.info(
-                "all %s sites finished, job %s is FINISHED!", n, job.id)
+        self.logger.info("all %s sites finished, job %s is FINISHED!", n, job.id)
         job.finish()
         job.save()
         return True
@@ -270,13 +322,11 @@
     def resume_job(self, job):
         job.status = "ACTIVE"
         job.stop_requested = None
-        job.starts_and_stops.append(
-                {"start":doublethink.utcnow(), "stop":None})
+        job.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None})
         job.save()
         for site in self.job_sites(job.id):
             site.status = "ACTIVE"
-            site.starts_and_stops.append(
-                    {"start":doublethink.utcnow(), "stop":None})
+            site.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None})
             site.save()

     def resume_site(self, site):
@@ -285,51 +335,55 @@
             job = brozzler.Job.load(self.rr, site.job_id)
             job.status = "ACTIVE"
             site.stop_requested = None
-            job.starts_and_stops.append(
-                    {"start":doublethink.utcnow(), "stop":None})
+            job.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None})
             job.save()
             site.status = "ACTIVE"
-            site.starts_and_stops.append(
-                    {"start":doublethink.utcnow(), "stop":None})
+            site.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None})
             site.save()

     def _build_fresh_page(self, site, parent_page, url, hops_off=0):
         url_for_scoping = urlcanon.semantic(url)
         url_for_crawling = urlcanon.whatwg(url)
-        hashtag = (url_for_crawling.hash_sign
-                + url_for_crawling.fragment).decode('utf-8')
+        hashtag = (url_for_crawling.hash_sign + url_for_crawling.fragment).decode(
+            "utf-8"
+        )
         urlcanon.canon.remove_fragment(url_for_crawling)
-        page = brozzler.Page(self.rr, {
-            'url': str(url_for_crawling),
-            'site_id': site.id,
-            'job_id': site.job_id,
-            'hops_from_seed': parent_page.hops_from_seed + 1,
-            'hop_path': str(parent_page.hop_path if parent_page.hop_path else "") + "L",
-            'via_page_id': parent_page.id,
-            'via_page_url': parent_page.url,
-            'hops_off_surt': hops_off,
-            'hashtags': [hashtag] if hashtag else []})
+        page = brozzler.Page(
+            self.rr,
+            {
+                "url": str(url_for_crawling),
+                "site_id": site.id,
+                "job_id": site.job_id,
+                "hops_from_seed": parent_page.hops_from_seed + 1,
+                "hop_path": str(parent_page.hop_path if parent_page.hop_path else "")
+                + "L",
+                "via_page_id": parent_page.id,
+                "via_page_url": parent_page.url,
+                "hops_off_surt": hops_off,
+                "hashtags": [hashtag] if hashtag else [],
+            },
+        )
         return page

     def _merge_page(self, existing_page, fresh_page):
-        '''
+        """
         Utility method for merging info from `brozzler.Page` instances
         representing the same url but with possibly different metadata.
-        '''
+        """
         existing_page.priority += fresh_page.priority
-        existing_page.hashtags = list(set(
-            (existing_page.hashtags or []) + (fresh_page.hashtags or [])))
-        existing_page.hops_off = min(
-            existing_page.hops_off, fresh_page.hops_off)
+        existing_page.hashtags = list(
+            set((existing_page.hashtags or []) + (fresh_page.hashtags or []))
+        )
+        existing_page.hops_off = min(existing_page.hops_off, fresh_page.hops_off)

     def _scope_and_enforce_robots(self, site, parent_page, outlinks):
-        '''
+        """
         Returns tuple (
             dict of {page_id: Page} of fresh `brozzler.Page` representing in
             scope links accepted by robots policy,
             set of in scope urls (canonicalized) blocked by robots policy,
             set of out-of-scope urls (canonicalized)).
-        '''
+        """
         pages = {}  # {page_id: Page, ...}
         blocked = set()
         out_of_scope = set()
@@ -337,17 +391,18 @@
             url_for_scoping = urlcanon.semantic(url)
             url_for_crawling = urlcanon.whatwg(url)
             decision = site.accept_reject_or_neither(
-                url_for_scoping, parent_page=parent_page)
+                url_for_scoping, parent_page=parent_page
+            )
             if decision is True:
                 hops_off = 0
             elif decision is None:
-                decision = parent_page.hops_off < site.scope.get(
-                    'max_hops_off', 0)
+                decision = parent_page.hops_off < site.scope.get("max_hops_off", 0)
                 hops_off = parent_page.hops_off + 1
             if decision is True:
                 if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
                     fresh_page = self._build_fresh_page(
-                        site, parent_page, url, hops_off)
+                        site, parent_page, url, hops_off
+                    )
                     if fresh_page.id in pages:
                         self._merge_page(pages[fresh_page.id], fresh_page)
                     else:
@@ -359,31 +414,32 @@
         return pages, blocked, out_of_scope

     def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
-        decisions = {'accepted':set(),'blocked':set(),'rejected':set()}
-        counts = {'added':0,'updated':0,'rejected':0,'blocked':0}
+        decisions = {"accepted": set(), "blocked": set(), "rejected": set()}
+        counts = {"added": 0, "updated": 0, "rejected": 0, "blocked": 0}

         fresh_pages, blocked, out_of_scope = self._scope_and_enforce_robots(
-            site, parent_page, outlinks)
-        decisions['blocked'] = blocked
-        decisions['rejected'] = out_of_scope
-        counts['blocked'] += len(blocked)
-        counts['rejected'] += len(out_of_scope)
+            site, parent_page, outlinks
+        )
+        decisions["blocked"] = blocked
+        decisions["rejected"] = out_of_scope
+        counts["blocked"] += len(blocked)
+        counts["rejected"] += len(out_of_scope)

         # get existing pages from rethinkdb
-        results = self.rr.table('pages').get_all(*fresh_pages.keys()).run()
-        pages = {doc['id']: brozzler.Page(self.rr, doc) for doc in results}
+        results = self.rr.table("pages").get_all(*fresh_pages.keys()).run()
+        pages = {doc["id"]: brozzler.Page(self.rr, doc) for doc in results}

         # build list of pages to save, consisting of new pages, and existing
         # pages updated with higher priority and new hashtags
         for fresh_page in fresh_pages.values():
-            decisions['accepted'].add(fresh_page.url)
+            decisions["accepted"].add(fresh_page.url)
             if fresh_page.id in pages:
                 page = pages[fresh_page.id]
                 self._merge_page(page, fresh_page)
-                counts['updated'] += 1
+                counts["updated"] += 1
             else:
                 pages[fresh_page.id] = fresh_page
-                counts['added'] += 1
+                counts["added"] += 1

         # make sure we're not stepping on our own toes in case we have a link
         # back to parent_page, which I think happens because of hashtags
@@ -398,17 +454,20 @@
         l = list(pages.values())
         for batch in (l[i : i + 50] for i in range(0, len(l), 50)):
             try:
-                self.logger.debug(
-                    'inserting/replacing batch of %s pages', len(batch))
-                reql = self.rr.table('pages').insert(batch, conflict='replace')
+                self.logger.debug("inserting/replacing batch of %s pages", len(batch))
+                reql = self.rr.table("pages").insert(batch, conflict="replace")
                 self.logger.trace(
                     'running query self.rr.table("pages").insert(%r, '
-                    'conflict="replace")', batch)
+                    'conflict="replace")',
+                    batch,
+                )
                 result = reql.run()
             except Exception as e:
                 self.logger.error(
-                    'problem inserting/replacing batch of %s pages',
-                    len(batch), exc_info=True)
+                    "problem inserting/replacing batch of %s pages",
+                    len(batch),
+                    exc_info=True,
+                )

         parent_page.outlinks = {}
         for k in decisions:
@@ -416,43 +475,56 @@
         parent_page.save()

         self.logger.info(
-            '%s new links added, %s existing links updated, %s links '
-            'rejected, %s links blocked by robots from %s',
-            counts['added'], counts['updated'], counts['rejected'],
-            counts['blocked'], parent_page)
+            "%s new links added, %s existing links updated, %s links "
+            "rejected, %s links blocked by robots from %s",
+            counts["added"],
+            counts["updated"],
+            counts["rejected"],
+            counts["blocked"],
+            parent_page,
+        )

     def reached_limit(self, site, e):
         self.logger.info("reached_limit site=%s e=%s", site, e)
         assert isinstance(e, brozzler.ReachedLimit)
-        if (site.reached_limit
-                and site.reached_limit != e.warcprox_meta["reached-limit"]):
+        if (
+            site.reached_limit
+            and site.reached_limit != e.warcprox_meta["reached-limit"]
+        ):
             self.logger.warning(
                 "reached limit %s but site had already reached limit %s",
-                e.warcprox_meta["reached-limit"], self.reached_limit)
+                e.warcprox_meta["reached-limit"],
+                self.reached_limit,
+            )
         else:
             site.reached_limit = e.warcprox_meta["reached-limit"]
             self.finished(site, "FINISHED_REACHED_LIMIT")

     def job_sites(self, job_id):
-        results = self.rr.table('sites').get_all(job_id, index="job_id").run()
+        results = self.rr.table("sites").get_all(job_id, index="job_id").run()
         for result in results:
             yield brozzler.Site(self.rr, result)

     def seed_page(self, site_id):
-        results = self.rr.table("pages").between(
+        results = (
+            self.rr.table("pages")
+            .between(
                 [site_id, r.minval, r.minval, r.minval],
                 [site_id, r.maxval, r.maxval, r.maxval],
-                index="priority_by_site").filter({"hops_from_seed":0}).run()
+                index="priority_by_site",
+            )
+            .filter({"hops_from_seed": 0})
+            .run()
+        )
         pages = list(results)
         if len(pages) > 1:
-            self.logger.warning(
-                    "more than one seed page for site_id %s ?", site_id)
+            self.logger.warning("more than one seed page for site_id %s ?", site_id)
         if len(pages) < 1:
             return None
         return brozzler.Page(self.rr, pages[0])

     def site_pages(self, site_id, brozzled=None):
-        '''
+        """
         Args:
             site_id (str or int):
             brozzled (bool): if true, results include only pages that have
@@ -460,16 +532,14 @@
                 not been brozzled; and if None (the default), all pages
         Returns:
             iterator of brozzler.Page
-        '''
+        """
         query = self.rr.table("pages").between(
-            [site_id, 1 if brozzled is True else 0,
-                r.minval, r.minval],
-            [site_id, 0 if brozzled is False else r.maxval,
-                r.maxval, r.maxval],
-            index="priority_by_site")
+            [site_id, 1 if brozzled is True else 0, r.minval, r.minval],
+            [site_id, 0 if brozzled is False else r.maxval, r.maxval, r.maxval],
+            index="priority_by_site",
+        )
         self.logger.trace("running query: %r", query)
         results = query.run()
         for result in results:
             self.logger.trace("yielding result: %r", result)
             yield brozzler.Page(self.rr, result)
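Several of the queries above (claim_page, has_outstanding_pages, seed_page, site_pages) lean on the compound priority_by_site index, [site_id, brozzle_count, claimed, priority]. Pinning brozzle_count to 0 in both between() bounds restricts results to pages not yet brozzled, which is how already-crawled pages get skipped. A sketch of that bounds trick, assuming a local RethinkDB and a hypothetical site id:

from rethinkdb import RethinkDB

r = RethinkDB()
conn = r.connect("localhost", 28015)
site_id = "some-site-id"  # hypothetical

# pages of this site not yet brozzled: brozzle_count pinned to 0,
# any value of claimed and priority
unbrozzled = (
    r.table("pages")
    .between(
        [site_id, 0, r.minval, r.minval],
        [site_id, 0, r.maxval, r.maxval],
        index="priority_by_site",
    )
    .run(conn)
)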
brozzler/models.py
@@ -1,4 +1,4 @@
-'''
+"""
 brozzler/models.py - model classes representing jobs, sites, and pages, with
 related logic

@@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-'''
+"""

 import brozzler
 import base64
@@ -36,15 +36,18 @@ import yaml
 import zlib
 from typing import Optional


 def load_schema():
-    schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yaml')
+    schema_file = os.path.join(os.path.dirname(__file__), "job_schema.yaml")
     with open(schema_file) as f:
         return yaml.safe_load(f)


 class JobValidator(cerberus.Validator):
     def _validate_type_url(self, value):
         url = urllib.parse.urlparse(value)
-        return url.scheme in ('http', 'https', 'ftp')
+        return url.scheme in ("http", "https", "ftp")


 class InvalidJobConf(Exception):
     def __init__(self, validator):
@@ -53,15 +56,17 @@ class InvalidJobConf(Exception):
             # Cerberus does a nice job hiding the bad value. In the case I
             # debugged, I found it here. Maybe there's a better way to see it.
             value = validator._errors[0].info[0][0].info[0][0].value
-            self.errors['bad value'] = value
+            self.errors["bad value"] = value
         except:
             value = None


 def validate_conf(job_conf, schema=load_schema()):
     v = JobValidator(schema)
     if not v.validate(job_conf, normalize=False):
         raise InvalidJobConf(v)


 def merge(a, b):
     if isinstance(a, dict) and isinstance(b, dict):
         merged = dict(a)
@@ -75,19 +80,22 @@ def merge(a, b):
     else:
         return a


 def new_job_file(frontier, job_conf_file):
-    '''Returns new Job.'''
+    """Returns new Job."""
     logging.info("loading %s", job_conf_file)
     with open(job_conf_file) as f:
         job_conf = yaml.safe_load(f)
     return new_job(frontier, job_conf)


 def new_job(frontier, job_conf):
-    '''Returns new Job.'''
+    """Returns new Job."""
     validate_conf(job_conf)
-    job = Job(frontier.rr, {
-        "conf": job_conf, "status": "ACTIVE",
-        "started": doublethink.utcnow()})
+    job = Job(
+        frontier.rr,
+        {"conf": job_conf, "status": "ACTIVE", "started": doublethink.utcnow()},
+    )
     if "id" in job_conf:
         job.id = job_conf["id"]
     if "max_claimed_sites" in job_conf:
@@ -109,31 +117,39 @@ def new_job(frontier, job_conf):
     # insert in batches to avoid this error
     # rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:
     for batch in (pages[i : i + 500] for i in range(0, len(pages), 500)):
-        logging.info('inserting batch of %s pages', len(batch))
-        result = frontier.rr.table('pages').insert(batch).run()
+        logging.info("inserting batch of %s pages", len(batch))
+        result = frontier.rr.table("pages").insert(batch).run()
     for batch in (sites[i : i + 100] for i in range(0, len(sites), 100)):
-        logging.info('inserting batch of %s sites', len(batch))
-        result = frontier.rr.table('sites').insert(batch).run()
-    logging.info('job %s fully started', job.id)
+        logging.info("inserting batch of %s sites", len(batch))
+        result = frontier.rr.table("sites").insert(batch).run()
+    logging.info("job %s fully started", job.id)

     return job


 def new_seed_page(frontier, site):
     url = urlcanon.parse_url(site.seed)
     hashtag = (url.hash_sign + url.fragment).decode("utf-8")
     urlcanon.canon.remove_fragment(url)
-    page = brozzler.Page(frontier.rr, {
+    page = brozzler.Page(
+        frontier.rr,
+        {
             "url": str(url),
             "site_id": site.get("id"),
             "job_id": site.get("job_id"),
             "hops_from_seed": 0,
             "priority": 1000,
             "needs_robots_check": True,
-            "hop_path": None})
+            "hop_path": None,
+        },
+    )
     if hashtag:
-        page.hashtags = [hashtag,]
+        page.hashtags = [
+            hashtag,
+        ]
     return page


 def new_site(frontier, site):
     logging.info("new site %s", site)
     site.id = site.id or str(uuid.uuid4())
@@ -148,9 +164,10 @@ def new_site(frontier, site):
         # finally block because we want to insert the Site no matter what
         site.save()


 class ElapsedMixIn(object):
     def elapsed(self):
-        '''
+        """
         Returns elapsed crawl time as a float in seconds.

         This metric includes all the time that a site was in active rotation,
@@ -158,21 +175,22 @@ class ElapsedMixIn(object):

         In contrast `Site.active_brozzling_time` only counts time when a
         brozzler worker claimed the site and was actively brozzling it.
-        '''
+        """
         dt = 0
         for ss in self.starts_and_stops[:-1]:
-            if ss['stop']:
-                dt += (ss['stop'] - ss['start']).total_seconds()
+            if ss["stop"]:
+                dt += (ss["stop"] - ss["start"]).total_seconds()
             else:
                 self.logger.warning("missing expected ss['stop']")
-                dt += (doublethink.utcnow() - ss['start']).total_seconds()
+                dt += (doublethink.utcnow() - ss["start"]).total_seconds()
         ss = self.starts_and_stops[-1]
-        if ss['stop']:
-            dt += (ss['stop'] - ss['start']).total_seconds()
+        if ss["stop"]:
+            dt += (ss["stop"] - ss["start"]).total_seconds()
         else:  # crawl is active
-            dt += (doublethink.utcnow() - ss['start']).total_seconds()
+            dt += (doublethink.utcnow() - ss["start"]).total_seconds()
         return dt


 class Job(doublethink.Document, ElapsedMixIn):
     logger = logging.getLogger(__module__ + "." + __qualname__)
     table = "jobs"
@@ -182,28 +200,29 @@ class Job(doublethink.Document, ElapsedMixIn):
         self.status = "ACTIVE"
         if not "starts_and_stops" in self:
             if self.get("started"):  # backward compatibility
-                self.starts_and_stops = [{
-                    "start": self.get("started"),
-                    "stop": self.get("finished")}]
+                self.starts_and_stops = [
+                    {"start": self.get("started"), "stop": self.get("finished")}
+                ]
                 del self["started"]
                 if "finished" in self:
                     del self["finished"]
             else:
-                self.starts_and_stops = [
-                        {"start":doublethink.utcnow(),"stop":None}]
+                self.starts_and_stops = [{"start": doublethink.utcnow(), "stop": None}]

     def finish(self):
         if self.status == "FINISHED" or self.starts_and_stops[-1]["stop"]:
             self.logger.error(
-                "job is already finished status=%s "
-                "starts_and_stops[-1]['stop']=%s", self.status,
-                self.starts_and_stops[-1]["stop"])
+                "job is already finished status=%s " "starts_and_stops[-1]['stop']=%s",
+                self.status,
+                self.starts_and_stops[-1]["stop"],
+            )
         self.status = "FINISHED"
         self.starts_and_stops[-1]["stop"] = doublethink.utcnow()


 class Site(doublethink.Document, ElapsedMixIn):
     logger = logging.getLogger(__module__ + "." + __qualname__)
-    table = 'sites'
+    table = "sites"

     def populate_defaults(self):
         if not "status" in self:
@@ -225,26 +244,26 @@ class Site(doublethink.Document, ElapsedMixIn):
             del self.scope["surt"]

         # backward compatibility
-        if ("max_hops_off_surt" in self.scope
-                and not "max_hops_off" in self.scope):
+        if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope:
             self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
         if "max_hops_off_surt" in self.scope:
             del self.scope["max_hops_off_surt"]

         if self.seed:
             self._accept_ssurt_if_not_redundant(
-                brozzler.site_surt_canon(self.seed).ssurt().decode('ascii'))
+                brozzler.site_surt_canon(self.seed).ssurt().decode("ascii")
+            )

         if not "starts_and_stops" in self:
             if self.get("start_time"):  # backward compatibility
-                self.starts_and_stops = [{
-                    "start":self.get("start_time"),"stop":None}]
+                self.starts_and_stops = [
+                    {"start": self.get("start_time"), "stop": None}
+                ]
                 if self.get("status") != "ACTIVE":
                     self.starts_and_stops[0]["stop"] = self.last_disclaimed
                 del self["start_time"]
             else:
-                self.starts_and_stops = [
-                        {"start":doublethink.utcnow(),"stop":None}]
+                self.starts_and_stops = [{"start": doublethink.utcnow(), "stop": None}]

     def __str__(self):
         return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)
@@ -253,11 +272,12 @@ class Site(doublethink.Document, ElapsedMixIn):
         if not "accepts" in self.scope:
             self.scope["accepts"] = []
         simple_rule_ssurts = (
-            rule["ssurt"] for rule in self.scope["accepts"]
-            if set(rule.keys()) == {'ssurt'})
+            rule["ssurt"]
+            for rule in self.scope["accepts"]
+            if set(rule.keys()) == {"ssurt"}
+        )
         if not any(ssurt.startswith(ss) for ss in simple_rule_ssurts):
-            self.logger.info(
-                "adding ssurt %s to scope accept rules", ssurt)
+            self.logger.info("adding ssurt %s to scope accept rules", ssurt)
             self.scope["accepts"].append({"ssurt": ssurt})

     def note_seed_redirect(self, url):
@@ -266,14 +286,14 @@ class Site(doublethink.Document, ElapsedMixIn):

         # if http://foo.com/ redirects to https://foo.com/a/b/c let's also
         # put all of https://foo.com/ in scope
-        if (canon_seed_redirect.authority == canon_seed.authority
-                and canon_seed_redirect.scheme != canon_seed.scheme):
+        if (
+            canon_seed_redirect.authority == canon_seed.authority
+            and canon_seed_redirect.scheme != canon_seed.scheme
+        ):
             canon_seed.scheme = canon_seed_redirect.scheme
-            self._accept_ssurt_if_not_redundant(
-                    canon_seed.ssurt().decode('ascii'))
+            self._accept_ssurt_if_not_redundant(canon_seed.ssurt().decode("ascii"))

-        self._accept_ssurt_if_not_redundant(
-                canon_seed_redirect.ssurt().decode('ascii'))
+        self._accept_ssurt_if_not_redundant(canon_seed_redirect.ssurt().decode("ascii"))

     def extra_headers(self, page: Optional["Page"] = None):
         hdrs = {}
@@ -281,28 +301,34 @@ class Site(doublethink.Document, ElapsedMixIn):
             temp_warcprox_meta = copy.deepcopy(self.warcprox_meta)
             if "blocks" in self.warcprox_meta:
                 # delete temp_warcprox_meta's 'blocks' (they may be big!)
-                del temp_warcprox_meta['blocks']
+                del temp_warcprox_meta["blocks"]
                 # str-ify blocks
-                blocks_str = json.dumps(self.warcprox_meta['blocks'], separators=(',', ':'))
+                blocks_str = json.dumps(
+                    self.warcprox_meta["blocks"], separators=(",", ":")
+                )
                 # encode(), compress, b64encode, decode()
-                temp_warcprox_meta['compressed_blocks'] = base64.b64encode(zlib.compress(blocks_str.encode())).decode()
|
temp_warcprox_meta["compressed_blocks"] = base64.b64encode(
|
||||||
|
zlib.compress(blocks_str.encode())
|
||||||
|
).decode()
|
||||||
if page is not None:
|
if page is not None:
|
||||||
temp_warcprox_meta["metadata"]["hop_path"] = page.hop_path
|
temp_warcprox_meta["metadata"]["hop_path"] = page.hop_path
|
||||||
temp_warcprox_meta["metadata"]["brozzled_url"] = page.url
|
temp_warcprox_meta["metadata"]["brozzled_url"] = page.url
|
||||||
temp_warcprox_meta["metadata"]["hop_via_url"] = page.via_page_url
|
temp_warcprox_meta["metadata"]["hop_via_url"] = page.via_page_url
|
||||||
hdrs["Warcprox-Meta"] = json.dumps(temp_warcprox_meta, separators=(',', ':'))
|
hdrs["Warcprox-Meta"] = json.dumps(
|
||||||
|
temp_warcprox_meta, separators=(",", ":")
|
||||||
|
)
|
||||||
return hdrs
|
return hdrs
|
||||||
|
|
||||||
def accept_reject_or_neither(self, url, parent_page=None):
|
def accept_reject_or_neither(self, url, parent_page=None):
|
||||||
'''
|
"""
|
||||||
Returns `True` (accepted), `False` (rejected), or `None` (no decision).
|
Returns `True` (accepted), `False` (rejected), or `None` (no decision).
|
||||||
|
|
||||||
`None` usually means rejected, unless `max_hops_off` comes into play.
|
`None` usually means rejected, unless `max_hops_off` comes into play.
|
||||||
'''
|
"""
|
||||||
if not isinstance(url, urlcanon.ParsedUrl):
|
if not isinstance(url, urlcanon.ParsedUrl):
|
||||||
url = urlcanon.semantic(url)
|
url = urlcanon.semantic(url)
|
||||||
|
|
||||||
if not url.scheme in (b'http', b'https'):
|
if not url.scheme in (b"http", b"https"):
|
||||||
# XXX doesn't belong here maybe (where? worker ignores unknown
|
# XXX doesn't belong here maybe (where? worker ignores unknown
|
||||||
# schemes?)
|
# schemes?)
|
||||||
return False
|
return False
|
||||||
|
@ -311,12 +337,14 @@ class Site(doublethink.Document, ElapsedMixIn):
|
||||||
if parent_page:
|
if parent_page:
|
||||||
try_parent_urls.append(urlcanon.semantic(parent_page.url))
|
try_parent_urls.append(urlcanon.semantic(parent_page.url))
|
||||||
if parent_page.redirect_url:
|
if parent_page.redirect_url:
|
||||||
try_parent_urls.append(
|
try_parent_urls.append(urlcanon.semantic(parent_page.redirect_url))
|
||||||
urlcanon.semantic(parent_page.redirect_url))
|
|
||||||
|
|
||||||
# enforce max_hops
|
# enforce max_hops
|
||||||
if (parent_page and "max_hops" in self.scope
|
if (
|
||||||
and parent_page.hops_from_seed >= self.scope["max_hops"]):
|
parent_page
|
||||||
|
and "max_hops" in self.scope
|
||||||
|
and parent_page.hops_from_seed >= self.scope["max_hops"]
|
||||||
|
):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# enforce reject rules
|
# enforce reject rules
|
||||||
|
@ -345,6 +373,7 @@ class Site(doublethink.Document, ElapsedMixIn):
|
||||||
# no decision if we reach here
|
# no decision if we reach here
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
class Page(doublethink.Document):
|
class Page(doublethink.Document):
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||||
table = "pages"
|
table = "pages"
|
||||||
|
@ -398,4 +427,3 @@ class Page(doublethink.Document):
|
||||||
if self._canon_hurl is None:
|
if self._canon_hurl is None:
|
||||||
self._canon_hurl = urlcanon.semantic(self.url)
|
self._canon_hurl = urlcanon.semantic(self.url)
|
||||||
return str(self._canon_hurl)
|
return str(self._canon_hurl)
|
||||||
|
|
||||||
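Aside on the data model: the backward-compatibility shims in this file all converge on one shape for `starts_and_stops`, a list of `{"start": ..., "stop": ...}` dicts where `stop` is `None` while a run is still active. A minimal runnable sketch of that invariant; the timestamps, the fixed `now`, and the `elapsed` helper are illustrative assumptions, not brozzler code (the real classes get similar behavior from `ElapsedMixIn` and `doublethink.utcnow()`):

import datetime

# hypothetical job lifecycle: one finished run, one still active
starts_and_stops = [
    {"start": datetime.datetime(2024, 1, 1, 12, 0), "stop": datetime.datetime(2024, 1, 1, 13, 0)},
    {"start": datetime.datetime(2024, 1, 2, 9, 0), "stop": None},
]

def elapsed(runs):
    # sum run durations, counting an unfinished run up to "now"; this toy
    # helper only mirrors the ElapsedMixIn idea, it is not brozzler code
    now = datetime.datetime(2024, 1, 2, 9, 30)  # fixed "now" so the example is reproducible
    total = datetime.timedelta()
    for run in runs:
        total += (run["stop"] or now) - run["start"]
    return total.total_seconds()

print(elapsed(starts_and_stops))  # 3600.0 + 1800.0 = 5400.0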
223
brozzler/pywb.py
@@ -1,4 +1,4 @@
-'''
+"""
 brozzler/pywb.py - pywb customizations for brozzler including rethinkdb index,
 loading from warcs still being written to, canonicalization rules matching
 brozzler conventions, support for screenshot: and thumbnail: urls

@@ -16,10 +16,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-'''
+"""

 import sys
 import logging

 try:
     import pywb.apps.cli
     import pywb.cdx.cdxdomainspecific

@@ -32,7 +33,9 @@ except ImportError as e:
     logging.critical(
         '%s: %s\n\nYou might need to run "pip install '
         'brozzler[easy]".\nSee README.rst for more information.',
-        type(e).__name__, e)
+        type(e).__name__,
+        e,
+    )
     sys.exit(1)
 import doublethink
 import rethinkdb as rdb

@@ -43,6 +46,7 @@ import argparse

 r = rdb.RethinkDB()


 class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
     def __init__(self, servers, db, table):
         self.servers = servers

@@ -67,70 +71,78 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
             # XXX inefficient, it gets parsed later, figure out how to
             # short-circuit this step and create the CDXObject directly
             blob = {
-                'url': record['url'],
-                'status': str(record['response_code']),
-                'digest': record['sha1base32'],
-                'length': str(record.get('record_length', '-')),
-                'offset': str(record['offset']),
-                'filename': record['filename'],
+                "url": record["url"],
+                "status": str(record["response_code"]),
+                "digest": record["sha1base32"],
+                "length": str(record.get("record_length", "-")),
+                "offset": str(record["offset"]),
+                "filename": record["filename"],
             }
-            if record['warc_type'] != 'revisit':
-                blob['mime'] = record['content_type'] or '-'
+            if record["warc_type"] != "revisit":
+                blob["mime"] = record["content_type"] or "-"
             else:
-                blob['mime'] = 'warc/revisit'
+                blob["mime"] = "warc/revisit"
             # b'org,archive)/ 20160427215530 {"url": "https://archive.org/", "mime": "text/html", "status": "200", "digest": "VILUFXZD232SLUA6XROZQIMEVUPW6EIE", "length": "16001", "offset": "90144", "filename": "ARCHIVEIT-261-ONE_TIME-JOB209607-20160427215508135-00000.warc.gz"}'
-            cdx_line = '{} {:%Y%m%d%H%M%S} {}'.format(
-                    record['canon_surt'], record['timestamp'],
-                    json.dumps(blob))
-            yield cdx_line.encode('utf-8')
+            cdx_line = "{} {:%Y%m%d%H%M%S} {}".format(
+                record["canon_surt"], record["timestamp"], json.dumps(blob)
+            )
+            yield cdx_line.encode("utf-8")

     def _query_rethinkdb(self, cdx_query):
-        start_key = cdx_query.key.decode('utf-8')
-        end_key = cdx_query.end_key.decode('utf-8')
+        start_key = cdx_query.key.decode("utf-8")
+        end_key = cdx_query.end_key.decode("utf-8")
         reql = self.rr.table(self.table).between(
-                [start_key[:150], r.minval], [end_key[:150], r.maxval],
-                index='abbr_canon_surt_timestamp', right_bound='closed')
-        reql = reql.order_by(index='abbr_canon_surt_timestamp')
+            [start_key[:150], r.minval],
+            [end_key[:150], r.maxval],
+            index="abbr_canon_surt_timestamp",
+            right_bound="closed",
+        )
+        reql = reql.order_by(index="abbr_canon_surt_timestamp")
         # TODO support for POST, etc
         # http_method='WARCPROX_WRITE_RECORD' for screenshots, thumbnails
         reql = reql.filter(
-                lambda capture: r.expr(
-                    ['WARCPROX_WRITE_RECORD','GET']).contains(
-                        capture['http_method']))
+            lambda capture: r.expr(["WARCPROX_WRITE_RECORD", "GET"]).contains(
+                capture["http_method"]
+            )
+        )
         reql = reql.filter(
-                lambda capture: (capture['canon_surt'] >= start_key)
-                & (capture['canon_surt'] < end_key))
+            lambda capture: (capture["canon_surt"] >= start_key)
+            & (capture["canon_surt"] < end_key)
+        )
         if cdx_query.limit:
             reql = reql.limit(cdx_query.limit)
-        logging.debug('rethinkdb query: %s', reql)
+        logging.debug("rethinkdb query: %s", reql)
         results = reql.run()
         return results


 class TheGoodUrlCanonicalizer(object):
-    '''
+    """
     Replacement for pywb.utils.canonicalize.UrlCanonicalizer that produces
     surts with scheme and with trailing comma, and does not "massage"
     www.foo.org into foo.org.
-    '''
+    """

     def __init__(self, surt_ordered=True):
-        '''We are always surt ordered (surt_ordered param is ignored)'''
+        """We are always surt ordered (surt_ordered param is ignored)"""
         self.surt_ordered = True

     def __call__(self, url):
         try:
-            key = urlcanon.semantic(url).surt().decode('ascii')
+            key = urlcanon.semantic(url).surt().decode("ascii")
             # logging.debug('%s -> %s', url, key)
             return key
         except Exception as e:
             return url

     def replace_default_canonicalizer():
-        '''Replace parent class of CustomUrlCanonicalizer with this class.'''
+        """Replace parent class of CustomUrlCanonicalizer with this class."""
         pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer.__bases__ = (
-                TheGoodUrlCanonicalizer,)
+            TheGoodUrlCanonicalizer,
+        )

     def good_surts_from_default(default_surt):
-        '''
+        """
         Takes a standard surt without scheme and without trailing comma, and
         returns a list of "good" surts that together match the same set of
         urls. For example:

@@ -144,59 +156,64 @@ class TheGoodUrlCanonicalizer(object):
              'http://(com,example,www,)/path',
              'https://(com,example,www,)/path']

-        '''
-        if default_surt == '':
-            return ['']
+        """
+        if default_surt == "":
+            return [""]

-        parts = default_surt.split(')', 1)
+        parts = default_surt.split(")", 1)
         if len(parts) == 2:
             orig_host_part, path_part = parts
             good_surts = [
-                'http://(%s,)%s' % (orig_host_part, path_part),
-                'https://(%s,)%s' % (orig_host_part, path_part),
-                'http://(%s,www,)%s' % (orig_host_part, path_part),
-                'https://(%s,www,)%s' % (orig_host_part, path_part),
+                "http://(%s,)%s" % (orig_host_part, path_part),
+                "https://(%s,)%s" % (orig_host_part, path_part),
+                "http://(%s,www,)%s" % (orig_host_part, path_part),
+                "https://(%s,www,)%s" % (orig_host_part, path_part),
             ]
         else:  # no path part
             host_part = parts[0]
             good_surts = [
-                'http://(%s' % host_part,
-                'https://(%s' % host_part,
+                "http://(%s" % host_part,
+                "https://(%s" % host_part,
             ]
         return good_surts

     def monkey_patch_dsrules_init():
         orig_init = pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__

         def cdx_dsrule_init(self, url_prefix, rules):
             good_surts = []
-            url_prefixes = [url_prefix] if isinstance(
-                    url_prefix, str) else url_prefix
+            url_prefixes = [url_prefix] if isinstance(url_prefix, str) else url_prefix
             for bad_surt in url_prefixes:
                 good_surts.extend(
-                        TheGoodUrlCanonicalizer.good_surts_from_default(
-                            bad_surt))
-            if 'match' in rules and 'regex' in rules['match']:
-                rules['match']['regex'] = r'https?://\(' + rules['match']['regex']
+                    TheGoodUrlCanonicalizer.good_surts_from_default(bad_surt)
+                )
+            if "match" in rules and "regex" in rules["match"]:
+                rules["match"]["regex"] = r"https?://\(" + rules["match"]["regex"]
             orig_init(self, good_surts, rules)

         pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__ = cdx_dsrule_init

 def support_in_progress_warcs():
-    '''
+    """
     Monkey-patch pywb.warc.pathresolvers.PrefixResolver to include warcs still
     being written to (warcs having ".open" suffix). This way if a cdx entry
     references foo.warc.gz, pywb will try both foo.warc.gz and
     foo.warc.gz.open.
-    '''
+    """
     _orig_prefix_resolver_call = pywb.warc.pathresolvers.PrefixResolver.__call__

     def _prefix_resolver_call(self, filename, cdx=None):
         raw_results = _orig_prefix_resolver_call(self, filename, cdx)
         results = []
         for warc_path in raw_results:
             results.append(warc_path)
-            results.append('%s.open' % warc_path)
+            results.append("%s.open" % warc_path)
         return results

     pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call


 class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
     def __init__(self, orig_url):
         import re

@@ -211,14 +228,14 @@ class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
         pywb.rewrite.wburl.BaseWbUrl.__init__(self)

         if six.PY2 and isinstance(orig_url, six.text_type):
-            orig_url = orig_url.encode('utf-8')
+            orig_url = orig_url.encode("utf-8")
             orig_url = quote(orig_url)

         self._original_url = orig_url

         if not self._init_query(orig_url):
             if not self._init_replay(orig_url):
-                raise Exception('Invalid WbUrl: ', orig_url)
+                raise Exception("Invalid WbUrl: ", orig_url)

         new_uri = WbUrl.to_uri(self.url)

@@ -227,8 +244,11 @@ class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
         self.url = new_uri

         # begin brozzler changes
-        if (self.url.startswith('urn:') or self.url.startswith('screenshot:')
-                or self.url.startswith('thumbnail:')):
+        if (
+            self.url.startswith("urn:")
+            or self.url.startswith("screenshot:")
+            or self.url.startswith("thumbnail:")
+        ):
             return
         # end brozzler changes

@@ -253,27 +273,31 @@ class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
             self.url = self.DEFAULT_SCHEME + self.url
         else:
             inx += 2
-            if inx < len(self.url) and self.url[inx] != '/':
-                self.url = self.url[:inx] + '/' + self.url[inx:]
+            if inx < len(self.url) and self.url[inx] != "/":
+                self.url = self.url[:inx] + "/" + self.url[inx:]


 def _get_wburl_type(self):
     return SomeWbUrl


 def monkey_patch_wburl():
     pywb.framework.basehandlers.WbUrlHandler.get_wburl_type = _get_wburl_type


 class BrozzlerWaybackCli(pywb.apps.cli.WaybackCli):
     def _extend_parser(self, arg_parser):
         super()._extend_parser(arg_parser)
         arg_parser._actions[4].help = argparse.SUPPRESS  # --autoindex
         arg_parser.formatter_class = argparse.RawDescriptionHelpFormatter
-        arg_parser.epilog = '''
+        arg_parser.epilog = """
 Run pywb like so:

     $ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback

 See README.rst for more information.
-'''
+"""


 # copied and pasted from cdxdomainspecific.py, only changes are commented as
 # such below

@@ -284,7 +308,7 @@ def _fuzzy_query_call(self, query):

     matched_rule = None

-    urlkey = to_native_str(query.key, 'utf-8')
+    urlkey = to_native_str(query.key, "utf-8")
     url = query.url
     filter_ = query.filters
     output = query.output

@@ -306,7 +330,7 @@ def _fuzzy_query_call(self, query):
     if not matched_rule:
         return None

-    repl = '?'
+    repl = "?"
     if matched_rule.replace:
         repl = matched_rule.replace

@@ -315,33 +339,33 @@ def _fuzzy_query_call(self, query):
         url = url[: inx + len(repl)]

     # begin brozzler changes
-    if matched_rule.match_type == 'domain':
+    if matched_rule.match_type == "domain":
         orig_split_url = urlsplit(url)
         # remove the subdomain, path, query and fragment
-        host = orig_split_url.netloc.split('.', 1)[1]
-        new_split_url = (orig_split_url.scheme, host, '', '', '')
+        host = orig_split_url.netloc.split(".", 1)[1]
+        new_split_url = (orig_split_url.scheme, host, "", "", "")
         url = urlunsplit(new_split_url)
     # end brozzler changes

     params = query.params
-    params.update({'url': url,
-                   'matchType': matched_rule.match_type,
-                   'filter': filter_})
+    params.update({"url": url, "matchType": matched_rule.match_type, "filter": filter_})

-    if 'reverse' in params:
-        del params['reverse']
+    if "reverse" in params:
+        del params["reverse"]

-    if 'closest' in params:
-        del params['closest']
+    if "closest" in params:
+        del params["closest"]

-    if 'end_key' in params:
-        del params['end_key']
+    if "end_key" in params:
+        del params["end_key"]

     return params


 def monkey_patch_fuzzy_query():
     pywb.cdx.cdxdomainspecific.FuzzyQuery.__call__ = _fuzzy_query_call


 # copied and pasted from pywb/utils/canonicalize.py, only changes are commented
 # as such
 def _calc_search_range(url, match_type, surt_ordered=True, url_canon=None):

@@ -361,54 +385,56 @@ def _calc_search_range(url, match_type, surt_ordered=True, url_canon=None):

     start_key = url_canon(url)

-    if match_type == 'exact':
-        end_key = start_key + '!'
+    if match_type == "exact":
+        end_key = start_key + "!"

-    elif match_type == 'prefix':
+    elif match_type == "prefix":
         # add trailing slash if url has it
-        if url.endswith('/') and not start_key.endswith('/'):
-            start_key += '/'
+        if url.endswith("/") and not start_key.endswith("/"):
+            start_key += "/"

         end_key = inc_last_char(start_key)

-    elif match_type == 'host':
+    elif match_type == "host":
         if surt_ordered:
-            host = start_key.split(')/')[0]
+            host = start_key.split(")/")[0]

-            start_key = host + ')/'
-            end_key = host + '*'
+            start_key = host + ")/"
+            end_key = host + "*"
         else:
             host = urlparse.urlsplit(url).netloc

-            start_key = host + '/'
-            end_key = host + '0'
+            start_key = host + "/"
+            end_key = host + "0"

-    elif match_type == 'domain':
+    elif match_type == "domain":
         if not surt_ordered:
-            msg = 'matchType=domain unsupported for non-surt'
+            msg = "matchType=domain unsupported for non-surt"
             raise UrlCanonicalizeException(msg)

-        host = start_key.split(')/')[0]
+        host = start_key.split(")/")[0]

         # if tld, use com, as start_key
         # otherwise, stick with com,example)/
-        if ',' not in host:
-            start_key = host + ','
+        if "," not in host:
+            start_key = host + ","
         else:
-            start_key = host + ')/'
+            start_key = host + ")/"

         # begin brozzler changes
-        end_key = host + '~'
+        end_key = host + "~"
         # end brozzler changes
     else:
-        raise UrlCanonicalizeException('Invalid match_type: ' + match_type)
+        raise UrlCanonicalizeException("Invalid match_type: " + match_type)

     return (start_key, end_key)


 def monkey_patch_calc_search_range():
     pywb.utils.canonicalize.calc_search_range = _calc_search_range
     pywb.cdx.query.calc_search_range = _calc_search_range


 def main(argv=sys.argv):
     brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
     brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init()

@@ -417,7 +443,10 @@ def main(argv=sys.argv):
     brozzler.pywb.monkey_patch_fuzzy_query()
     brozzler.pywb.monkey_patch_calc_search_range()
     wayback_cli = BrozzlerWaybackCli(
-            args=argv[1:], default_port=8880,
-            desc=('brozzler-wayback - pywb wayback (monkey-patched for use '
-                  'with brozzler)'))
+        args=argv[1:],
+        default_port=8880,
+        desc=(
+            "brozzler-wayback - pywb wayback (monkey-patched for use " "with brozzler)"
+        ),
+    )
     wayback_cli.run()
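Aside: the effect of `support_in_progress_warcs` is easiest to see in isolation. A toy stand-in (not pywb's actual resolver class; the path is made up) that mirrors the patched loop:

# toy stand-in for the patched resolver behavior; the real patch wraps
# pywb.warc.pathresolvers.PrefixResolver.__call__
def with_open_variants(paths):
    results = []
    for warc_path in paths:
        results.append(warc_path)
        results.append("%s.open" % warc_path)  # also try the still-being-written file
    return results

print(with_open_variants(["/warcs/foo.warc.gz"]))
# ['/warcs/foo.warc.gz', '/warcs/foo.warc.gz.open']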
brozzler/robots.py

@@ -1,4 +1,4 @@
-'''
+"""
 brozzler/robots.py - robots.txt support

 Uses the reppy library version 0.3.4. Monkey-patches reppy to support substring

@@ -20,7 +20,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-'''
+"""

 import json
 import logging

@@ -34,30 +34,40 @@ __all__ = ["is_permitted_by_robots"]

 # monkey-patch reppy to do substring user-agent matching, see top of file
 reppy.Utility.short_user_agent = lambda strng: strng


 def _reppy_rules_getitem(self, agent):
-    '''
+    """
     Find the user-agent token matching the supplied full user-agent, using
     a case-insensitive substring search.
-    '''
+    """
     lc_agent = agent.lower()
     for s in self.agents:
         if s in lc_agent:
             return self.agents[s]
-    return self.agents.get('*')
+    return self.agents.get("*")


 reppy.parser.Rules.__getitem__ = _reppy_rules_getitem


 class _SessionRaiseOn420(requests.Session):
     timeout = 60

     def get(self, url, *args, **kwargs):
         res = super().get(url, timeout=self.timeout, *args, **kwargs)
-        if res.status_code == 420 and 'warcprox-meta' in res.headers:
+        if res.status_code == 420 and "warcprox-meta" in res.headers:
             raise brozzler.ReachedLimit(
-                    warcprox_meta=json.loads(res.headers['warcprox-meta']),
-                    http_payload=res.text)
+                warcprox_meta=json.loads(res.headers["warcprox-meta"]),
+                http_payload=res.text,
+            )
         else:
             return res


 _robots_caches = {}  # {site_id:reppy.cache.RobotsCache}


 def _robots_cache(site, proxy=None):
     if not site.id in _robots_caches:
         req_sesh = _SessionRaiseOn420()

@@ -68,14 +78,16 @@ def _robots_cache(site, proxy=None):
         if site.extra_headers():
             req_sesh.headers.update(site.extra_headers())
         if site.user_agent:
-            req_sesh.headers['User-Agent'] = site.user_agent
+            req_sesh.headers["User-Agent"] = site.user_agent
         _robots_caches[site.id] = reppy.cache.RobotsCache(
-                session=req_sesh, disallow_forbidden=False)
+            session=req_sesh, disallow_forbidden=False
+        )

     return _robots_caches[site.id]


 def is_permitted_by_robots(site, url, proxy=None):
-    '''
+    """
     Checks if `url` is permitted by robots.txt.

     Treats any kind of error fetching robots.txt as "allow all". See

@@ -89,25 +101,28 @@ def is_permitted_by_robots(site, url, proxy=None):
     Raises:
         brozzler.ReachedLimit: if warcprox responded with 420 Reached Limit
         requests.exceptions.ProxyError: if the proxy is down
-    '''
+    """
     if site.ignore_robots:
         return True

     try:
-        result = _robots_cache(site, proxy).allowed(
-                url, site.user_agent or "brozzler")
+        result = _robots_cache(site, proxy).allowed(url, site.user_agent or "brozzler")
         return result
     except Exception as e:
         if isinstance(e, reppy.exceptions.ServerError) and isinstance(
-                e.args[0], brozzler.ReachedLimit):
+            e.args[0], brozzler.ReachedLimit
+        ):
             raise e.args[0]
-        elif hasattr(e, 'args') and isinstance(
-                e.args[0], requests.exceptions.ProxyError):
+        elif hasattr(e, "args") and isinstance(
+            e.args[0], requests.exceptions.ProxyError
+        ):
             # reppy has wrapped an exception that we want to bubble up
             raise brozzler.ProxyError(e)
         else:
             logging.warning(
-                    "returning true (permitted) after problem fetching "
-                    "robots.txt for %r: %r", url, e)
+                "returning true (permitted) after problem fetching "
+                "robots.txt for %r: %r",
+                url,
+                e,
+            )
             return True
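Aside: the `_reppy_rules_getitem` patch makes user-agent lookup a case-insensitive substring match. A self-contained sketch of that matching rule, where a plain dict stands in for reppy's parsed rules (an assumption for illustration):

def match_agent(agents, full_user_agent):
    # mirrors the patched __getitem__: any token that is a substring of the
    # lowercased full UA wins, else fall back to the '*' rules
    lc_agent = full_user_agent.lower()
    for token in agents:
        if token in lc_agent:
            return agents[token]
    return agents.get("*")

rules = {"brozzler": "brozzler-specific rules", "*": "default rules"}
print(match_agent(rules, "Mozilla/5.0 ... Brozzler/1.0"))  # brozzler-specific rules
print(match_agent(rules, "Googlebot/2.1"))                 # default rules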
brozzler/worker.py

@@ -1,4 +1,4 @@
-'''
+"""
 brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning
 it runs yt-dlp on them, browses them and runs behaviors if appropriate,
 scopes and adds outlinks to the frontier

@@ -16,7 +16,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-'''
+"""

 import logging
 import brozzler

@@ -39,6 +39,7 @@ from . import ydl

 r = rdb.RethinkDB()


 class BrozzlerWorker:
     logger = logging.getLogger(__module__ + "." + __qualname__)

@@ -50,13 +51,26 @@ class BrozzlerWorker:
     SITE_SESSION_MINUTES = 15

     def __init__(
-            self, frontier, service_registry=None, max_browsers=1,
-            chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
-            skip_extract_outlinks=False, skip_visit_hashtags=False,
-            skip_youtube_dl=False, simpler404=False, screenshot_full_page=False,
-            page_timeout=300, behavior_timeout=900, extract_outlinks_timeout=60,
-            download_throughput=-1, stealth=False,
-            window_height=900, window_width=1400):
+        self,
+        frontier,
+        service_registry=None,
+        max_browsers=1,
+        chrome_exe="chromium-browser",
+        warcprox_auto=False,
+        proxy=None,
+        skip_extract_outlinks=False,
+        skip_visit_hashtags=False,
+        skip_youtube_dl=False,
+        simpler404=False,
+        screenshot_full_page=False,
+        page_timeout=300,
+        behavior_timeout=900,
+        extract_outlinks_timeout=60,
+        download_throughput=-1,
+        stealth=False,
+        window_height=900,
+        window_width=1400,
+    ):
         self._frontier = frontier
         self._service_registry = service_registry
         self._max_browsers = max_browsers

@@ -79,7 +93,8 @@ class BrozzlerWorker:
         self._stealth = stealth

         self._browser_pool = brozzler.browser.BrowserPool(
-                max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True)
+            max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True
+        )
         self._browsing_threads = set()
         self._browsing_threads_lock = threading.Lock()

@@ -88,13 +103,20 @@ class BrozzlerWorker:
         self._shutdown = threading.Event()

     def _choose_warcprox(self):
-        warcproxes = self._service_registry.available_services('warcprox')
+        warcproxes = self._service_registry.available_services("warcprox")
         if not warcproxes:
             return None
         # .group('proxy').count() makes this query about 99% more efficient
-        reql = self._frontier.rr.table('sites').between(
-                ['ACTIVE', r.minval], ['ACTIVE', r.maxval],
-                index='sites_last_disclaimed').group('proxy').count()
+        reql = (
+            self._frontier.rr.table("sites")
+            .between(
+                ["ACTIVE", r.minval],
+                ["ACTIVE", r.maxval],
+                index="sites_last_disclaimed",
+            )
+            .group("proxy")
+            .count()
+        )
         # returns results like
         # {
         #   "wbgrp-svc030.us.archive.org:8000": 148,

@@ -102,10 +124,11 @@ class BrozzlerWorker:
         # }
         proxy_scoreboard = dict(reql.run())
         for warcprox in warcproxes:
-            address = '%s:%s' % (warcprox['host'], warcprox['port'])
-            warcprox['assigned_sites'] = proxy_scoreboard.get(address, 0)
-        warcproxes.sort(key=lambda warcprox: (
-                warcprox['assigned_sites'], warcprox['load']))
+            address = "%s:%s" % (warcprox["host"], warcprox["port"])
+            warcprox["assigned_sites"] = proxy_scoreboard.get(address, 0)
+        warcproxes.sort(
+            key=lambda warcprox: (warcprox["assigned_sites"], warcprox["load"])
+        )
         # XXX make this heuristic more advanced?
         return warcproxes[0]
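Aside: given the scoreboard shape shown in the comment above, warcprox selection reduces to a sort on `(assigned_sites, load)`. A runnable sketch with made-up registry entries (hostnames echo the comment; ports and loads are hypothetical):

# made-up service registry entries, shaped the way the code above expects
warcproxes = [
    {"host": "wbgrp-svc030.us.archive.org", "port": 8000, "load": 0.7},
    {"host": "wbgrp-svc031.us.archive.org", "port": 8000, "load": 0.1},
]
proxy_scoreboard = {"wbgrp-svc030.us.archive.org:8000": 148}

for warcprox in warcproxes:
    address = "%s:%s" % (warcprox["host"], warcprox["port"])
    warcprox["assigned_sites"] = proxy_scoreboard.get(address, 0)
warcproxes.sort(key=lambda warcprox: (warcprox["assigned_sites"], warcprox["load"]))
print(warcproxes[0]["host"])  # wbgrp-svc031...: fewest assigned sites wins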
@@ -118,13 +141,15 @@ class BrozzlerWorker:
             svc = self._choose_warcprox()
             if svc is None:
                 raise brozzler.ProxyError(
-                        'no available instances of warcprox in the service '
-                        'registry')
-            site.proxy = '%s:%s' % (svc['host'], svc['port'])
+                    "no available instances of warcprox in the service " "registry"
+                )
+            site.proxy = "%s:%s" % (svc["host"], svc["port"])
             site.save()
             self.logger.info(
-                    'chose warcprox instance %r from service registry for %r',
-                    site.proxy, site)
+                "chose warcprox instance %r from service registry for %r",
+                site.proxy,
+                site,
+            )
             return site.proxy
         return None

@@ -132,14 +157,16 @@ class BrozzlerWorker:
         if self._proxy:
             if self._proxy_is_warcprox is None:
                 try:
-                    response = requests.get('http://%s/status' % self._proxy)
+                    response = requests.get("http://%s/status" % self._proxy)
                     status = json.loads(response.text)
-                    self._proxy_is_warcprox = (status['role'] == 'warcprox')
+                    self._proxy_is_warcprox = status["role"] == "warcprox"
                 except Exception as e:
                     self._proxy_is_warcprox = False
                 logging.info(
-                        '%s %s warcprox', self._proxy,
-                        'IS' if self._proxy_is_warcprox else 'IS NOT')
+                    "%s %s warcprox",
+                    self._proxy,
+                    "IS" if self._proxy_is_warcprox else "IS NOT",
+                )
             return self._proxy_is_warcprox
         else:
             # I should have commented when I originally wrote this code, but I

@@ -148,13 +175,20 @@ class BrozzlerWorker:
         return bool(site.proxy or self._warcprox_auto)

     def _warcprox_write_record(
-            self, warcprox_address, url, warc_type, content_type,
-            payload, extra_headers=None):
+        self,
+        warcprox_address,
+        url,
+        warc_type,
+        content_type,
+        payload,
+        extra_headers=None,
+    ):
         headers = {"Content-Type": content_type, "WARC-Type": warc_type, "Host": "N/A"}
         if extra_headers:
             headers.update(extra_headers)
-        request = urllib.request.Request(url, method="WARCPROX_WRITE_RECORD",
-                headers=headers, data=payload)
+        request = urllib.request.Request(
+            url, method="WARCPROX_WRITE_RECORD", headers=headers, data=payload
+        )

         # XXX setting request.type="http" is a hack to stop urllib from trying
         # to tunnel if url is https
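Aside: the custom-verb request built above is plain `urllib`. A minimal sketch of constructing one outside the worker; the warcprox address, URL, and payload are hypothetical, and the request is deliberately not sent:

import urllib.request

payload = b"\xff\xd8\xff"  # hypothetical JPEG bytes
headers = {"Content-Type": "image/jpeg", "WARC-Type": "resource", "Host": "N/A"}
request = urllib.request.Request(
    "screenshot:http://example.com/",
    method="WARCPROX_WRITE_RECORD",
    headers=headers,
    data=payload,
)
# per the comment above, force plain-http proxying rather than CONNECT tunneling
request.type = "http"
request.set_proxy("localhost:8000", "http")  # hypothetical warcprox address
# urllib.request.urlopen(request) would then deliver the record to warcprox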
@@ -166,25 +200,30 @@ class BrozzlerWorker:
             if response.getcode() != 204:
                 self.logger.warning(
                     'got "%s %s" response on warcprox '
-                    'WARCPROX_WRITE_RECORD request (expected 204)',
-                    response.getcode(), response.reason)
+                    "WARCPROX_WRITE_RECORD request (expected 204)",
+                    response.getcode(),
+                    response.reason,
+                )
             return request, response
         except urllib.error.HTTPError as e:
             self.logger.warning(
                 'got "%s %s" response on warcprox '
-                'WARCPROX_WRITE_RECORD request (expected 204)',
-                e.getcode(), e.info())
+                "WARCPROX_WRITE_RECORD request (expected 204)",
+                e.getcode(),
+                e.info(),
+            )
             return request, None
         except urllib.error.URLError as e:
             raise brozzler.ProxyError(
-                    'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
+                "proxy error on WARCPROX_WRITE_RECORD %s" % url
+            ) from e
         except ConnectionError as e:
             raise brozzler.ProxyError(
-                    'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
+                "proxy error on WARCPROX_WRITE_RECORD %s" % url
+            ) from e

     def thumb_jpeg(self, full_jpeg):
-        """Create JPEG thumbnail.
-        """
+        """Create JPEG thumbnail."""
         img = PIL.Image.open(io.BytesIO(full_jpeg))
         thumb_width = 300
         thumb_height = (thumb_width / img.size[0]) * img.size[1]

@@ -193,8 +232,15 @@ class BrozzlerWorker:
         img.save(out, "jpeg", quality=95)
         return out.getbuffer()

-    def brozzle_page(self, browser, site, page, on_screenshot=None,
-            on_request=None, enable_youtube_dl=True):
+    def brozzle_page(
+        self,
+        browser,
+        site,
+        page,
+        on_screenshot=None,
+        on_request=None,
+        enable_youtube_dl=True,
+    ):
         self.logger.info("brozzling {}".format(page))
         ydl_fetches = None
         outlinks = set()

@@ -208,31 +254,38 @@ class BrozzlerWorker:
             except brozzler.ProxyError:
                 raise
             except Exception as e:
-                if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
-                        and hasattr(e.exc_info[1], 'code')
-                        and e.exc_info[1].code == 430):
+                if (
+                    hasattr(e, "exc_info")
+                    and len(e.exc_info) >= 2
+                    and hasattr(e.exc_info[1], "code")
+                    and e.exc_info[1].code == 430
+                ):
                     self.logger.info(
-                        'youtube-dl got %s %s processing %s',
-                        e.exc_info[1].code, e.exc_info[1].msg, page.url)
+                        "youtube-dl got %s %s processing %s",
+                        e.exc_info[1].code,
+                        e.exc_info[1].msg,
+                        page.url,
+                    )
                 else:
                     self.logger.error(
-                        'youtube_dl raised exception on %s', page,
-                        exc_info=True)
+                        "youtube_dl raised exception on %s", page, exc_info=True
+                    )

         if self._needs_browsing(page, ydl_fetches):
-            self.logger.info('needs browsing: %s', page)
+            self.logger.info("needs browsing: %s", page)
             try:
                 browser_outlinks = self._browse_page(
-                        browser, site, page, on_screenshot, on_request)
+                    browser, site, page, on_screenshot, on_request
+                )
                 outlinks.update(browser_outlinks)
             except brozzler.PageInterstitialShown:
-                self.logger.info('page interstitial shown (http auth): %s', page)
+                self.logger.info("page interstitial shown (http auth): %s", page)
         else:
             if not self._already_fetched(page, ydl_fetches):
-                self.logger.info('needs fetch: %s', page)
+                self.logger.info("needs fetch: %s", page)
                 self._fetch_url(site, page=page)
             else:
-                self.logger.info('already fetched: %s', page)
+                self.logger.info("already fetched: %s", page)

         return outlinks

@@ -243,71 +296,88 @@ class BrozzlerWorker:
             if self._using_warcprox(site):
                 self.logger.info(
                     "sending WARCPROX_WRITE_RECORD request to %s with "
-                    "screenshot for %s", self._proxy_for(site), page)
+                    "screenshot for %s",
+                    self._proxy_for(site),
+                    page,
+                )
                 thumbnail_jpeg = self.thumb_jpeg(screenshot_jpeg)
                 self._warcprox_write_record(
                     warcprox_address=self._proxy_for(site),
                     url="screenshot:%s" % str(urlcanon.semantic(page.url)),
-                    warc_type="resource", content_type="image/jpeg",
+                    warc_type="resource",
+                    content_type="image/jpeg",
                     payload=screenshot_jpeg,
-                    extra_headers=site.extra_headers(page))
+                    extra_headers=site.extra_headers(page),
+                )
                 self._warcprox_write_record(
                     warcprox_address=self._proxy_for(site),
                     url="thumbnail:%s" % str(urlcanon.semantic(page.url)),
-                    warc_type="resource", content_type="image/jpeg",
+                    warc_type="resource",
+                    content_type="image/jpeg",
                     payload=thumbnail_jpeg,
-                    extra_headers=site.extra_headers(page))
+                    extra_headers=site.extra_headers(page),
+                )

         def _on_response(chrome_msg):
-            if ('params' in chrome_msg
-                    and 'response' in chrome_msg['params']
-                    and 'mimeType' in chrome_msg['params']['response']
-                    and chrome_msg['params']['response'].get('mimeType', '').startswith('video/')
+            if (
+                "params" in chrome_msg
+                and "response" in chrome_msg["params"]
+                and "mimeType" in chrome_msg["params"]["response"]
+                and chrome_msg["params"]["response"]
+                .get("mimeType", "")
+                .startswith("video/")
                 # skip manifests of DASH segmented video -
                 # see https://github.com/internetarchive/brozzler/pull/70
-                    and chrome_msg['params']['response']['mimeType'] != 'video/vnd.mpeg.dash.mpd'
-                    and chrome_msg['params']['response'].get('status') in (200, 206)):
+                and chrome_msg["params"]["response"]["mimeType"]
+                != "video/vnd.mpeg.dash.mpd"
+                and chrome_msg["params"]["response"].get("status") in (200, 206)
+            ):
                 video = {
-                    'blame': 'browser',
-                    'url': chrome_msg['params']['response'].get('url'),
-                    'response_code': chrome_msg['params']['response']['status'],
-                    'content-type': chrome_msg['params']['response']['mimeType'],
+                    "blame": "browser",
+                    "url": chrome_msg["params"]["response"].get("url"),
+                    "response_code": chrome_msg["params"]["response"]["status"],
+                    "content-type": chrome_msg["params"]["response"]["mimeType"],
                 }
                 response_headers = CaseInsensitiveDict(
-                        chrome_msg['params']['response']['headers'])
-                if 'content-length' in response_headers:
-                    video['content-length'] = int(response_headers['content-length'])
-                if 'content-range' in response_headers:
-                    video['content-range'] = response_headers['content-range']
-                logging.debug('embedded video %s', video)
-                if not 'videos' in page:
+                    chrome_msg["params"]["response"]["headers"]
+                )
+                if "content-length" in response_headers:
+                    video["content-length"] = int(response_headers["content-length"])
+                if "content-range" in response_headers:
+                    video["content-range"] = response_headers["content-range"]
+                logging.debug("embedded video %s", video)
+                if not "videos" in page:
                     page.videos = []
                 page.videos.append(video)

         sw_fetched = set()

         def _on_service_worker_version_updated(chrome_msg):
             # https://github.com/internetarchive/brozzler/issues/140
-            self.logger.trace('%r', chrome_msg)
-            if chrome_msg.get('params', {}).get('versions'):
-                url = chrome_msg.get('params', {}).get('versions')[0]\
-                        .get('scriptURL')
+            self.logger.trace("%r", chrome_msg)
+            if chrome_msg.get("params", {}).get("versions"):
+                url = chrome_msg.get("params", {}).get("versions")[0].get("scriptURL")
                 if url and url not in sw_fetched:
-                    self.logger.info('fetching service worker script %s', url)
+                    self.logger.info("fetching service worker script %s", url)
                     self._fetch_url(site, url=url)
                     sw_fetched.add(url)

         if not browser.is_running():
             browser.start(
                 proxy=self._proxy_for(site),
-                cookie_db=site.get('cookie_db'),
+                cookie_db=site.get("cookie_db"),
                 window_height=self._window_height,
-                window_width=self._window_width)
+                window_width=self._window_width,
+            )
         final_page_url, outlinks = browser.browse_page(
-            page.url, extra_headers=site.extra_headers(page),
-            behavior_parameters=site.get('behavior_parameters'),
-            username=site.get('username'), password=site.get('password'),
-            user_agent=site.get('user_agent'),
-            on_screenshot=_on_screenshot, on_response=_on_response,
+            page.url,
+            extra_headers=site.extra_headers(page),
+            behavior_parameters=site.get("behavior_parameters"),
+            username=site.get("username"),
+            password=site.get("password"),
+            user_agent=site.get("user_agent"),
+            on_screenshot=_on_screenshot,
+            on_response=_on_response,
             on_request=on_request,
             on_service_worker_version_updated=_on_service_worker_version_updated,
             hashtags=page.hashtags,
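Aside: the guard in `_on_response` is a predicate over Chrome DevTools `Network.responseReceived` messages. A compressed standalone rephrasing, with the presence checks folded into `.get()` defaults and a fabricated sample message for illustration:

def looks_like_video(chrome_msg):
    # same conditions as the guard above, expressed as one predicate
    response = chrome_msg.get("params", {}).get("response", {})
    mime = response.get("mimeType", "")
    return (
        mime.startswith("video/")
        # skip manifests of DASH segmented video
        and mime != "video/vnd.mpeg.dash.mpd"
        and response.get("status") in (200, 206)
    )

msg = {  # fabricated DevTools message for illustration
    "params": {"response": {"mimeType": "video/mp4", "status": 200, "url": "http://example.com/a.mp4"}}
}
print(looks_like_video(msg))  # True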
@@ -320,7 +390,8 @@ class BrozzlerWorker:
             behavior_timeout=self._behavior_timeout,
             extract_outlinks_timeout=self._extract_outlinks_timeout,
             download_throughput=self._download_throughput,
-            stealth=self._stealth)
+            stealth=self._stealth,
+        )
         if final_page_url != page.url:
             page.note_redirect(final_page_url)
         return outlinks

@@ -331,19 +402,18 @@ class BrozzlerWorker:
         url = page.url
         if self._proxy_for(site):
             proxies = {
-                'http': 'http://%s' % self._proxy_for(site),
-                'https': 'http://%s' % self._proxy_for(site),
+                "http": "http://%s" % self._proxy_for(site),
+                "https": "http://%s" % self._proxy_for(site),
             }

-        self.logger.info('fetching %s', url)
+        self.logger.info("fetching %s", url)
         try:
             # response is ignored
             requests.get(
-                    url, proxies=proxies, headers=site.extra_headers(page),
-                    verify=False)
+                url, proxies=proxies, headers=site.extra_headers(page), verify=False
+            )
         except requests.exceptions.ProxyError as e:
-            raise brozzler.ProxyError(
-                    'proxy error fetching %s' % url) from e
+            raise brozzler.ProxyError("proxy error fetching %s" % url) from e

     def _needs_browsing(self, page, ydl_fetches):
         if ydl_fetches:

@@ -351,8 +421,10 @@ class BrozzlerWorker:
             if not final_bounces:
                 return True
             for txn in final_bounces:
-                if txn['response_headers'].get_content_type() in [
-                        'text/html', 'application/xhtml+xml']:
+                if txn["response_headers"].get_content_type() in [
+                    "text/html",
+                    "application/xhtml+xml",
+                ]:
                     return True
             return False
         else:

@@ -361,14 +433,13 @@ class BrozzlerWorker:
     def _already_fetched(self, page, ydl_fetches):
         if ydl_fetches:
             for fetch in ydl.final_bounces(ydl_fetches, page.url):
-                if (fetch['method'] == 'GET' and fetch['response_code'] == 200):
+                if fetch["method"] == "GET" and fetch["response_code"] == 200:
                     return True
             return False

     def brozzle_site(self, browser, site):
         try:
-            site.last_claimed_by = '%s:%s' % (
-                    socket.gethostname(), browser.chrome.port)
+            site.last_claimed_by = "%s:%s" % (socket.gethostname(), browser.chrome.port)
             site.save()
             start = time.time()
             page = None

@@ -377,28 +448,28 @@ class BrozzlerWorker:
             # _proxy_for() call in log statement can raise brozzler.ProxyError
            # which is why we honor time limit and stop request first☝🏻
             self.logger.info(
-                "brozzling site (proxy=%r) %s",
-                self._proxy_for(site), site)
+                "brozzling site (proxy=%r) %s", self._proxy_for(site), site
+            )
             while time.time() - start < self.SITE_SESSION_MINUTES * 60:
|
while time.time() - start < self.SITE_SESSION_MINUTES * 60:
|
||||||
site.refresh()
|
site.refresh()
|
||||||
self._frontier.enforce_time_limit(site)
|
self._frontier.enforce_time_limit(site)
|
||||||
self._frontier.honor_stop_request(site)
|
self._frontier.honor_stop_request(site)
|
||||||
page = self._frontier.claim_page(site, "%s:%s" % (
|
page = self._frontier.claim_page(
|
||||||
socket.gethostname(), browser.chrome.port))
|
site, "%s:%s" % (socket.gethostname(), browser.chrome.port)
|
||||||
|
)
|
||||||
|
|
||||||
if (page.needs_robots_check and
|
if page.needs_robots_check and not brozzler.is_permitted_by_robots(
|
||||||
not brozzler.is_permitted_by_robots(
|
site, page.url, self._proxy_for(site)
|
||||||
site, page.url, self._proxy_for(site))):
|
):
|
||||||
logging.warning("page %s is blocked by robots.txt", page.url)
|
logging.warning("page %s is blocked by robots.txt", page.url)
|
||||||
page.blocked_by_robots = True
|
page.blocked_by_robots = True
|
||||||
self._frontier.completed_page(site, page)
|
self._frontier.completed_page(site, page)
|
||||||
else:
|
else:
|
||||||
outlinks = self.brozzle_page(
|
outlinks = self.brozzle_page(
|
||||||
browser, site, page,
|
browser, site, page, enable_youtube_dl=not self._skip_youtube_dl
|
||||||
enable_youtube_dl=not self._skip_youtube_dl)
|
)
|
||||||
self._frontier.completed_page(site, page)
|
self._frontier.completed_page(site, page)
|
||||||
self._frontier.scope_and_schedule_outlinks(
|
self._frontier.scope_and_schedule_outlinks(site, page, outlinks)
|
||||||
site, page, outlinks)
|
|
||||||
if browser.is_running():
|
if browser.is_running():
|
||||||
site.cookie_db = browser.chrome.persist_and_read_cookie_db()
|
site.cookie_db = browser.chrome.persist_and_read_cookie_db()
|
||||||
|
|
||||||
|
@ -418,31 +489,36 @@ class BrozzlerWorker:
|
||||||
except brozzler.ProxyError as e:
|
except brozzler.ProxyError as e:
|
||||||
if self._warcprox_auto:
|
if self._warcprox_auto:
|
||||||
logging.error(
|
logging.error(
|
||||||
'proxy error (site.proxy=%s), will try to choose a '
|
"proxy error (site.proxy=%s), will try to choose a "
|
||||||
'healthy instance next time site is brozzled: %s',
|
"healthy instance next time site is brozzled: %s",
|
||||||
site.proxy, e)
|
site.proxy,
|
||||||
|
e,
|
||||||
|
)
|
||||||
site.proxy = None
|
site.proxy = None
|
||||||
else:
|
else:
|
||||||
# using brozzler-worker --proxy, nothing to do but try the
|
# using brozzler-worker --proxy, nothing to do but try the
|
||||||
# same proxy again next time
|
# same proxy again next time
|
||||||
logging.error(
|
logging.error("proxy error (self._proxy=%r)", self._proxy, exc_info=1)
|
||||||
'proxy error (self._proxy=%r)', self._proxy, exc_info=1)
|
|
||||||
except:
|
except:
|
||||||
self.logger.error(
|
self.logger.error(
|
||||||
'unexpected exception site=%r page=%r', site, page,
|
"unexpected exception site=%r page=%r", site, page, exc_info=True
|
||||||
exc_info=True)
|
)
|
||||||
if page:
|
if page:
|
||||||
page.failed_attempts = (page.failed_attempts or 0) + 1
|
page.failed_attempts = (page.failed_attempts or 0) + 1
|
||||||
if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES:
|
if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES:
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
'marking page "completed" after %s unexpected '
|
'marking page "completed" after %s unexpected '
|
||||||
'exceptions attempting to brozzle %s',
|
"exceptions attempting to brozzle %s",
|
||||||
page.failed_attempts, page)
|
page.failed_attempts,
|
||||||
|
page,
|
||||||
|
)
|
||||||
self._frontier.completed_page(site, page)
|
self._frontier.completed_page(site, page)
|
||||||
page = None
|
page = None
|
||||||
finally:
|
finally:
|
||||||
if start:
|
if start:
|
||||||
site.active_brozzling_time = (site.active_brozzling_time or 0) + time.time() - start
|
site.active_brozzling_time = (
|
||||||
|
(site.active_brozzling_time or 0) + time.time() - start
|
||||||
|
)
|
||||||
self._frontier.disclaim_site(site, page)
|
self._frontier.disclaim_site(site, page)
|
||||||
|
|
||||||
def _brozzle_site_thread_target(self, browser, site):
|
def _brozzle_site_thread_target(self, browser, site):
|
||||||
|
@ -462,21 +538,25 @@ class BrozzlerWorker:
|
||||||
"role": "brozzler-worker",
|
"role": "brozzler-worker",
|
||||||
"ttl": self.HEARTBEAT_INTERVAL * 3,
|
"ttl": self.HEARTBEAT_INTERVAL * 3,
|
||||||
}
|
}
|
||||||
status_info["load"] = 1.0 * self._browser_pool.num_in_use() / self._browser_pool.size
|
status_info["load"] = (
|
||||||
|
1.0 * self._browser_pool.num_in_use() / self._browser_pool.size
|
||||||
|
)
|
||||||
status_info["browser_pool_size"] = self._browser_pool.size
|
status_info["browser_pool_size"] = self._browser_pool.size
|
||||||
status_info["browsers_in_use"] = self._browser_pool.num_in_use()
|
status_info["browsers_in_use"] = self._browser_pool.num_in_use()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.status_info = self._service_registry.heartbeat(status_info)
|
self.status_info = self._service_registry.heartbeat(status_info)
|
||||||
self.logger.trace(
|
self.logger.trace("status in service registry: %s", self.status_info)
|
||||||
"status in service registry: %s", self.status_info)
|
|
||||||
except r.ReqlError as e:
|
except r.ReqlError as e:
|
||||||
self.logger.error(
|
self.logger.error(
|
||||||
"failed to send heartbeat and update service registry "
|
"failed to send heartbeat and update service registry "
|
||||||
"with info %s: %s", status_info, e)
|
"with info %s: %s",
|
||||||
|
status_info,
|
||||||
|
e,
|
||||||
|
)
|
||||||
|
|
||||||
def _service_heartbeat_if_due(self):
|
def _service_heartbeat_if_due(self):
|
||||||
'''Sends service registry heartbeat if due'''
|
"""Sends service registry heartbeat if due"""
|
||||||
due = False
|
due = False
|
||||||
if self._service_registry:
|
if self._service_registry:
|
||||||
if not hasattr(self, "status_info"):
|
if not hasattr(self, "status_info"):
|
||||||
|
@ -489,15 +569,16 @@ class BrozzlerWorker:
|
||||||
self._service_heartbeat()
|
self._service_heartbeat()
|
||||||
|
|
||||||
def _start_browsing_some_sites(self):
|
def _start_browsing_some_sites(self):
|
||||||
'''
|
"""
|
||||||
Starts browsing some sites.
|
Starts browsing some sites.
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
NoBrowsersAvailable if none available
|
NoBrowsersAvailable if none available
|
||||||
'''
|
"""
|
||||||
# acquire_multi() raises NoBrowsersAvailable if none available
|
# acquire_multi() raises NoBrowsersAvailable if none available
|
||||||
browsers = self._browser_pool.acquire_multi(
|
browsers = self._browser_pool.acquire_multi(
|
||||||
(self._browser_pool.num_available() + 1) // 2)
|
(self._browser_pool.num_available() + 1) // 2
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
sites = self._frontier.claim_sites(len(browsers))
|
sites = self._frontier.claim_sites(len(browsers))
|
||||||
except:
|
except:
|
||||||
|
@ -510,7 +591,8 @@ class BrozzlerWorker:
|
||||||
target=self._brozzle_site_thread_target,
|
target=self._brozzle_site_thread_target,
|
||||||
args=(browsers[i], sites[i]),
|
args=(browsers[i], sites[i]),
|
||||||
name="BrozzlingThread:%s" % browsers[i].chrome.port,
|
name="BrozzlingThread:%s" % browsers[i].chrome.port,
|
||||||
daemon=True)
|
daemon=True,
|
||||||
|
)
|
||||||
with self._browsing_threads_lock:
|
with self._browsing_threads_lock:
|
||||||
self._browsing_threads.add(th)
|
self._browsing_threads.add(th)
|
||||||
th.start()
|
th.start()
|
||||||
|
@ -519,7 +601,8 @@ class BrozzlerWorker:
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
self.logger.notice(
|
self.logger.notice(
|
||||||
'brozzler %s - brozzler-worker starting', brozzler.__version__)
|
"brozzler %s - brozzler-worker starting", brozzler.__version__
|
||||||
|
)
|
||||||
last_nothing_to_claim = 0
|
last_nothing_to_claim = 0
|
||||||
try:
|
try:
|
||||||
while not self._shutdown.is_set():
|
while not self._shutdown.is_set():
|
||||||
|
@ -528,39 +611,38 @@ class BrozzlerWorker:
|
||||||
try:
|
try:
|
||||||
self._start_browsing_some_sites()
|
self._start_browsing_some_sites()
|
||||||
except brozzler.browser.NoBrowsersAvailable:
|
except brozzler.browser.NoBrowsersAvailable:
|
||||||
logging.trace(
|
logging.trace("all %s browsers are in use", self._max_browsers)
|
||||||
"all %s browsers are in use",
|
|
||||||
self._max_browsers)
|
|
||||||
except brozzler.NothingToClaim:
|
except brozzler.NothingToClaim:
|
||||||
last_nothing_to_claim = time.time()
|
last_nothing_to_claim = time.time()
|
||||||
logging.trace(
|
logging.trace(
|
||||||
"nothing to claim, all available active sites "
|
"nothing to claim, all available active sites "
|
||||||
"are already claimed by a brozzler worker")
|
"are already claimed by a brozzler worker"
|
||||||
|
)
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
|
|
||||||
self.logger.notice("shutdown requested")
|
self.logger.notice("shutdown requested")
|
||||||
except r.ReqlError as e:
|
except r.ReqlError as e:
|
||||||
self.logger.error(
|
self.logger.error(
|
||||||
"caught rethinkdb exception, will try to proceed",
|
"caught rethinkdb exception, will try to proceed", exc_info=True
|
||||||
exc_info=True)
|
)
|
||||||
except brozzler.ShutdownRequested:
|
except brozzler.ShutdownRequested:
|
||||||
self.logger.info("shutdown requested")
|
self.logger.info("shutdown requested")
|
||||||
except:
|
except:
|
||||||
self.logger.critical(
|
self.logger.critical(
|
||||||
"thread exiting due to unexpected exception",
|
"thread exiting due to unexpected exception", exc_info=True
|
||||||
exc_info=True)
|
)
|
||||||
finally:
|
finally:
|
||||||
if self._service_registry and hasattr(self, "status_info"):
|
if self._service_registry and hasattr(self, "status_info"):
|
||||||
try:
|
try:
|
||||||
self._service_registry.unregister(self.status_info["id"])
|
self._service_registry.unregister(self.status_info["id"])
|
||||||
except:
|
except:
|
||||||
self.logger.error(
|
self.logger.error(
|
||||||
"failed to unregister from service registry",
|
"failed to unregister from service registry", exc_info=True
|
||||||
exc_info=True)
|
)
|
||||||
|
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
'shutting down %s brozzling threads',
|
"shutting down %s brozzling threads", len(self._browsing_threads)
|
||||||
len(self._browsing_threads))
|
)
|
||||||
with self._browsing_threads_lock:
|
with self._browsing_threads_lock:
|
||||||
for th in self._browsing_threads:
|
for th in self._browsing_threads:
|
||||||
if th.is_alive():
|
if th.is_alive():
|
||||||
|
@ -575,11 +657,10 @@ class BrozzlerWorker:
|
||||||
with self._start_stop_lock:
|
with self._start_stop_lock:
|
||||||
if self._thread:
|
if self._thread:
|
||||||
self.logger.warning(
|
self.logger.warning(
|
||||||
'ignoring start request because self._thread is '
|
"ignoring start request because self._thread is " "not None"
|
||||||
'not None')
|
)
|
||||||
return
|
return
|
||||||
self._thread = threading.Thread(
|
self._thread = threading.Thread(target=self.run, name="BrozzlerWorker")
|
||||||
target=self.run, name="BrozzlerWorker")
|
|
||||||
self._thread.start()
|
self._thread.start()
|
||||||
|
|
||||||
def shutdown_now(self):
|
def shutdown_now(self):
|
||||||
|
@ -590,4 +671,3 @@ class BrozzlerWorker:
|
||||||
|
|
||||||
def is_alive(self):
|
def is_alive(self):
|
||||||
return self._thread and self._thread.is_alive()
|
return self._thread and self._thread.is_alive()
|
||||||
|
|
||||||
|
|
269 brozzler/ydl.py
@@ -1,4 +1,4 @@
-'''
+"""
 brozzler/ydl.py - youtube-dl / yt-dlp support for brozzler

 Copyright (C) 2023 Internet Archive
@@ -14,7 +14,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-'''
+"""

 import logging
 import yt_dlp
@@ -31,6 +31,7 @@ import threading

 thread_local = threading.local()

+
 class ExtraHeaderAdder(urllib.request.BaseHandler):
     def __init__(self, extra_headers):
         self.extra_headers = extra_headers
@@ -43,6 +44,7 @@ class ExtraHeaderAdder(urllib.request.BaseHandler):
             req.add_header(h, v)
         return req

+
 class YoutubeDLSpy(urllib.request.BaseHandler):
     logger = logging.getLogger(__module__ + "." + __qualname__)

@@ -51,10 +53,10 @@ class YoutubeDLSpy(urllib.request.BaseHandler):

     def _http_response(self, request, response):
         fetch = {
-            'url': request.full_url,
-            'method': request.get_method(),
-            'response_code': response.code,
-            'response_headers': response.headers,
+            "url": request.full_url,
+            "method": request.get_method(),
+            "response_code": response.code,
+            "response_headers": response.headers,
         }
         self.fetches.append(fetch)
         return response
@@ -64,6 +66,7 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
     def reset(self):
         self.fetches = []

+
 def final_bounces(fetches, url):
     """
     Resolves redirect chains in `fetches` and returns a list of fetches
@@ -75,24 +78,26 @@ def final_bounces(fetches, url):
     for fetch in fetches:
         # XXX check http status 301,302,303,307? check for "uri" header
         # as well as "location"? see urllib.request.HTTPRedirectHandler
-        if 'location' in fetch['response_headers']:
-            redirects[fetch['url']] = fetch
+        if "location" in fetch["response_headers"]:
+            redirects[fetch["url"]] = fetch

     final_url = url
     while final_url in redirects:
         fetch = redirects.pop(final_url)
         final_url = urllib.parse.urljoin(
-                fetch['url'], fetch['response_headers']['location'])
+            fetch["url"], fetch["response_headers"]["location"]
+        )

     final_bounces = []
     for fetch in fetches:
-        if fetch['url'] == final_url:
+        if fetch["url"] == final_url:
             final_bounces.append(fetch)

     return final_bounces


 def _build_youtube_dl(worker, destdir, site, page):
-    '''
+    """
     Builds a yt-dlp `yt_dlp.YoutubeDL` for brozzling `site` with `worker`.

     The `YoutubeDL` instance does a few special brozzler-specific things:
@@ -109,7 +114,7 @@ def _build_youtube_dl(worker, destdir, site, page):

     Returns:
         a yt-dlp `yt_dlp.YoutubeDL` instance
-    '''
+    """

     class _YoutubeDL(yt_dlp.YoutubeDL):
         logger = logging.getLogger(__module__ + "." + __qualname__)
@@ -117,31 +122,38 @@ def _build_youtube_dl(worker, destdir, site, page):
         def add_default_extra_info(self, ie_result, ie, url):
             # hook in some logging
             super().add_default_extra_info(ie_result, ie, url)
-            if ie_result.get('_type') == 'playlist':
-                self.logger.info(
-                        'extractor %r found playlist in %s', ie.IE_NAME, url)
-                if ie.IE_NAME in {'youtube:playlist', 'youtube:tab', 'soundcloud:user', 'instagram:user'}:
+            if ie_result.get("_type") == "playlist":
+                self.logger.info("extractor %r found playlist in %s", ie.IE_NAME, url)
+                if ie.IE_NAME in {
+                    "youtube:playlist",
+                    "youtube:tab",
+                    "soundcloud:user",
+                    "instagram:user",
+                }:
                     # At this point ie_result['entries'] is an iterator that
                     # will fetch more metadata from youtube to list all the
                     # videos. We unroll that iterator here partly because
                     # otherwise `process_ie_result()` will clobber it, and we
                     # use it later to extract the watch pages as outlinks.
                     try:
-                        ie_result['entries_no_dl'] = list(ie_result['entries'])
+                        ie_result["entries_no_dl"] = list(ie_result["entries"])
                     except Exception as e:
                         self.logger.warning(
                             "failed to unroll ie_result['entries']? for %s, %s; exception %s",
-                            ie.IE_NAME, url, e)
-                        ie_result['entries_no_dl'] =[]
-                        ie_result['entries'] = []
+                            ie.IE_NAME,
+                            url,
+                            e,
+                        )
+                        ie_result["entries_no_dl"] = []
+                        ie_result["entries"] = []
                     self.logger.info(
-                        'not downloading %s media files from this '
-                        'playlist because we expect to capture them from '
-                        'individual watch/track/detail pages',
-                        len(ie_result['entries_no_dl']))
+                        "not downloading %s media files from this "
+                        "playlist because we expect to capture them from "
+                        "individual watch/track/detail pages",
+                        len(ie_result["entries_no_dl"]),
+                    )
             else:
-                self.logger.info(
-                        'extractor %r found a download in %s', ie.IE_NAME, url)
+                self.logger.info("extractor %r found a download in %s", ie.IE_NAME, url)

         def _push_video_to_warcprox(self, site, info_dict, postprocessor):
             # 220211 update: does yt-dlp supply content-type? no, not as such
@@ -150,73 +162,96 @@ def _build_youtube_dl(worker, destdir, site, page):
             # youtube-dl produces a stitched-up video that /usr/bin/file fails
             # to identify (says "application/octet-stream"). `ffprobe` doesn't
             # give us a mimetype.
-            if info_dict.get('ext') == 'mp4':
-                mimetype = 'video/mp4'
+            if info_dict.get("ext") == "mp4":
+                mimetype = "video/mp4"
             else:
                 try:
                     import magic
-                    mimetype = magic.from_file(info_dict['filepath'], mime=True)
+
+                    mimetype = magic.from_file(info_dict["filepath"], mime=True)
                 except ImportError as e:
-                    mimetype = 'video/%s' % info_dict['ext']
-                    self.logger.warning(
-                        'guessing mimetype %s because %r', mimetype, e)
+                    mimetype = "video/%s" % info_dict["ext"]
+                    self.logger.warning("guessing mimetype %s because %r", mimetype, e)

             # youtube watch page postprocessor is MoveFiles

-            if postprocessor == 'FixupM3u8' or postprocessor == 'Merger':
-                url = 'youtube-dl:%05d:%s' % (
-                    info_dict.get('playlist_index') or 1,
-                    info_dict['webpage_url'])
+            if postprocessor == "FixupM3u8" or postprocessor == "Merger":
+                url = "youtube-dl:%05d:%s" % (
+                    info_dict.get("playlist_index") or 1,
+                    info_dict["webpage_url"],
+                )
             else:
-                url = info_dict.get('url', '')
+                url = info_dict.get("url", "")

             # skip urls ending .m3u8, to avoid duplicates handled by FixupM3u8
-            if url.endswith('.m3u8') or url == '':
+            if url.endswith(".m3u8") or url == "":
                 return

-            size = os.path.getsize(info_dict['filepath'])
+            size = os.path.getsize(info_dict["filepath"])
             self.logger.info(
-                'pushing %r video as %s (%s bytes) to '
-                'warcprox at %s with url %s', info_dict['format'],
-                mimetype, size, worker._proxy_for(site), url)
-            with open(info_dict['filepath'], 'rb') as f:
+                "pushing %r video as %s (%s bytes) to " "warcprox at %s with url %s",
+                info_dict["format"],
+                mimetype,
+                size,
+                worker._proxy_for(site),
+                url,
+            )
+            with open(info_dict["filepath"], "rb") as f:
                 # include content-length header to avoid chunked
                 # transfer, which warcprox currently rejects
                 extra_headers = dict(site.extra_headers())
-                extra_headers['content-length'] = size
+                extra_headers["content-length"] = size
                 request, response = worker._warcprox_write_record(
-                    warcprox_address=worker._proxy_for(site), url=url,
-                    warc_type='resource', content_type=mimetype, payload=f,
-                    extra_headers=extra_headers)
+                    warcprox_address=worker._proxy_for(site),
+                    url=url,
+                    warc_type="resource",
+                    content_type=mimetype,
+                    payload=f,
+                    extra_headers=extra_headers,
+                )
             # consulted by _remember_videos()
-            ydl.pushed_videos.append({
-                'url': url,
-                'response_code': response.code,
-                'content-type': mimetype,
-                'content-length': size,
-            })
+            ydl.pushed_videos.append(
+                {
+                    "url": url,
+                    "response_code": response.code,
+                    "content-type": mimetype,
+                    "content-length": size,
+                }
+            )

     def maybe_heartbeat_site_last_claimed(*args, **kwargs):
         # in case yt-dlp takes a long time, heartbeat site.last_claimed
         # to prevent another brozzler-worker from claiming the site
         try:
-            if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=worker.SITE_SESSION_MINUTES):
+            if (
+                site.rr
+                and doublethink.utcnow() - site.last_claimed
+                > datetime.timedelta(minutes=worker.SITE_SESSION_MINUTES)
+            ):
                 worker.logger.debug(
-                    'heartbeating site.last_claimed to prevent another '
-                    'brozzler-worker claiming this site id=%r', site.id)
+                    "heartbeating site.last_claimed to prevent another "
+                    "brozzler-worker claiming this site id=%r",
+                    site.id,
+                )
                 site.last_claimed = doublethink.utcnow()
                 site.save()
         except:
             worker.logger.debug(
-                'problem heartbeating site.last_claimed site id=%r',
-                site.id, exc_info=True)
+                "problem heartbeating site.last_claimed site id=%r",
+                site.id,
+                exc_info=True,
+            )

     def ydl_postprocess_hook(d):
-        if d['status'] == 'finished':
-            worker.logger.info('[ydl_postprocess_hook] Finished postprocessing')
-            worker.logger.info('[ydl_postprocess_hook] postprocessor: {}'.format(d['postprocessor']))
+        if d["status"] == "finished":
+            worker.logger.info("[ydl_postprocess_hook] Finished postprocessing")
+            worker.logger.info(
+                "[ydl_postprocess_hook] postprocessor: {}".format(d["postprocessor"])
+            )
             if worker._using_warcprox(site):
-                _YoutubeDL._push_video_to_warcprox(_YoutubeDL, site, d['info_dict'], d['postprocessor'])
+                _YoutubeDL._push_video_to_warcprox(
+                    _YoutubeDL, site, d["info_dict"], d["postprocessor"]
+                )

     # default socket_timeout is 20 -- we hit it often when cluster is busy
     ydl_opts = {
@@ -230,7 +265,6 @@ def _build_youtube_dl(worker, destdir, site, page):
         "socket_timeout": 40,
         "progress_hooks": [maybe_heartbeat_site_last_claimed],
         "postprocessor_hooks": [ydl_postprocess_hook],

         # https://github.com/yt-dlp/yt-dlp#format-selection
         # "By default, yt-dlp tries to download the best available quality..."
         # pre-v.2023.07.06: "format_sort": ["ext"],
@@ -241,13 +275,10 @@ def _build_youtube_dl(worker, destdir, site, page):
         "format_sort": ["res:720", "vcodec:h264", "acodec:aac"],
         # skip live streams
         "match_filter": match_filter_func("!is_live"),
-        "extractor_args": {'youtube': {'skip': ['dash', 'hls']}},
+        "extractor_args": {"youtube": {"skip": ["dash", "hls"]}},

         # --cache-dir local or..
         # this looked like a problem with nsf-mounted homedir, shouldn't be a problem for brozzler on focal?
         "cache_dir": "/home/archiveit",

         "logger": logging.getLogger("yt_dlp"),
         "verbose": False,
         "quiet": False,
@@ -265,49 +296,53 @@ def _build_youtube_dl(worker, destdir, site, page):
     ydl._opener.add_handler(ydl.fetch_spy)
     return ydl


 def _remember_videos(page, fetches, pushed_videos=None):
-    '''
+    """
     Saves info about videos captured by yt-dlp in `page.videos`.
-    '''
-    if not 'videos' in page:
+    """
+    if not "videos" in page:
         page.videos = []
     for fetch in fetches or []:
-        content_type = fetch['response_headers'].get_content_type()
-        if (content_type.startswith('video/')
+        content_type = fetch["response_headers"].get_content_type()
+        if (
+            content_type.startswith("video/")
             # skip manifests of DASH segmented video -
             # see https://github.com/internetarchive/brozzler/pull/70
-                and content_type != 'video/vnd.mpeg.dash.mpd'
-                and fetch['method'] == 'GET'
-                and fetch['response_code'] in (200, 206)):
+            and content_type != "video/vnd.mpeg.dash.mpd"
+            and fetch["method"] == "GET"
+            and fetch["response_code"] in (200, 206)
+        ):
             video = {
-                'blame': 'youtube-dl',
-                'url': fetch['url'],
-                'response_code': fetch['response_code'],
-                'content-type': content_type,
+                "blame": "youtube-dl",
+                "url": fetch["url"],
+                "response_code": fetch["response_code"],
+                "content-type": content_type,
             }
-            if 'content-length' in fetch['response_headers']:
-                video['content-length'] = int(
-                    fetch['response_headers']['content-length'])
-            if 'content-range' in fetch['response_headers']:
+            if "content-length" in fetch["response_headers"]:
+                video["content-length"] = int(
+                    fetch["response_headers"]["content-length"]
+                )
+            if "content-range" in fetch["response_headers"]:
                 # skip chunked youtube video
-                if 'googlevideo.com/videoplayback' in fetch['url']:
+                if "googlevideo.com/videoplayback" in fetch["url"]:
                     continue
-                video['content-range'] = fetch[
-                    'response_headers']['content-range']
-            logging.debug('embedded video %s', video)
+                video["content-range"] = fetch["response_headers"]["content-range"]
+            logging.debug("embedded video %s", video)
             page.videos.append(video)
     for pushed_video in pushed_videos or []:
-        if pushed_video['content-type'].startswith('video/'):
+        if pushed_video["content-type"].startswith("video/"):
             video = {
-                'blame': 'youtube-dl',
-                'url': pushed_video['url'],
-                'response_code': pushed_video['response_code'],
-                'content-type': pushed_video['content-type'],
-                'content-length': pushed_video['content-length'],
+                "blame": "youtube-dl",
+                "url": pushed_video["url"],
+                "response_code": pushed_video["response_code"],
+                "content-type": pushed_video["content-type"],
+                "content-length": pushed_video["content-length"],
             }
-            logging.debug('embedded video %s', video)
+            logging.debug("embedded video %s", video)
             page.videos.append(video)


 def _try_youtube_dl(worker, ydl, site, page):
     try:
         logging.info("trying yt-dlp on %s", page)
@@ -317,43 +352,53 @@ def _try_youtube_dl(worker, ydl, site, page):
         # no host given>" resulting in ProxyError
         # needs automated test
         # and yt-dlp needs sanitize_info for extract_info
-        ie_result = ydl.sanitize_info(ydl.extract_info(str(urlcanon.whatwg(page.url))))
+        ie_result = ydl.sanitize_info(
+            ydl.extract_info(str(urlcanon.whatwg(page.url)))
+        )
         _remember_videos(page, ydl.fetch_spy.fetches, ydl.pushed_videos)
         if worker._using_warcprox(site):
             info_json = json.dumps(ie_result, sort_keys=True, indent=4)
             logging.info(
                 "sending WARCPROX_WRITE_RECORD request to warcprox "
-                "with yt-dlp json for %s", page)
+                "with yt-dlp json for %s",
+                page,
+            )
             worker._warcprox_write_record(
                 warcprox_address=worker._proxy_for(site),
                 url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                 warc_type="metadata",
                 content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                 payload=info_json.encode("utf-8"),
-                extra_headers=site.extra_headers(page))
+                extra_headers=site.extra_headers(page),
+            )
         return ie_result
     except brozzler.ShutdownRequested as e:
         raise
     except Exception as e:
         if hasattr(e, "exc_info") and e.exc_info[0] == yt_dlp.utils.UnsupportedError:
             return None
-        elif (hasattr(e, "exc_info")
+        elif (
+            hasattr(e, "exc_info")
             and e.exc_info[0] == urllib.error.HTTPError
             and hasattr(e.exc_info[1], "code")
-            and e.exc_info[1].code == 420):
+            and e.exc_info[1].code == 420
+        ):
             raise brozzler.ReachedLimit(e.exc_info[1])
-        elif (hasattr(e, 'exc_info')
+        elif (
+            hasattr(e, "exc_info")
             and e.exc_info[0] == urllib.error.URLError
-            and worker._proxy_for(site)):
+            and worker._proxy_for(site)
+        ):
             # connection problem when using a proxy == proxy error (XXX?)
             raise brozzler.ProxyError(
-                'yt-dlp hit apparent proxy error from '
-                '%s' % page.url) from e
+                "yt-dlp hit apparent proxy error from " "%s" % page.url
+            ) from e
         else:
             raise


 def do_youtube_dl(worker, site, page):
-    '''
+    """
     Runs yt-dlp configured for `worker` and `site` to download videos from
     `page`.

@@ -372,15 +417,19 @@ def do_youtube_dl(worker, site, page):
             'response_headers': ...,
         }, ...]
         `list` of `str`: outlink urls
-    '''
-    with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
+    """
+    with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
         ydl = _build_youtube_dl(worker, tempdir, site, page)
         ie_result = _try_youtube_dl(worker, ydl, site, page)
         outlinks = set()
-        if ie_result and (ie_result.get('extractor') == 'youtube:playlist' or
-                ie_result.get('extractor') == 'youtube:tab'):
+        if ie_result and (
+            ie_result.get("extractor") == "youtube:playlist"
+            or ie_result.get("extractor") == "youtube:tab"
+        ):
             # youtube watch pages as outlinks
-            outlinks = {'https://www.youtube.com/watch?v=%s' % e['id']
-                for e in ie_result.get('entries_no_dl', [])}
+            outlinks = {
+                "https://www.youtube.com/watch?v=%s" % e["id"]
+                for e in ie_result.get("entries_no_dl", [])
+            }
         # any outlinks for other cases?
         return ydl.fetch_spy.fetches, outlinks
125 setup.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-'''
+"""
 setup.py - brozzler setup script

 Copyright (C) 2014-2024 Internet Archive
@@ -15,89 +15,88 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-'''
+"""

 import setuptools
 import os


 def find_package_data(package):
     pkg_data = []
-    depth = len(package.split('.'))
-    path = os.path.join(*package.split('.'))
+    depth = len(package.split("."))
+    path = os.path.join(*package.split("."))
     for dirpath, dirnames, filenames in os.walk(path):
-        if not os.path.exists(os.path.join(dirpath, '__init__.py')):
+        if not os.path.exists(os.path.join(dirpath, "__init__.py")):
             relpath = os.path.join(*dirpath.split(os.sep)[depth:])
             pkg_data.extend(os.path.join(relpath, f) for f in filenames)
     return pkg_data


 setuptools.setup(
-    name='brozzler',
-    version='1.5.44',
-    description='Distributed web crawling with browsers',
-    url='https://github.com/internetarchive/brozzler',
-    author='Noah Levitt',
-    author_email='nlevitt@archive.org',
-    long_description=open('README.rst', mode='rb').read().decode('UTF-8'),
-    license='Apache License 2.0',
-    packages=['brozzler', 'brozzler.dashboard'],
+    name="brozzler",
+    version="1.5.44",
+    description="Distributed web crawling with browsers",
+    url="https://github.com/internetarchive/brozzler",
+    author="Noah Levitt",
+    author_email="nlevitt@archive.org",
+    long_description=open("README.rst", mode="rb").read().decode("UTF-8"),
+    license="Apache License 2.0",
+    packages=["brozzler", "brozzler.dashboard"],
     package_data={
-        'brozzler': [
-            'js-templates/*.js*', 'behaviors.yaml', 'job_schema.yaml'],
-        'brozzler.dashboard': find_package_data('brozzler.dashboard'),
+        "brozzler": ["js-templates/*.js*", "behaviors.yaml", "job_schema.yaml"],
+        "brozzler.dashboard": find_package_data("brozzler.dashboard"),
     },
     entry_points={
-        'console_scripts': [
-            'brozzle-page=brozzler.cli:brozzle_page',
-            'brozzler-new-job=brozzler.cli:brozzler_new_job',
-            'brozzler-new-site=brozzler.cli:brozzler_new_site',
-            'brozzler-worker=brozzler.cli:brozzler_worker',
-            'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables',
-            'brozzler-list-captures=brozzler.cli:brozzler_list_captures',
-            'brozzler-list-jobs=brozzler.cli:brozzler_list_jobs',
-            'brozzler-list-sites=brozzler.cli:brozzler_list_sites',
-            'brozzler-list-pages=brozzler.cli:brozzler_list_pages',
-            'brozzler-stop-crawl=brozzler.cli:brozzler_stop_crawl',
-            'brozzler-purge=brozzler.cli:brozzler_purge',
-            'brozzler-dashboard=brozzler.dashboard:main',
-            'brozzler-easy=brozzler.easy:main',
-            'brozzler-wayback=brozzler.pywb:main',
+        "console_scripts": [
+            "brozzle-page=brozzler.cli:brozzle_page",
+            "brozzler-new-job=brozzler.cli:brozzler_new_job",
+            "brozzler-new-site=brozzler.cli:brozzler_new_site",
+            "brozzler-worker=brozzler.cli:brozzler_worker",
+            "brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables",
+            "brozzler-list-captures=brozzler.cli:brozzler_list_captures",
+            "brozzler-list-jobs=brozzler.cli:brozzler_list_jobs",
+            "brozzler-list-sites=brozzler.cli:brozzler_list_sites",
+            "brozzler-list-pages=brozzler.cli:brozzler_list_pages",
+            "brozzler-stop-crawl=brozzler.cli:brozzler_stop_crawl",
+            "brozzler-purge=brozzler.cli:brozzler_purge",
+            "brozzler-dashboard=brozzler.dashboard:main",
+            "brozzler-easy=brozzler.easy:main",
+            "brozzler-wayback=brozzler.pywb:main",
         ],
     },
     install_requires=[
-        'PyYAML>=5.1',
-        'yt_dlp<2023.11.16',
-        'reppy==0.3.4',
-        'requests>=2.21',
-        'websocket-client>=0.39.0,<=0.48.0',
-        'pillow>=5.2.0',
-        'urlcanon>=0.1.dev23',
-        'doublethink @ git+https://github.com/internetarchive/doublethink.git@Py311',
-        'rethinkdb<2.4.10',
-        'cerberus>=1.0.1',
-        'jinja2>=2.10',
-        'cryptography>=2.3',
-        'python-magic>=0.4.15',
+        "PyYAML>=5.1",
+        "yt_dlp<2023.11.16",
+        "reppy==0.3.4",
+        "requests>=2.21",
+        "websocket-client>=0.39.0,<=0.48.0",
+        "pillow>=5.2.0",
+        "urlcanon>=0.1.dev23",
+        "doublethink @ git+https://github.com/internetarchive/doublethink.git@Py311",
+        "rethinkdb<2.4.10",
+        "cerberus>=1.0.1",
+        "jinja2>=2.10",
+        "cryptography>=2.3",
+        "python-magic>=0.4.15",
     ],
     extras_require={
-        'dashboard': [
-            'flask>=1.0',
-            'gunicorn>=19.8.1'
-        ],
-        'easy': [
-            'warcprox>=2.4.31',
-            'pywb>=0.33.2,<2',
-            'flask>=1.0',
-            'gunicorn>=19.8.1'
-        ],
+        "dashboard": ["flask>=1.0", "gunicorn>=19.8.1"],
+        "easy": [
+            "warcprox>=2.4.31",
+            "pywb>=0.33.2,<2",
+            "flask>=1.0",
+            "gunicorn>=19.8.1",
+        ],
     },
     zip_safe=False,
     classifiers=[
-        'Development Status :: 5 - Production/Stable',
-        'Environment :: Console',
-        'License :: OSI Approved :: Apache Software License',
-        'Programming Language :: Python :: 3.5',
-        'Programming Language :: Python :: 3.6',
-        'Programming Language :: Python :: 3.7',
-        'Topic :: Internet :: WWW/HTTP',
-        'Topic :: System :: Archiving',
-    ])
+        "Development Status :: 5 - Production/Stable",
+        "Environment :: Console",
+        "License :: OSI Approved :: Apache Software License",
+        "Programming Language :: Python :: 3.5",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Topic :: Internet :: WWW/HTTP",
+        "Topic :: System :: Archiving",
+    ],
+)
@ -1,5 +1,5 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
'''
|
"""
|
||||||
test_brozzling.py - XXX explain
|
test_brozzling.py - XXX explain
|
||||||
|
|
||||||
Copyright (C) 2016-2018 Internet Archive
|
Copyright (C) 2016-2018 Internet Archive
|
||||||
|
@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
See the License for the specific language governing permissions and
|
See the License for the specific language governing permissions and
|
||||||
limitations under the License.
|
limitations under the License.
|
||||||
'''
|
"""
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import brozzler
|
import brozzler
|
||||||
|
@ -34,79 +34,81 @@ args.log_level = logging.INFO
|
||||||
brozzler.cli.configure_logging(args)
|
brozzler.cli.configure_logging(args)
|
||||||
|
|
||||||
WARCPROX_META_420 = {
|
WARCPROX_META_420 = {
|
||||||
'stats': {
|
"stats": {
|
||||||
'test_limits_bucket': {
|
"test_limits_bucket": {
|
||||||
'total': {'urls': 0, 'wire_bytes': 0},
|
"total": {"urls": 0, "wire_bytes": 0},
|
||||||
'new': {'urls': 0, 'wire_bytes': 0},
|
"new": {"urls": 0, "wire_bytes": 0},
|
||||||
'revisit': {'urls': 0, 'wire_bytes': 0},
|
"revisit": {"urls": 0, "wire_bytes": 0},
|
||||||
'bucket': 'test_limits_bucket'
|
"bucket": "test_limits_bucket",
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
'reached-limit': {'test_limits_bucket/total/urls': 0}
|
"reached-limit": {"test_limits_bucket/total/urls": 0},
|
||||||
}
|
}
|
||||||
|
|
||||||
@pytest.fixture(scope='module')
|
|
||||||
|
@pytest.fixture(scope="module")
|
||||||
def httpd(request):
|
def httpd(request):
|
||||||
class RequestHandler(http.server.SimpleHTTPRequestHandler):
|
class RequestHandler(http.server.SimpleHTTPRequestHandler):
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
self.extensions_map['.mpd'] = 'video/vnd.mpeg.dash.mpd'
|
self.extensions_map[".mpd"] = "video/vnd.mpeg.dash.mpd"
|
||||||
http.server.SimpleHTTPRequestHandler.__init__(self, *args, **kwargs)
|
http.server.SimpleHTTPRequestHandler.__init__(self, *args, **kwargs)
|
||||||
|
|
||||||
def do_GET(self):
|
def do_GET(self):
|
||||||
if self.path == '/420':
|
if self.path == "/420":
|
||||||
self.send_response(420, 'Reached limit')
|
self.send_response(420, "Reached limit")
|
||||||
self.send_header('Connection', 'close')
|
self.send_header("Connection", "close")
|
||||||
self.send_header('Warcprox-Meta', json.dumps(WARCPROX_META_420))
|
self.send_header("Warcprox-Meta", json.dumps(WARCPROX_META_420))
|
||||||
payload = b'request rejected by warcprox: reached limit test_limits_bucket/total/urls=0\n'
|
payload = b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=0\n"
|
||||||
self.send_header('Content-Type', 'text/plain;charset=utf-8')
|
self.send_header("Content-Type", "text/plain;charset=utf-8")
|
||||||
self.send_header('Content-Length', len(payload))
|
self.send_header("Content-Length", len(payload))
|
||||||
self.end_headers()
|
self.end_headers()
|
||||||
self.wfile.write(payload)
|
self.wfile.write(payload)
|
||||||
elif self.path == '/401':
|
elif self.path == "/401":
|
||||||
self.send_response(401)
|
self.send_response(401)
|
||||||
self.send_header('WWW-Authenticate', 'Basic realm=\"Test\"')
|
self.send_header("WWW-Authenticate", 'Basic realm="Test"')
|
||||||
self.send_header('Content-type', 'text/html')
|
self.send_header("Content-type", "text/html")
|
||||||
self.end_headers()
|
self.end_headers()
|
||||||
self.wfile.write(self.headers.get('Authorization', b''))
|
self.wfile.write(self.headers.get("Authorization", b""))
|
||||||
self.wfile.write(b'not authenticated')
|
self.wfile.write(b"not authenticated")
|
||||||
else:
|
else:
|
||||||
super().do_GET()
|
super().do_GET()
|
||||||
|
|
||||||
def do_POST(self):
|
def do_POST(self):
|
||||||
if self.path == '/login-action':
|
if self.path == "/login-action":
|
||||||
self.send_response(200)
|
self.send_response(200)
|
||||||
payload = b'login successful\n'
|
payload = b"login successful\n"
|
||||||
self.send_header('Content-Type', 'text/plain;charset=utf-8')
|
self.send_header("Content-Type", "text/plain;charset=utf-8")
|
||||||
self.send_header('Content-Length', len(payload))
|
self.send_header("Content-Length", len(payload))
|
||||||
self.end_headers()
|
self.end_headers()
|
||||||
self.wfile.write(payload)
|
self.wfile.write(payload)
|
||||||
else:
|
else:
|
||||||
super().do_POST()
|
super().do_POST()
|
||||||
|
|
||||||
|
|
||||||
# SimpleHTTPRequestHandler always uses CWD so we have to chdir
|
# SimpleHTTPRequestHandler always uses CWD so we have to chdir
|
||||||
os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
|
os.chdir(os.path.join(os.path.dirname(__file__), "htdocs"))
|
||||||
|
|
||||||
httpd = http.server.HTTPServer(('localhost', 0), RequestHandler)
|
httpd = http.server.HTTPServer(("localhost", 0), RequestHandler)
|
||||||
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
|
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
|
||||||
httpd_thread.start()
|
httpd_thread.start()
|
||||||
|
|
||||||
def fin():
|
def fin():
|
||||||
httpd.shutdown()
|
httpd.shutdown()
|
||||||
httpd.server_close()
|
httpd.server_close()
|
||||||
httpd_thread.join()
|
httpd_thread.join()
|
||||||
|
|
||||||
request.addfinalizer(fin)
|
request.addfinalizer(fin)
|
||||||
|
|
||||||
return httpd
|
return httpd
|
||||||
|
|
||||||
|
|
||||||
def test_httpd(httpd):
|
def test_httpd(httpd):
|
||||||
'''
|
"""
|
||||||
Tests that our http server is working as expected, and that two fetches
|
Tests that our http server is working as expected, and that two fetches
|
||||||
of the same url return the same payload, proving it can be used to test
|
of the same url return the same payload, proving it can be used to test
|
||||||
deduplication.
|
deduplication.
|
||||||
'''
|
"""
|
||||||
payload1 = content2 = None
|
payload1 = content2 = None
|
||||||
url = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
|
url = "http://localhost:%s/site1/file1.txt" % httpd.server_port
|
||||||
with urllib.request.urlopen(url) as response:
|
with urllib.request.urlopen(url) as response:
|
||||||
assert response.status == 200
|
assert response.status == 200
|
||||||
payload1 = response.read()
|
payload1 = response.read()
|
||||||
|
@ -119,123 +121,136 @@ def test_httpd(httpd):
|
||||||
|
|
||||||
assert payload1 == payload2
|
assert payload1 == payload2
|
||||||
|
|
||||||
url = 'http://localhost:%s/420' % httpd.server_port
|
url = "http://localhost:%s/420" % httpd.server_port
|
||||||
with pytest.raises(urllib.error.HTTPError) as excinfo:
|
with pytest.raises(urllib.error.HTTPError) as excinfo:
|
||||||
urllib.request.urlopen(url)
|
urllib.request.urlopen(url)
|
||||||
assert excinfo.value.getcode() == 420
|
assert excinfo.value.getcode() == 420
|
||||||
|
|
||||||
|
|
||||||
def test_aw_snap_hes_dead_jim():
|
def test_aw_snap_hes_dead_jim():
|
||||||
chrome_exe = brozzler.suggest_default_chrome_exe()
|
chrome_exe = brozzler.suggest_default_chrome_exe()
|
||||||
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
||||||
with pytest.raises(brozzler.BrowsingException):
|
with pytest.raises(brozzler.BrowsingException):
|
||||||
browser.browse_page('chrome://crash')
|
browser.browse_page("chrome://crash")
|
||||||
|
|
||||||
|
|
||||||
# chromium's 401 handling changed???
|
# chromium's 401 handling changed???
|
||||||
@pytest.mark.xfail
|
@pytest.mark.xfail
|
||||||
def test_page_interstitial_exception(httpd):
|
def test_page_interstitial_exception(httpd):
|
||||||
chrome_exe = brozzler.suggest_default_chrome_exe()
|
chrome_exe = brozzler.suggest_default_chrome_exe()
|
||||||
url = 'http://localhost:%s/401' % httpd.server_port
|
url = "http://localhost:%s/401" % httpd.server_port
|
||||||
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
||||||
with pytest.raises(brozzler.PageInterstitialShown):
|
with pytest.raises(brozzler.PageInterstitialShown):
|
||||||
browser.browse_page(url)
|
browser.browse_page(url)
|
||||||
|
|
||||||
|
|
||||||
def test_on_response(httpd):
|
def test_on_response(httpd):
|
||||||
response_urls = []
|
response_urls = []
|
||||||
|
|
||||||
def on_response(msg):
|
def on_response(msg):
|
||||||
response_urls.append(msg['params']['response']['url'])
|
response_urls.append(msg["params"]["response"]["url"])
|
||||||
|
|
||||||
chrome_exe = brozzler.suggest_default_chrome_exe()
|
chrome_exe = brozzler.suggest_default_chrome_exe()
|
||||||
url = 'http://localhost:%s/site3/page.html' % httpd.server_port
|
url = "http://localhost:%s/site3/page.html" % httpd.server_port
|
||||||
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
||||||
browser.browse_page(url, on_response=on_response)
|
browser.browse_page(url, on_response=on_response)
|
||||||
assert response_urls[0] == 'http://localhost:%s/site3/page.html' % httpd.server_port
|
assert response_urls[0] == "http://localhost:%s/site3/page.html" % httpd.server_port
|
||||||
assert response_urls[1] == 'http://localhost:%s/site3/brozzler.svg' % httpd.server_port
|
assert (
|
||||||
assert response_urls[2] == 'http://localhost:%s/favicon.ico' % httpd.server_port
|
response_urls[1] == "http://localhost:%s/site3/brozzler.svg" % httpd.server_port
|
||||||
|
)
|
||||||
|
assert response_urls[2] == "http://localhost:%s/favicon.ico" % httpd.server_port
|
||||||
|
|
||||||
|
|
||||||
def test_420(httpd):
    chrome_exe = brozzler.suggest_default_chrome_exe()
    url = "http://localhost:%s/420" % httpd.server_port
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        with pytest.raises(brozzler.ReachedLimit) as excinfo:
            browser.browse_page(url)
    assert excinfo.value.warcprox_meta == WARCPROX_META_420

def test_js_dialogs(httpd):
    chrome_exe = brozzler.suggest_default_chrome_exe()
    url = "http://localhost:%s/site4/alert.html" % httpd.server_port
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        # before commit d2ed6b97a24 these would hang and eventually raise
        # brozzler.browser.BrowsingTimeout, which would cause this test to fail
        browser.browse_page("http://localhost:%s/site4/alert.html" % httpd.server_port)
        browser.browse_page(
            "http://localhost:%s/site4/confirm.html" % httpd.server_port
        )
        browser.browse_page("http://localhost:%s/site4/prompt.html" % httpd.server_port)
        # XXX print dialog unresolved
        # browser.browse_page(
        #     'http://localhost:%s/site4/print.html' % httpd.server_port)

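For background on why these pages used to hang: an `alert()` blocks the page until the dialog is answered. Over the DevTools protocol, the fix reduces to responding to the `Page.javascriptDialogOpening` event with a `Page.handleJavaScriptDialog` command; a sketch of that exchange as raw protocol messages (illustrative; brozzler's own browser code is not shown in this diff):

# event pushed by the browser when alert()/confirm()/prompt() fires
event = {
    "method": "Page.javascriptDialogOpening",
    "params": {"message": "hello", "type": "alert"},
}
# command a client sends back to dismiss the dialog and unblock the page
command = {
    "id": 1,
    "method": "Page.handleJavaScriptDialog",
    "params": {"accept": True},
}
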
def test_page_videos(httpd):
    # test depends on behavior of youtube-dl and chromium, could fail and need
    # to be adjusted on youtube-dl or chromium updates
    chrome_exe = brozzler.suggest_default_chrome_exe()
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    page = brozzler.Page(
        None, {"url": "http://localhost:%s/site6/" % httpd.server_port}
    )
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        worker.brozzle_page(browser, site, page)
    assert page.videos
    assert len(page.videos) == 4
    assert page.videos[0] == {
        "blame": "youtube-dl",
        "response_code": 200,
        "content-length": 383631,
        "content-type": "video/mp4",
        "url": "http://localhost:%s/site6/small.mp4" % httpd.server_port,
    }
    assert page.videos[1] == {
        "blame": "youtube-dl",
        "content-length": 92728,
        "content-type": "video/webm",
        "response_code": 200,
        "url": "http://localhost:%s/site6/small-video_280x160_100k.webm"
        % httpd.server_port,
    }
    assert page.videos[2] == {
        "blame": "youtube-dl",
        "content-length": 101114,
        "content-type": "video/webm",
        "response_code": 200,
        "url": "http://localhost:%s/site6/small-audio.webm" % httpd.server_port,
    }
    assert page.videos[3] == {
        "blame": "browser",
        # 'response_code': 206,
        # 'content-range': 'bytes 0-229454/229455',
        "response_code": 200,
        "content-length": 229455,
        "content-type": "video/webm",
        "url": "http://localhost:%s/site6/small.webm" % httpd.server_port,
    }

def test_extract_outlinks(httpd):
    chrome_exe = brozzler.suggest_default_chrome_exe()
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    page = brozzler.Page(
        None, {"url": "http://localhost:%s/site8/" % httpd.server_port}
    )
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        outlinks = worker.brozzle_page(browser, site, page)
    assert outlinks == {
        "http://example.com/offsite",
        "http://localhost:%s/site8/baz/zuh" % httpd.server_port,
        "http://localhost:%s/site8/fdjisapofdjisap#1" % httpd.server_port,
        "http://localhost:%s/site8/fdjisapofdjisap#2" % httpd.server_port,
    }

def test_proxy_down():
    """
    Test that browsing raises `brozzler.ProxyError` when proxy is down.

    See also `test_proxy_down` in test_units.py.

@@ -243,40 +258,41 @@ def test_proxy_down():

    Tests two different kinds of connection error:
    - nothing listening on the port (nobody listens on port 4 :))
    - port bound but not accepting connections
    """
    sock = socket.socket()
    sock.bind(("127.0.0.1", 0))
    for not_listening_proxy in ("127.0.0.1:4", "127.0.0.1:%s" % sock.getsockname()[1]):
        site = brozzler.Site(None, {"seed": "http://example.com/"})
        page = brozzler.Page(None, {"url": "http://example.com/"})

        worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
        chrome_exe = brozzler.suggest_default_chrome_exe()

        with brozzler.Browser(chrome_exe=chrome_exe) as browser:
            with pytest.raises(brozzler.ProxyError):
                worker.brozzle_page(browser, site, page)

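The second proxy address above exploits a TCP detail worth spelling out: a socket that is bound but never calls `listen()` refuses connections just like a free port does, so it simulates a proxy process that has grabbed its port but is not serving. A standalone sketch:

import socket

sock = socket.socket()
sock.bind(("127.0.0.1", 0))  # bound, but listen() is never called
port = sock.getsockname()[1]

client = socket.socket()
try:
    client.connect(("127.0.0.1", port))
except ConnectionRefusedError:
    print("refused, as the test expects")
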
def test_try_login(httpd):
    """Test try_login behavior."""
    response_urls = []

    def on_response(msg):
        response_urls.append(msg["params"]["response"]["url"])

    chrome_exe = brozzler.suggest_default_chrome_exe()
    form_url = "http://localhost:%s/site11/form1.html" % httpd.server_port
    form_url_other = "http://localhost:%s/site11/form2.html" % httpd.server_port
    favicon_url = "http://localhost:%s/favicon.ico" % httpd.server_port
    login_url = "http://localhost:%s/login-action" % httpd.server_port
    # When username and password are defined and initial page has login form,
    # detect login form, submit login, and then return to the initial page.
    username = "user1"
    password = "pass1"
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        browser.browse_page(
            form_url, username=username, password=password, on_response=on_response
        )
    assert len(response_urls) == 4
    assert response_urls[0] == form_url
    assert response_urls[1] == favicon_url

@@ -285,11 +301,15 @@ def test_try_login(httpd):

    # We now support a different type of form; we'll test that here.
    response_urls = []
    username = "user1"
    password = "pass1"
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        browser.browse_page(
            form_url_other,
            username=username,
            password=password,
            on_response=on_response,
        )
    assert len(response_urls) == 4
    assert response_urls[0] == form_url_other
    assert response_urls[1] == favicon_url

@@ -306,10 +326,16 @@ def test_try_login(httpd):

    # when the page doesn't have a form with username/password, don't submit it
    response_urls = []
    form_without_login_url = (
        "http://localhost:%s/site11/form-no-login.html" % httpd.server_port
    )
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        browser.browse_page(
            form_without_login_url,
            username=username,
            password=password,
            on_response=on_response,
        )
    assert len(response_urls) == 2
    assert response_urls[0] == form_without_login_url
    assert response_urls[1] == favicon_url

@@ -1,5 +1,5 @@
#!/usr/bin/env python
"""
test_cli.py - test brozzler commands

Copyright (C) 2017 Internet Archive

@@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import brozzler.cli
import pkg_resources

@@ -23,59 +23,62 @@ import pytest
import subprocess
import doublethink


def cli_commands():
    commands = set(pkg_resources.get_entry_map("brozzler")["console_scripts"].keys())
    commands.remove("brozzler-wayback")
    try:
        import gunicorn
    except ImportError:
        commands.remove("brozzler-dashboard")
    try:
        import pywb
    except ImportError:
        commands.remove("brozzler-easy")
    return commands

@pytest.mark.parametrize("cmd", cli_commands())
def test_call_entrypoint(capsys, cmd):
    entrypoint = pkg_resources.get_entry_map("brozzler")["console_scripts"][cmd]
    callable = entrypoint.resolve()
    with pytest.raises(SystemExit):
        callable(["/whatever/bin/%s" % cmd, "--version"])
    out, err = capsys.readouterr()
    assert out == "brozzler %s - %s\n" % (brozzler.__version__, cmd)
    assert err == ""


@pytest.mark.parametrize("cmd", cli_commands())
def test_run_command(capsys, cmd):
    proc = subprocess.Popen(
        [cmd, "--version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    out, err = proc.communicate()
    assert err == b""
    assert out == ("brozzler %s - %s\n" % (brozzler.__version__, cmd)).encode("ascii")

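`cli_commands()` works because setup.py registers each command as a `console_scripts` entry point, which is what both tests parametrize over. A quick way to inspect what gets iterated (output depends on the installed version and extras; the module shown in the comment is an assumption):

import pkg_resources

for name, ep in sorted(
    pkg_resources.get_entry_map("brozzler")["console_scripts"].items()
):
    print(name, "->", ep.module_name)  # e.g. brozzler-worker -> brozzler.cli
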
def test_rethinkdb_up():
    """Check that rethinkdb is up and running."""
    # check that rethinkdb is listening and looks sane
    rr = doublethink.Rethinker(db="rethinkdb")  # built-in db
    tbls = rr.table_list().run()
    assert len(tbls) > 10

# XXX don't know why this test is failing in travis-ci and vagrant while
# test_call_entrypoint tests pass :( (also fails with capfd)
@pytest.mark.xfail
def test_stop_nonexistent_crawl(capsys):
    with pytest.raises(SystemExit):
        brozzler.cli.brozzler_stop_crawl(["brozzler-stop-crawl", "--site=123"])
    out, err = capsys.readouterr()
    assert err.endswith("site not found with id=123\n")
    assert out == ""

    with pytest.raises(SystemExit):
        brozzler.cli.brozzler_stop_crawl(["brozzler-stop-crawl", "--job=abc"])
    out, err = capsys.readouterr()
    assert err.endswith("""job not found with id='abc'\n""")
    assert out == ""

File diff suppressed because it is too large

@@ -1,5 +1,5 @@
#!/usr/bin/env python
"""
test_units.py - some unit tests for parts of brozzler amenable to that

Copyright (C) 2016-2017 Internet Archive

@@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import pytest
import http.server

@@ -37,99 +37,131 @@ import threading
from unittest import mock

logging.basicConfig(
    stream=sys.stderr,
    level=logging.INFO,
    format=(
        "%(asctime)s %(process)d %(levelname)s %(threadName)s "
        "%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s"
    ),
)

@pytest.fixture(scope="module")
def httpd(request):
    # SimpleHTTPRequestHandler always uses CWD so we have to chdir
    os.chdir(os.path.join(os.path.dirname(__file__), "htdocs"))

    httpd = http.server.HTTPServer(
        ("localhost", 0), http.server.SimpleHTTPRequestHandler
    )
    httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
    httpd_thread.start()

    def fin():
        httpd.shutdown()
        httpd.server_close()
        httpd_thread.join()

    request.addfinalizer(fin)

    return httpd

def test_robots(httpd):
    """
    Basic test of robots.txt user-agent substring matching.
    """
    url = "http://localhost:%s/" % httpd.server_port
    site = brozzler.Site(None, {"seed": url, "user_agent": "im/a/GoOdbot/yep"})
    assert brozzler.is_permitted_by_robots(site, url)

    site = brozzler.Site(None, {"seed": url, "user_agent": "im/a bAdBOt/uh huh"})
    assert not brozzler.is_permitted_by_robots(site, url)

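The fixture's robots.txt under htdocs is not shown in this diff; for the substring matching above to behave as asserted, it presumably contains rules along these lines (hypothetical reconstruction, the actual file may differ):

User-agent: badbot
Disallow: /
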
def test_robots_http_statuses():
    for status in (
        200,
        204,
        400,
        401,
        402,
        403,
        404,
        405,
        500,
        501,
        502,
        503,
        504,
        505,
    ):

        class Handler(http.server.BaseHTTPRequestHandler):
            def do_GET(self):
                response = (
                    (
                        "HTTP/1.1 %s Meaningless message\r\n"
                        + "Content-length: 0\r\n"
                        + "\r\n"
                    )
                    % status
                ).encode("utf-8")
                self.connection.sendall(response)
                # self.send_response(status)
                # self.end_headers()

        httpd = http.server.HTTPServer(("localhost", 0), Handler)
        httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
        httpd_thread.start()

        try:
            url = "http://localhost:%s/" % httpd.server_port
            site = brozzler.Site(None, {"seed": url})
            assert brozzler.is_permitted_by_robots(site, url)
        finally:
            httpd.shutdown()
            httpd.server_close()
            httpd_thread.join()

def test_robots_empty_response():
    class Handler(http.server.BaseHTTPRequestHandler):
        def do_GET(self):
            self.connection.shutdown(socket.SHUT_RDWR)
            self.connection.close()

    httpd = http.server.HTTPServer(("localhost", 0), Handler)
    httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
    httpd_thread.start()

    try:
        url = "http://localhost:%s/" % httpd.server_port
        site = brozzler.Site(None, {"seed": url})
        assert brozzler.is_permitted_by_robots(site, url)
    finally:
        httpd.shutdown()
        httpd.server_close()
        httpd_thread.join()

def test_robots_socket_timeout():
    stop_hanging = threading.Event()

    class Handler(http.server.BaseHTTPRequestHandler):
        def do_GET(self):
            stop_hanging.wait(60)
            self.connection.sendall(b"HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n")

    orig_timeout = brozzler.robots._SessionRaiseOn420.timeout

    httpd = http.server.HTTPServer(("localhost", 0), Handler)
    httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
    httpd_thread.start()

    try:
        url = "http://localhost:%s/" % httpd.server_port
        site = brozzler.Site(None, {"seed": url})
        brozzler.robots._SessionRaiseOn420.timeout = 2
        assert brozzler.is_permitted_by_robots(site, url)
    finally:

@@ -139,20 +171,24 @@ def test_robots_socket_timeout():

        httpd.server_close()
        httpd_thread.join()

def test_robots_dns_failure():
    # .invalid. is guaranteed nonexistent per rfc 6761
    url = "http://whatever.invalid./"
    site = brozzler.Site(None, {"seed": url})
    assert brozzler.is_permitted_by_robots(site, url)


def test_robots_connection_failure():
    url = "http://localhost:4/"  # nobody listens on port 4
    site = brozzler.Site(None, {"seed": url})
    assert brozzler.is_permitted_by_robots(site, url)

def test_scoping():
    test_scope = yaml.safe_load(
        """
max_hops: 100
accepts:
- url_match: REGEX_MATCH

@@ -169,40 +205,73 @@ blocks:

- domain: twitter.com
  url_match: REGEX_MATCH
  value: ^.*lang=(?!en).*$
"""
    )

    site = brozzler.Site(
        None,
        {
            "id": 1,
            "seed": "http://example.com/foo/bar?baz=quux#monkey",
            "scope": test_scope,
        },
    )
    page = brozzler.Page(
        None, {"url": "http://example.com/foo/bar?baz=quux#monkey", "site_id": site.id}
    )

    assert site.accept_reject_or_neither("http://example.com/foo/bar", page) is True
    assert site.accept_reject_or_neither("http://example.com/foo/baz", page) is None

    assert site.accept_reject_or_neither("http://foo.com/some.mp3", page) is None
    assert (
        site.accept_reject_or_neither("http://foo.com/blah/audio_file/some.mp3", page)
        is True
    )

    assert (
        site.accept_reject_or_neither("http://a.b.vimeocdn.com/blahblah", page) is True
    )
    assert (
        site.accept_reject_or_neither("https://a.b.vimeocdn.com/blahblah", page) is None
    )

    assert site.accept_reject_or_neither("https://twitter.com/twit", page) is True
    assert (
        site.accept_reject_or_neither("https://twitter.com/twit?lang=en", page) is True
    )
    assert (
        site.accept_reject_or_neither("https://twitter.com/twit?lang=es", page) is False
    )

    assert (
        site.accept_reject_or_neither("https://www.facebook.com/whatevz", page) is True
    )

    assert (
        site.accept_reject_or_neither(
            "https://www.youtube.com/watch?v=dUIn5OAPS5s", page
        )
        is None
    )
    yt_user_page = brozzler.Page(
        None,
        {
            "url": "https://www.youtube.com/user/SonoraSantaneraVEVO",
            "site_id": site.id,
            "hops_from_seed": 10,
        },
    )
    assert (
        site.accept_reject_or_neither(
            "https://www.youtube.com/watch?v=dUIn5OAPS5s", yt_user_page
        )
        is True
    )

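The tri-state return value exercised above is: `True` when an accept rule matches, `False` when a block rule matches, and `None` when no explicit rule applies and default scoping decides. A sketch of how a caller might branch on it (`schedule` is a hypothetical stand-in, not brozzler's frontier code):

def decide(site, page, url, schedule):
    decision = site.accept_reject_or_neither(url, page)
    if decision is True:
        schedule(url)  # an accept rule matched
    elif decision is False:
        pass  # a block rule matched; drop the link
    elif page.hops_from_seed + 1 <= site.scope.get("max_hops", 0):
        schedule(url)  # no explicit rule; default hop limit decides
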
def test_proxy_down():
    """
    Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.

    This test needs to cover every possible fetch through the proxy other than

@@ -211,24 +280,24 @@ def test_proxy_down():

    Tests two different kinds of connection error:
    - nothing listening on the port (nobody listens on port 4 :))
    - port bound but not accepting connections
    """
    sock = socket.socket()
    sock.bind(("127.0.0.1", 0))
    for not_listening_proxy in ("127.0.0.1:4", "127.0.0.1:%s" % sock.getsockname()[1]):
        worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
        site = brozzler.Site(
            None, {"id": str(uuid.uuid4()), "seed": "http://example.com/"}
        )
        page = brozzler.Page(None, {"url": "http://example.com/"})

        # robots.txt fetch
        with pytest.raises(brozzler.ProxyError):
            brozzler.is_permitted_by_robots(
                site, "http://example.com/", proxy=not_listening_proxy
            )

        # youtube-dl fetch
        with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
            with pytest.raises(brozzler.ProxyError):
                brozzler.ydl.do_youtube_dl(worker, site, page)

@@ -240,46 +309,57 @@ def test_proxy_down():

        with pytest.raises(brozzler.ProxyError):
            worker._warcprox_write_record(
                warcprox_address=not_listening_proxy,
                url="test://proxy_down/warcprox_write_record",
                warc_type="metadata",
                content_type="text/plain",
                payload=b"""payload doesn't matter here""",
            )

def test_start_stop_backwards_compat():
    site = brozzler.Site(None, {"seed": "http://example.com/"})
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]["start"]
    assert site.starts_and_stops[0]["stop"] is None
    assert not "start_time" in site

    site = brozzler.Site(
        None,
        {"seed": "http://example.com/", "start_time": datetime.datetime(2017, 1, 1)},
    )
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
    assert site.starts_and_stops[0]["stop"] is None
    assert not "start_time" in site

    job = brozzler.Job(None, {"seeds": [{"url": "https://example.com/"}]})
    assert job.starts_and_stops[0]["start"]
    assert job.starts_and_stops[0]["stop"] is None
    assert not "started" in job
    assert not "finished" in job

    job = brozzler.Job(
        None,
        {
            "seeds": [{"url": "https://example.com/"}],
            "started": datetime.datetime(2017, 1, 1),
            "finished": datetime.datetime(2017, 1, 2),
        },
    )
    assert job.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
    assert job.starts_and_stops[0]["stop"] == datetime.datetime(2017, 1, 2)
    assert not "started" in job
    assert not "finished" in job

class Exception1(Exception):
    pass


class Exception2(Exception):
    pass

def test_thread_raise_not_accept():
    def never_accept():
        try:

@@ -297,6 +377,7 @@ def test_thread_raise_not_accept():

    th.join()
    assert thread_caught_exception is None

def test_thread_raise_immediate():
    def accept_immediately():
        try:

@@ -317,13 +398,17 @@ def test_thread_raise_immediate():

    assert isinstance(thread_caught_exception, Exception1)
    assert time.time() - start < 1.0

def test_thread_raise_safe_exit():
    def delay_context_exit():
        gate = brozzler.thread_accept_exceptions()
        orig_exit = type(gate).__exit__
        try:
            type(gate).__exit__ = lambda self, et, ev, t: (
                brozzler.sleep(2),
                orig_exit(self, et, ev, t),
                False,
            )[-1]
            with brozzler.thread_accept_exceptions() as gate:
                brozzler.sleep(2)
        except Exception as e:

@@ -345,6 +430,7 @@ def test_thread_raise_safe_exit():

    assert thread_caught_exception
    assert isinstance(thread_caught_exception, Exception1)

def test_thread_raise_pending_exception():
    def accept_eventually():
        try:

@@ -365,6 +451,7 @@ def test_thread_raise_pending_exception():

    assert isinstance(thread_caught_exception, Exception1)
    assert time.time() - start > 1.0

def test_thread_raise_second_with_block():
    def two_with_blocks():
        try:

@@ -393,52 +480,79 @@ def test_thread_raise_second_with_block():

    th.join()
    assert isinstance(thread_caught_exception, Exception2)

def test_needs_browsing():
    # only one test case here right now, which exposed a bug

    class ConvenientHeaders(http.client.HTTPMessage):
        def __init__(self, headers):
            http.client.HTTPMessage.__init__(self)
            for k, v in headers.items():
                self.add_header(k, v)

    page = brozzler.Page(None, {"url": "http://example.com/a"})

    spy = brozzler.ydl.YoutubeDLSpy()
    spy.fetches.append(
        {
            "url": "http://example.com/a",
            "method": "HEAD",
            "response_code": 301,
            "response_headers": ConvenientHeaders({"Location": "/b"}),
        }
    )
    spy.fetches.append(
        {
            "url": "http://example.com/b",
            "method": "GET",
            "response_code": 200,
            "response_headers": ConvenientHeaders({"Content-Type": "application/pdf"}),
        }
    )

    assert not brozzler.worker.BrozzlerWorker._needs_browsing(None, page, spy.fetches)

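In plain terms, `_needs_browsing` is answering: after youtube-dl's spy recorded the fetches, did the page URL resolve (via redirects) to something a browser should render? The case above follows a 301 to an `application/pdf`, so browsing is skipped. A condensed, hypothetical restatement of that rule, not the real implementation:

def needs_browsing(fetches):
    # browse only if the last fetch in the recorded redirect chain was html
    content_type = fetches[-1]["response_headers"].get("Content-Type", "")
    return content_type.startswith("text/html")
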
def test_seed_redirect():
    site = brozzler.Site(None, {"seed": "http://foo.com/"})
    site.note_seed_redirect("https://foo.com/a/b/c")
    assert site.scope == {
        "accepts": [
            {
                "ssurt": "com,foo,//http:/",
            },
            {
                "ssurt": "com,foo,//https:/",
            },
        ]
    }

    site = brozzler.Site(None, {"seed": "https://foo.com/"})
    site.note_seed_redirect("http://foo.com/a/b/c")
    assert site.scope == {
        "accepts": [
            {
                "ssurt": "com,foo,//https:/",
            },
            {
                "ssurt": "com,foo,//http:/",
            },
        ]
    }

    site = brozzler.Site(None, {"seed": "http://foo.com/"})
    site.note_seed_redirect("https://bar.com/a/b/c")
    assert site.scope == {
        "accepts": [
            {
                "ssurt": "com,foo,//http:/",
            },
            {
                "ssurt": "com,bar,//https:/a/b/c",
            },
        ]
    }

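The `ssurt` strings asserted above are SSURT-canonicalized URL prefixes: host labels reversed and comma-separated, then scheme, then path, so that a simple prefix match covers a whole site. brozzler derives these with the urlcanon library; a minimal sketch, assuming urlcanon's semantic canonicalizer (which brozzler depends on):

import urlcanon

url = urlcanon.semantic(urlcanon.parse_url("https://foo.com/"))
print(url.ssurt())  # expected: b'com,foo,//https:/', the prefix asserted above
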
def test_limit_failures():
    page = mock.Mock()

@@ -446,9 +560,9 @@ def test_limit_failures():

    page.brozzle_count = 0

    site = mock.Mock()
    site.status = "ACTIVE"
    site.active_brozzling_time = 0
    site.starts_and_stops = [{"start": datetime.datetime.utcnow()}]

    rr = mock.Mock()
    rr.servers = [mock.Mock()]

@@ -458,9 +572,10 @@ def test_limit_failures():

    rr.table = mock.Mock(
        return_value=mock.Mock(
            between=mock.Mock(
                return_value=mock.Mock(limit=mock.Mock(return_value=rethink_query))
            )
        )
    )
    assert rr.table().between().limit().run() == []
    frontier = brozzler.RethinkDbFrontier(rr)
    frontier.enforce_time_limit = mock.Mock()

@@ -475,20 +590,19 @@ def test_limit_failures():

    assert page.failed_attempts is None
    assert page.brozzle_count == 0
    assert site.status == "ACTIVE"

    worker.brozzle_site(browser, site)
    assert page.failed_attempts == 1
    assert page.brozzle_count == 0
    assert site.status == "ACTIVE"

    worker.brozzle_site(browser, site)
    assert page.failed_attempts == 2
    assert page.brozzle_count == 0
    assert site.status == "ACTIVE"

    worker.brozzle_site(browser, site)
    assert page.failed_attempts == 3
    assert page.brozzle_count == 1
    assert site.status == "FINISHED"

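The arithmetic these assertions encode: each failed brozzle increments `failed_attempts`, and on the third failure the page is written off as brozzled (`brozzle_count` goes to 1) so the site can finish. A toy restatement of that policy (names here are illustrative, not brozzler's internals):

MAX_FAILED_ATTEMPTS = 3  # hypothetical name for the threshold the test observes


def record_failure(page):
    page.failed_attempts = (page.failed_attempts or 0) + 1
    if page.failed_attempts >= MAX_FAILED_ATTEMPTS:
        page.brozzle_count += 1  # give up; count the page as brozzled
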
@@ -1,5 +1,5 @@
#!/usr/bin/env python
"""
vagrant-brozzler-new-job.py - runs brozzler-new-job inside the vagrant vm to
queue a job for your vagrant brozzler deployment.

@@ -20,30 +20,39 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import sys
import os
import argparse
import subprocess


def main(argv=[]):
    arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0]))
    arg_parser.add_argument(
        "job_conf_file",
        metavar="JOB_CONF_FILE",
        help="brozzler job configuration file in yaml",
    )
    args = arg_parser.parse_args(args=argv[1:])

    # cd to path with Vagrantfile so "vagrant ssh" knows what to do
    os.chdir(os.path.dirname(__file__))

    with open(args.job_conf_file, "rb") as f:
        subprocess.call(
            [
                "vagrant",
                "ssh",
                "--",
                "f=`mktemp` && cat > $f && "
                "/home/vagrant/brozzler-ve3/bin/python "
                "/home/vagrant/brozzler-ve3/bin/brozzler-new-job $f",
            ],
            stdin=f,
        )


if __name__ == "__main__":
    main(sys.argv)

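The "f=`mktemp` && cat > $f && ..." incantation streams the local YAML over the ssh channel's stdin into a temp file inside the VM, so the job config never needs to exist on the guest beforehand. A hypothetical invocation, with a job config of your own:

python vagrant-brozzler-new-job.py my-job.yml
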
@@ -1,5 +1,5 @@
#!/usr/bin/env python
"""
vagrant-brozzler-new-site.py - runs brozzler-new-site inside the vagrant vm to
queue a site for your vagrant brozzler deployment.

@@ -23,61 +23,69 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import sys
import os
import argparse
import subprocess

try:
    from shlex import quote
except:
    from pipes import quote


def main(argv=[]):
    arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0]))
    arg_parser.add_argument("seed", metavar="SEED", help="seed url")
    arg_parser.add_argument(
        "--time-limit",
        dest="time_limit",
        default=None,
        help="time limit in seconds for this site",
    )
    arg_parser.add_argument(
        "--ignore-robots",
        dest="ignore_robots",
        action="store_true",
        help="ignore robots.txt for this site",
    )
    arg_parser.add_argument(
        "--warcprox-meta",
        dest="warcprox_meta",
        help=(
            "Warcprox-Meta http request header to send with each request; "
            "must be a json blob, ignored unless warcprox features are "
            "enabled"
        ),
    )
    arg_parser.add_argument("-q", "--quiet", dest="quiet", action="store_true")
    arg_parser.add_argument("-v", "--verbose", dest="verbose", action="store_true")

    args = arg_parser.parse_args(args=argv[1:])

    options = []
    if args.time_limit:
        options.append("--time-limit=%s" % args.time_limit)
    if args.ignore_robots:
        options.append("--ignore-robots")
    if args.warcprox_meta:
        # I think this shell escaping is correct?
        options.append("--warcprox-meta=%s" % quote(args.warcprox_meta))
    if args.quiet:
        options.append("--quiet")
    if args.verbose:
        options.append("--verbose")

    # cd to path with Vagrantfile so "vagrant ssh" knows what to do
    os.chdir(os.path.dirname(__file__))

    cmd = (
        "/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site " "%s %s"
    ) % (" ".join(options), args.seed)
    subprocess.call(["vagrant", "ssh", "--", cmd])


if __name__ == "__main__":
    main(sys.argv)

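Taken together, the options assemble into a single brozzler-new-site command line run inside the VM. A hypothetical invocation with a one-hour time limit:

python vagrant-brozzler-new-site.py --time-limit=3600 --ignore-robots http://example.com/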