Merge branch 'logging' into qa

Misty De Méo 2025-02-21 09:27:44 -08:00
commit 5f1e92c23f
15 changed files with 446 additions and 397 deletions

View File

@ -18,6 +18,7 @@ limitations under the License.
"""
import logging
import structlog
from pkg_resources import get_distribution as _get_distribution
__version__ = _get_distribution("brozzler").version
@ -79,32 +80,6 @@ class ReachedLimit(Exception):
return self.__repr__()
# monkey-patch log levels TRACE and NOTICE
logging.TRACE = (logging.NOTSET + logging.DEBUG) // 2
def _logger_trace(self, msg, *args, **kwargs):
if self.isEnabledFor(logging.TRACE):
self._log(logging.TRACE, msg, args, **kwargs)
logging.Logger.trace = _logger_trace
logging.trace = logging.root.trace
logging.addLevelName(logging.TRACE, "TRACE")
logging.NOTICE = (logging.INFO + logging.WARN) // 2
def _logger_notice(self, msg, *args, **kwargs):
if self.isEnabledFor(logging.NOTICE):
self._log(logging.NOTICE, msg, args, **kwargs)
logging.Logger.notice = _logger_notice
logging.notice = logging.root.notice
logging.addLevelName(logging.NOTICE, "NOTICE")
# see https://github.com/internetarchive/brozzler/issues/91
def _logging_handler_handle(self, record):
rv = self.filter(record)
@ -146,7 +121,9 @@ def behavior_script(url, template_parameters=None, behaviors_dir=None):
"""
Returns the javascript behavior string populated with template_parameters.
"""
import re, logging, json
import re, json
logger = structlog.get_logger()
for behavior in behaviors(behaviors_dir=behaviors_dir):
if re.match(behavior["url_regex"], url):
@ -159,18 +136,18 @@ def behavior_script(url, template_parameters=None, behaviors_dir=None):
behavior["behavior_js_template"]
)
script = template.render(parameters)
logging.info(
"using template=%r populated with parameters=%r for %r",
behavior["behavior_js_template"],
json.dumps(parameters),
url,
logger.info(
"rendering template",
template=behavior["behavior_js_template"],
parameters=json.dumps(parameters),
url=url,
)
return script
return None
class ThreadExceptionGate:
logger = logging.getLogger(__module__ + "." + __qualname__)
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
def __init__(self, thread):
self.thread = thread
@ -181,7 +158,9 @@ class ThreadExceptionGate:
def __enter__(self):
assert self.thread == threading.current_thread()
if self.pending_exception:
self.logger.info("raising pending exception %s", self.pending_exception)
self.logger.info(
"raising pending exception", pending_exception=self.pending_exception
)
tmp = self.pending_exception
self.pending_exception = None
raise tmp
@ -198,10 +177,10 @@ class ThreadExceptionGate:
with self.lock:
if self.pending_exception:
self.logger.warning(
"%r already pending for thread %r, discarding %r",
self.pending_exception,
self.thread,
e,
"exception already pending for thread, discarding",
pending_exception=self.pending_exception,
thread=self.thread,
exception=e,
)
else:
self.pending_exception = e
@ -266,7 +245,9 @@ def thread_raise(thread, exctype):
TypeError if `exctype` is not a class
ValueError, SystemError in case of unexpected problems
"""
import ctypes, inspect, threading, logging
import ctypes, inspect, threading, structlog
logger = structlog.get_logger(exctype=exctype, thread=thread)
if not inspect.isclass(exctype):
raise TypeError(
@ -278,7 +259,7 @@ def thread_raise(thread, exctype):
with gate.lock:
if gate.ok_to_raise.is_set() and thread.is_alive():
gate.ok_to_raise.clear()
logging.info("raising %s in thread %s", exctype, thread)
logger.info("raising exception in thread")
res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
ctypes.c_long(thread.ident), ctypes.py_object(exctype)
)
@ -290,7 +271,7 @@ def thread_raise(thread, exctype):
ctypes.pythonapi.PyThreadState_SetAsyncExc(thread.ident, 0)
raise SystemError("PyThreadState_SetAsyncExc failed")
else:
logging.info("queueing %s for thread %s", exctype, thread)
logger.info("queueing exception for thread")
gate.queue_exception(exctype)
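
A note on the pattern shown in the hunk above, which recurs throughout this branch: thread_raise() now binds its context once at logger creation, via structlog.get_logger(exctype=exctype, thread=thread), so those key-value pairs ride along on every event instead of being %-interpolated into each message. A minimal illustrative sketch, not part of this commit (values and rendered output are approximate):

import structlog

log = structlog.get_logger(exctype="ValueError", thread="Thread-1")  # illustrative values
log.info("raising exception in thread")
# renders roughly as:
# 2025-02-21 09:27:44 [info     ] raising exception in thread    exctype=ValueError thread=Thread-1
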

View File

@ -31,6 +31,7 @@ import base64
from ipaddress import AddressValueError
from brozzler.chrome import Chrome
import socket
import structlog
import urlcanon
MAX_UNMATCHED_INVALID_CHECKS = 5
@ -53,7 +54,7 @@ class BrowserPool:
debugging protocol.
"""
logger = logging.getLogger(__module__ + "." + __qualname__)
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
def __init__(self, size=3, **kwargs):
"""
@ -144,7 +145,7 @@ class BrowserPool:
class WebsockReceiverThread(threading.Thread):
logger = logging.getLogger(__module__ + "." + __qualname__)
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
def __init__(self, websock, name=None, daemon=True):
super().__init__(name=name, daemon=daemon)
@ -194,7 +195,7 @@ class WebsockReceiverThread(threading.Thread):
):
self.logger.error("websocket closed, did chrome die?")
else:
self.logger.error("exception from websocket receiver thread", exc_info=1)
self.logger.exception("exception from websocket receiver thread")
brozzler.thread_raise(self.calling_thread, BrowsingException)
def run(self):
@ -214,10 +215,9 @@ class WebsockReceiverThread(threading.Thread):
try:
self._handle_message(websock, message)
except:
self.logger.error(
"uncaught exception in _handle_message message=%s",
message,
exc_info=True,
self.logger.exception(
"uncaught exception in _handle_message",
message=message,
)
def _network_response_received(self, message):
@ -232,7 +232,7 @@ class WebsockReceiverThread(threading.Thread):
]
)
self.reached_limit = brozzler.ReachedLimit(warcprox_meta=warcprox_meta)
self.logger.info("reached limit %s", self.reached_limit)
self.logger.info("reached limit", limit=self.reached_limit)
brozzler.thread_raise(self.calling_thread, brozzler.ReachedLimit)
else:
self.logger.info(
@ -246,7 +246,7 @@ class WebsockReceiverThread(threading.Thread):
self.page_status = status
def _javascript_dialog_opening(self, message):
self.logger.info("javascript dialog opened: %s", message)
self.logger.info("javascript dialog opened", message=message)
if message["params"]["type"] == "alert":
accept = True
else:
@ -293,7 +293,7 @@ class WebsockReceiverThread(threading.Thread):
message["params"]["message"]["text"],
)
elif message["method"] == "Runtime.exceptionThrown":
self.logger.debug("uncaught exception: %s", message)
self.logger.debug("uncaught exception", exception=message)
elif message["method"] == "Page.javascriptDialogOpening":
self._javascript_dialog_opening(message)
elif (
@ -323,7 +323,7 @@ class Browser:
Manages an instance of Chrome for browsing pages.
"""
logger = logging.getLogger(__module__ + "." + __qualname__)
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
def __init__(self, **kwargs):
"""
@ -366,11 +366,10 @@ class Browser:
msg_id = next(self._command_id)
kwargs["id"] = msg_id
msg = json.dumps(kwargs, separators=",:")
logging.log(
logging.TRACE if suppress_logging else logging.DEBUG,
"sending message to %s: %s",
self.websock,
msg,
self.logger.debug(
"sending message",
websock=self.websock,
message=msg,
)
self.websock.send(msg)
return msg_id
@ -398,7 +397,7 @@ class Browser:
# Enable Console & Runtime output only when debugging.
# After all, we just print these events with debug(), we don't use
# them in Brozzler logic.
if self.logger.isEnabledFor(logging.DEBUG):
if self.logger.is_enabled_for(logging.DEBUG):
self.send_to_chrome(method="Console.enable")
self.send_to_chrome(method="Runtime.enable")
self.send_to_chrome(method="ServiceWorker.enable")
@ -433,8 +432,8 @@ class Browser:
try:
self.websock.close()
except BaseException as e:
self.logger.error(
"exception closing websocket %s - %s", self.websock, e
self.logger.exception(
"exception closing websocket", websocket=self.websock
)
self.chrome.stop()
@ -461,7 +460,7 @@ class Browser:
self.websock_url = None
except:
self.logger.error("problem stopping", exc_info=True)
self.logger.exception("problem stopping")
def is_running(self):
return self.websock_url is not None
@ -567,7 +566,7 @@ class Browser:
# if login redirected us, return to page_url
if page_url != self.url().split("#")[0]:
self.logger.debug(
"login navigated away from %s; returning!", page_url
"login navigated away; returning!", page_url=page_url
)
self.navigate_to_page(page_url, timeout=page_timeout)
# If the target page HTTP status is 4xx/5xx, there is no point
@ -611,7 +610,7 @@ class Browser:
# more information, raise that one
raise self.websock_thread.reached_limit
except websocket.WebSocketConnectionClosedException as e:
self.logger.error("websocket closed, did chrome die?")
self.logger.exception("websocket closed, did chrome die?")
raise BrowsingException(e)
finally:
self.is_browsing = False
@ -633,7 +632,7 @@ class Browser:
on_screenshot(jpeg_bytes)
return
except BrowsingTimeout as e:
logging.error("attempt %s/3: %s", i + 1, e)
self.logger.exception("attempt %s/3", i + 1)
def visit_hashtags(self, page_url, hashtags, outlinks):
_hashtags = set(hashtags or [])
@ -647,7 +646,7 @@ class Browser:
# out which hashtags were visited already and skip those
for hashtag in _hashtags:
# navigate_to_hashtag (nothing to wait for so no timeout?)
self.logger.debug("navigating to hashtag %s", hashtag)
self.logger.debug("navigating to hashtag", hashtag=hashtag)
url = urlcanon.whatwg(page_url)
url.hash_sign = b"#"
url.fragment = hashtag[1:].encode("utf-8")
@ -687,7 +686,7 @@ class Browser:
)
def navigate_to_page(self, page_url, timeout=300):
self.logger.info("navigating to page %s", page_url)
self.logger.info("navigating to page", page_url=page_url)
self.websock_thread.got_page_load_event = None
self.websock_thread.page_status = None
self.send_to_chrome(method="Page.navigate", params={"url": page_url})
@ -715,15 +714,13 @@ class Browser:
try:
out.append(str(urlcanon.whatwg(link)))
except AddressValueError:
self.logger.warning("skip invalid outlink: %s", link)
self.logger.warning("skip invalid outlink", outlink=link)
return frozenset(out)
else:
# no links found
return frozenset()
else:
self.logger.error(
"problem extracting outlinks, result message: %s", message
)
self.logger.error("problem extracting outlinks", message=message)
return frozenset()
def screenshot(self, full_page=False, timeout=45):
@ -797,11 +794,11 @@ class Browser:
elapsed = time.time() - start
if elapsed > timeout:
logging.info(
"behavior reached hard timeout after %.1fs and %s valid checks, and %s invalid checks, for url %s",
"behavior reached hard timeout after %.1fs and %s valid checks, and %s invalid checks",
elapsed,
valid_behavior_checks,
invalid_behavior_checks,
page_url,
page_url=page_url,
)
return

View File

@ -16,7 +16,6 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
import logging
import urllib.request
import time
import threading
@ -27,6 +26,7 @@ import select
import re
import signal
import sqlite3
import structlog
import json
import tempfile
import sys
@ -65,7 +65,7 @@ def check_version(chrome_exe):
class Chrome:
logger = logging.getLogger(__module__ + "." + __qualname__)
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
def __init__(self, chrome_exe, port=9222, ignore_cert_errors=False):
"""
@ -97,22 +97,22 @@ class Chrome:
def _init_cookie_db(self, cookie_db):
cookie_dir = os.path.join(self._chrome_user_data_dir, "Default")
cookie_location = os.path.join(cookie_dir, "Cookies")
self.logger.debug("cookie DB provided, writing to %s", cookie_location)
cookie_logger = self.logger.bind(cookie_location=cookie_location)
cookie_logger.debug("cookie DB provided, writing to")
os.makedirs(cookie_dir, exist_ok=True)
try:
with open(cookie_location, "wb") as cookie_file:
cookie_file.write(cookie_db)
except OSError:
self.logger.error(
"exception writing cookie file at %s", cookie_location, exc_info=True
)
cookie_logger.exception("exception writing cookie file")
def persist_and_read_cookie_db(self):
cookie_location = os.path.join(self._chrome_user_data_dir, "Default", "Cookies")
self.logger.debug(
"marking cookies persistent then reading file into memory: %s",
cookie_location,
cookie_logger = self.logger.bind(cookie_location=cookie_location)
cookie_logger.debug(
"marking cookies persistent then reading file into memory",
)
try:
with sqlite3.connect(cookie_location) as conn:
@ -125,20 +125,14 @@ class Chrome:
cur = conn.cursor()
cur.execute("UPDATE cookies SET persistent = 1")
except sqlite3.Error:
self.logger.error(
"exception updating cookie DB %s", cookie_location, exc_info=True
)
cookie_logger.exception("exception updating cookie DB")
cookie_db = None
try:
with open(cookie_location, "rb") as cookie_file:
cookie_db = cookie_file.read()
except OSError:
self.logger.error(
"exception reading from cookie DB file %s",
cookie_location,
exc_info=True,
)
cookie_logger.exception("exception reading from cookie DB file")
return cookie_db
def start(
@ -228,7 +222,7 @@ class Chrome:
if proxy:
chrome_args.append("--proxy-server=%s" % proxy)
chrome_args.append("about:blank")
self.logger.info("running: %r", subprocess.list2cmdline(chrome_args))
self.logger.info("running", chrome_args=subprocess.list2cmdline(chrome_args))
# start_new_session - new process group so we can kill the whole group
self.chrome_process = subprocess.Popen(
chrome_args,
@ -244,12 +238,13 @@ class Chrome:
daemon=True,
)
self._out_reader_thread.start()
self.logger.info("chrome running, pid %s" % self.chrome_process.pid)
self.logger.info("chrome running", pid=self.chrome_process.pid)
return self._websocket_url(timeout_sec=websocket_timeout)
def _websocket_url(self, timeout_sec=60):
json_url = "http://localhost:%s/json" % self.port
url_logger = self.logger.bind(json_url=json_url)
# make this a member variable so that kill -QUIT reports it
self._start = time.time()
self._last_warning = self._start
@ -260,24 +255,21 @@ class Chrome:
debug_info = [x for x in all_debug_info if x["url"] == "about:blank"]
if debug_info and "webSocketDebuggerUrl" in debug_info[0]:
self.logger.debug("%s returned %s", json_url, raw_json)
url_logger.debug("webSocketDebuggerUrl returned", raw_json=raw_json)
url = debug_info[0]["webSocketDebuggerUrl"]
self.logger.info(
"got chrome window websocket debug url %s from %s",
url,
json_url,
url_logger.info(
"got chrome window websocket debug url",
debug_url=url,
)
return url
except brozzler.ShutdownRequested:
raise
except Exception as e:
if time.time() - self._last_warning > 30:
self.logger.warning(
"problem with %s (will keep trying until timeout "
"of %d seconds): %s",
json_url,
timeout_sec,
e,
url_logger.warning(
"problem accessing url (will keep trying until timeout)",
timeout=timeout_sec,
exc_info=True,
)
self._last_warning = time.time()
finally:
@ -323,26 +315,28 @@ class Chrome:
while not self._shutdown.is_set():
buf = readline_nonblock(self.chrome_process.stdout)
if buf:
self.logger.trace(
self.logger.debug(
"chrome pid %s STDOUT %s", self.chrome_process.pid, buf
)
buf = readline_nonblock(self.chrome_process.stderr)
if buf:
self.logger.trace(
self.logger.debug(
"chrome pid %s STDERR %s", self.chrome_process.pid, buf
)
except:
self.logger.error("unexpected exception", exc_info=True)
self.logger.exception("unexpected exception")
def stop(self):
if not self.chrome_process or self._shutdown.is_set():
return
self._shutdown.set()
pid_logger = self.logger.bind(pid=self.chrome_process.pid)
timeout_sec = 300
if self.chrome_process.poll() is None:
self.logger.info("terminating chrome pgid %s", self.chrome_process.pid)
pid_logger.info("terminating chrome")
os.killpg(self.chrome_process.pid, signal.SIGTERM)
t0 = time.time()
@ -352,14 +346,11 @@ class Chrome:
status = self.chrome_process.poll()
if status is not None:
if status == 0:
self.logger.info(
"chrome pid %s exited normally", self.chrome_process.pid
)
pid_logger.info("chrome exited normally")
else:
self.logger.warning(
"chrome pid %s exited with nonzero status %s",
self.chrome_process.pid,
status,
pid_logger.warning(
"chrome exited with nonzero status",
status=status,
)
# XXX I would like to forcefully kill the process group
@ -369,18 +360,16 @@ class Chrome:
return
time.sleep(0.5)
self.logger.warning(
"chrome pid %s still alive %.1f seconds after sending "
pid_logger.warning(
"chrome still alive %.1f seconds after sending "
"SIGTERM, sending SIGKILL",
self.chrome_process.pid,
time.time() - t0,
)
os.killpg(self.chrome_process.pid, signal.SIGKILL)
status = self.chrome_process.wait()
self.logger.warning(
"chrome pid %s reaped (status=%s) after killing with " "SIGKILL",
self.chrome_process.pid,
status,
pid_logger.warning(
"chrome reaped after killing with " "SIGKILL",
status=status,
)
finally:
@ -389,8 +378,8 @@ class Chrome:
try:
self._home_tmpdir.cleanup()
except:
self.logger.error(
"exception deleting %s", self._home_tmpdir, exc_info=True
self.logger.exception(
"exception deleting self._home_tmpdir", tmpdir=self._home_tmpdir
)
self._out_reader_thread.join()
self.chrome_process = None

View File

@ -29,6 +29,8 @@ import requests
import doublethink
import signal
import string
import structlog
import subprocess
import sys
import threading
import time
@ -41,6 +43,8 @@ import rethinkdb as rdb
r = rdb.RethinkDB()
logger = structlog.get_logger()
def add_common_options(arg_parser, argv=None):
argv = argv or sys.argv
@ -50,7 +54,7 @@ def add_common_options(arg_parser, argv=None):
dest="log_level",
action="store_const",
default=logging.INFO,
const=logging.NOTICE,
const=logging.WARN,
help="quiet logging",
)
arg_parser.add_argument(
@ -67,7 +71,7 @@ def add_common_options(arg_parser, argv=None):
dest="log_level",
action="store_const",
default=logging.INFO,
const=logging.TRACE,
const=logging.DEBUG,
help=("very verbose logging"),
)
# arg_parser.add_argument(
@ -108,7 +112,50 @@ def rethinker(args):
return doublethink.Rethinker(servers.split(","), db)
# Decorates the logger name with call location, if provided
def decorate_logger_name(a, b, event_dict):
old_name = event_dict.get("logger_name")
if old_name is None:
return event_dict
try:
filename = event_dict.pop("filename")
func_name = event_dict.pop("func_name")
lineno = event_dict.pop("lineno")
except KeyError:
return event_dict
new_name = f"{old_name}.{func_name}({filename}:{lineno})"
event_dict["logger_name"] = new_name
return event_dict
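
An illustrative before/after of what this processor does to an event_dict, given the callsite keys that CallsiteParameterAdder (configured below) adds; the concrete values here are hypothetical:

# in:  {"logger_name": "brozzler.browser.Browser", "event": "navigating to page",
#       "filename": "browser.py", "func_name": "navigate_to_page", "lineno": 689}
# out: {"logger_name": "brozzler.browser.Browser.navigate_to_page(browser.py:689)",
#       "event": "navigating to page"}
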
def configure_logging(args):
structlog.configure(
processors=[
structlog.contextvars.merge_contextvars,
structlog.processors.add_log_level,
structlog.processors.StackInfoRenderer(),
structlog.dev.set_exc_info,
structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S", utc=False),
structlog.processors.CallsiteParameterAdder(
[
structlog.processors.CallsiteParameter.FILENAME,
structlog.processors.CallsiteParameter.FUNC_NAME,
structlog.processors.CallsiteParameter.LINENO,
],
),
decorate_logger_name,
structlog.dev.ConsoleRenderer(),
],
wrapper_class=structlog.make_filtering_bound_logger(args.log_level),
context_class=dict,
logger_factory=structlog.PrintLoggerFactory(),
cache_logger_on_first_use=False,
)
# We still configure logging for now because its handlers
# are used for the gunicorn spawned by the brozzler dashboard.
logging.basicConfig(
stream=sys.stderr,
level=args.log_level,
@ -126,8 +173,36 @@ def configure_logging(args):
)
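
Because wrapper_class is make_filtering_bound_logger(args.log_level), the -q/-v options above map straight onto structlog filtering: calls below the configured level return before any processors run. A hedged sketch, not part of this commit, assuming configure_logging() was called with -q (log_level=logging.WARN); the job_id value is made up:

import structlog

logger = structlog.get_logger(logger_name="brozzler.cli")
logger.info("brozzler-worker is all done, exiting")  # dropped: INFO is below WARN
logger.warning("no such job with id", job_id=42)     # rendered by ConsoleRenderer
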
def suggest_default_chrome_exe():
# mac os x application executable paths
def mdfind(identifier):
try:
result = subprocess.check_output(
["mdfind", f"kMDItemCFBundleIdentifier == {identifier}"], text=True
)
# Just treat any errors as "couldn't find app"
except subprocess.CalledProcessError:
return None
if result:
return result.rstrip("\n")
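
For reference, a hedged example of what this helper is expected to return on a typical macOS install (paths illustrative; mdfind returns nothing when Spotlight indexing is disabled):

# mdfind("com.google.Chrome")     -> "/Applications/Google Chrome.app"
# mdfind("org.chromium.Chromium") -> "/Applications/Chromium.app"
# suggest_default_chrome_exe_mac() below then appends "/Contents/MacOS/<binary name>" to the bundle path
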
def suggest_default_chrome_exe_mac():
path = None
# Try Chromium first, then Chrome
result = mdfind("org.chromium.Chromium")
if result is not None:
path = f"{result}/Contents/MacOS/Chromium"
result = mdfind("com.google.Chrome")
if result is not None:
path = f"{result}/Contents/MacOS/Google Chrome"
if path is not None and os.path.exists(path):
return path
# Fall back to default paths if mdfind couldn't find it
# (mdfind might fail to find them even in their default paths
# if the system has Spotlight disabled.)
for path in [
"/Applications/Thorium.app/Contents/MacOS/Thorium",
"/Applications/Chromium.app/Contents/MacOS/Chromium",
@ -136,6 +211,14 @@ def suggest_default_chrome_exe():
if os.path.exists(path):
return path
def suggest_default_chrome_exe():
# First ask mdfind, which lets us find it in non-default paths
if sys.platform == "darwin":
path = suggest_default_chrome_exe_mac()
if path is not None:
return path
# "chromium-browser" is the executable on ubuntu trusty
# https://github.com/internetarchive/brozzler/pull/6/files uses "chromium"
# google chrome executable names taken from these packages:
@ -319,7 +402,7 @@ def brozzle_page(argv=None):
)
with open(filename, "wb") as f:
f.write(screenshot_jpeg)
logging.info("wrote screenshot to %s", filename)
logger.info("wrote screenshot", filename=filename)
browser = brozzler.Browser(chrome_exe=args.chrome_exe)
try:
@ -335,11 +418,11 @@ def brozzle_page(argv=None):
on_screenshot=on_screenshot,
enable_youtube_dl=not args.skip_youtube_dl,
)
logging.info("outlinks: \n\t%s", "\n\t".join(sorted(outlinks)))
logger.info("outlinks: \n\t%s", "\n\t".join(sorted(outlinks)))
except brozzler.ReachedLimit as e:
logging.error("reached limit %s", e)
logger.exception("reached limit")
except brozzler.PageInterstitialShown as e:
logging.error("page interstitial shown %s", e)
logger.exception("page interstitial shown")
finally:
browser.stop()
@ -597,11 +680,11 @@ def brozzler_worker(argv=None):
state_strs.append("<???:thread:ident=%s>" % ident)
stack = traceback.format_stack(frames[ident])
state_strs.append("".join(stack))
logging.info(
"dumping state (caught signal %s)\n%s" % (signum, "\n".join(state_strs))
logger.info(
"dumping state (caught signal)\n%s", signal=signum, state=state_strs
)
except BaseException as e:
logging.error("exception dumping state: %s" % e)
logger.exception("exception dumping state")
finally:
signal.signal(signal.SIGQUIT, dump_state)
@ -612,13 +695,13 @@ def brozzler_worker(argv=None):
with open(YTDLP_PROXY_ENDPOINTS_FILE) as endpoints:
ytdlp_proxy_endpoints = [l for l in endpoints.readlines()]
if ytdlp_proxy_endpoints:
logging.info(
"running with ytdlp proxy endpoints file %s"
% YTDLP_PROXY_ENDPOINTS_FILE
logger.info(
"running with ytdlp proxy endpoints file",
ytdlp_proxy_endpoints=YTDLP_PROXY_ENDPOINTS_FILE,
)
except Exception as e:
ytdlp_proxy_endpoints = []
logging.info("running with empty proxy endpoints file")
logger.info("running with empty proxy endpoints file")
return ytdlp_proxy_endpoints
rr = rethinker(args)
@ -650,7 +733,7 @@ def brozzler_worker(argv=None):
th = threading.Thread(target=worker.run, name="BrozzlerWorkerThread")
th.start()
th.join()
logging.info("brozzler-worker is all done, exiting")
logger.info("brozzler-worker is all done, exiting")
def brozzler_ensure_tables(argv=None):
@ -724,18 +807,18 @@ def brozzler_list_jobs(argv=None):
except ValueError:
job_id = args.job
reql = rr.table("jobs").get(job_id)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
result = reql.run()
if result:
results = [reql.run()]
else:
logging.error("no such job with id %r", job_id)
logger.error("no such job with id", job_id=job_id)
sys.exit(1)
else:
reql = rr.table("jobs").order_by("id")
if args.active:
reql = reql.filter({"status": "ACTIVE"})
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
results = reql.run()
if args.yaml:
yaml.dump_all(
@ -800,7 +883,7 @@ def brozzler_list_sites(argv=None):
)
elif args.site:
reql = reql.get_all(args.site)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
results = reql.run()
if args.yaml:
yaml.dump_all(
@ -868,7 +951,7 @@ def brozzler_list_pages(argv=None):
except ValueError:
job_id = args.job
reql = rr.table("sites").get_all(job_id, index="job_id")["id"]
logging.debug("querying rethinkb: %s", reql)
logger.debug("querying rethinkb", query=reql)
site_ids = reql.run()
elif args.site:
try:
@ -897,7 +980,7 @@ def brozzler_list_pages(argv=None):
reql = reql.order_by(index="least_hops")
if args.claimed:
reql = reql.filter({"claimed": True})
logging.debug("querying rethinkb: %s", reql)
logger.debug("querying rethinkb", query=reql)
results = reql.run()
if args.yaml:
yaml.dump_all(
@ -963,20 +1046,20 @@ def brozzler_purge(argv=None):
job_id = args.job
job = brozzler.Job.load(rr, job_id)
if not job:
logging.fatal("no such job %r", job_id)
logger.fatal("no such job", job_id=job_id)
sys.exit(1)
if job.status == "ACTIVE":
if args.force:
logging.warning(
"job %s has status ACTIVE, purging anyway because "
logger.warning(
"job has status ACTIVE, purging anyway because "
"--force was supplied",
job_id,
job_id=job_id,
)
else:
logging.fatal(
"refusing to purge job %s because status is ACTIVE "
logger.fatal(
"refusing to purge job because status is ACTIVE "
"(override with --force)",
job_id,
job_id=job_id,
)
sys.exit(1)
_purge_job(rr, job_id)
@ -984,20 +1067,20 @@ def brozzler_purge(argv=None):
site_id = args.site
site = brozzler.Site.load(rr, site_id)
if not site:
logging.fatal("no such job %r", job_id)
logger.fatal("no such job", job_id=job_id)
sys.exit(1)
if site.status == "ACTIVE":
if args.force:
logging.warning(
"site %s has status ACTIVE, purging anyway because "
logger.warning(
"site has status ACTIVE, purging anyway because "
"--force was supplied",
site_id,
site_id=site_id,
)
else:
logging.fatal(
"refusing to purge site %s because status is ACTIVE "
logger.fatal(
"refusing to purge site because status is ACTIVE "
"(override with --force)",
site_id,
site_id=site_id,
)
sys.exit(1)
_purge_site(rr, site_id)
@ -1016,7 +1099,7 @@ def brozzler_purge(argv=None):
.lt(finished_before)
)
)
logging.debug("retrieving jobs older than %s: %s", finished_before, reql)
logger.debug("retrieving jobs older than %s", finished_before, query=reql)
for job in reql.run():
# logging.info('job %s finished=%s starts_and_stops[-1]["stop"]=%s',
# job['id'], job.get('finished'),
@ -1034,27 +1117,31 @@ def _purge_site(rr, site_id):
)
.delete()
)
logging.debug("purging pages for site %s: %s", site_id, reql)
site_logger = logger.bind(site_id=site_id)
site_logger.debug("purging pages for site", query=reql)
result = reql.run()
logging.info("purged pages for site %s: %s", site_id, result)
site_logger.info("purged pages for site", result=result)
reql = rr.table("sites").get(site_id).delete()
logging.debug("purging site %s: %s", site_id, reql)
site_logger.debug("purging site", query=reql)
result = reql.run()
logging.info("purged site %s: %s", site_id, result)
site_logger.info("purged site", result=result)
def _purge_job(rr, job_id):
job_logger = logger.bind(job_id=job_id)
reql = rr.table("sites").get_all(job_id, index="job_id").get_field("id")
logging.debug("querying rethinkdb: %s", reql)
job_logger.debug("querying rethinkdb", query=reql)
site_ids = list(reql.run())
for site_id in site_ids:
_purge_site(rr, site_id)
reql = rr.table("jobs").get(job_id).delete()
logging.debug("purging job %s: %s", job_id, reql)
job_logger.debug("purging job", query=reql)
result = reql.run()
logging.info("purged job %s: %s", job_id, result)
job_logger.info("purged job", result=result)
def brozzler_list_captures(argv=None):
@ -1101,7 +1188,7 @@ def brozzler_list_captures(argv=None):
if args.url_or_sha1[:5] == "sha1:":
if args.prefix:
logging.warning(
logger.warning(
"ignoring supplied --prefix option which does not apply "
"to lookup by sha1"
)
@ -1112,7 +1199,7 @@ def brozzler_list_captures(argv=None):
[sha1base32, r.maxval, r.maxval],
index="sha1_warc_type",
)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
results = reql.run()
else:
key = urlcanon.semantic(args.url_or_sha1).surt().decode("ascii")
@ -1135,7 +1222,7 @@ def brozzler_list_captures(argv=None):
lambda capture: (capture["canon_surt"] >= key)
& (capture["canon_surt"] <= end_key)
)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
results = reql.run()
if args.yaml:
@ -1180,7 +1267,7 @@ def brozzler_stop_crawl(argv=None):
job_id = args.job_id
job = brozzler.Job.load(rr, job_id)
if not job:
logging.fatal("job not found with id=%r", job_id)
logger.fatal("job not found with", id=job_id)
sys.exit(1)
job.stop_requested = doublethink.utcnow()
job.save()
@ -1191,7 +1278,7 @@ def brozzler_stop_crawl(argv=None):
site_id = args.site_id
site = brozzler.Site.load(rr, site_id)
if not site:
logging.fatal("site not found with id=%r", site_id)
logger.fatal("site not found with", id=site_id)
sys.exit(1)
site.stop_requested = doublethink.utcnow()
site.save()

View File

@ -17,13 +17,15 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
import logging
import structlog
import sys
logger = structlog.get_logger()
try:
import flask
except ImportError as e:
logging.critical(
logger.critical(
'%s: %s\n\nYou might need to run "pip install '
'brozzler[dashboard]".\nSee README.rst for more information.',
type(e).__name__,
@ -77,7 +79,7 @@ def queued_count(site_id):
)
.count()
)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
count = reql.run()
return flask.jsonify(count=count)
@ -85,7 +87,7 @@ def queued_count(site_id):
@app.route("/api/sites/<site_id>/queue")
@app.route("/api/site/<site_id>/queue")
def queue(site_id):
logging.debug("flask.request.args=%s", flask.request.args)
logger.debug("flask.request.args", args=flask.request.args)
start = flask.request.args.get("start", 0)
end = flask.request.args.get("end", start + 90)
reql = rr.table("pages").between(
@ -93,7 +95,7 @@ def queue(site_id):
[site_id, 0, False, r.maxval],
index="priority_by_site",
)[start:end]
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
queue_ = reql.run()
return flask.jsonify(queue_=list(queue_))
@ -112,7 +114,7 @@ def page_count(site_id):
)
.count()
)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
count = reql.run()
return flask.jsonify(count=count)
@ -130,7 +132,7 @@ def pages(site_id):
)
.order_by(index="least_hops")[start:end]
)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
pages_ = reql.run()
return flask.jsonify(pages=list(pages_))
@ -139,7 +141,7 @@ def pages(site_id):
@app.route("/api/page/<page_id>")
def page(page_id):
reql = rr.table("pages").get(page_id)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
page_ = reql.run()
return flask.jsonify(page_)
@ -148,7 +150,7 @@ def page(page_id):
@app.route("/api/page/<page_id>/yaml")
def page_yaml(page_id):
reql = rr.table("pages").get(page_id)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
page_ = reql.run()
return app.response_class(
yaml.dump(page_, default_flow_style=False), mimetype="application/yaml"
@ -159,7 +161,7 @@ def page_yaml(page_id):
@app.route("/api/site/<site_id>")
def site(site_id):
reql = rr.table("sites").get(site_id)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
s = reql.run()
if "cookie_db" in s:
s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
@ -170,7 +172,7 @@ def site(site_id):
@app.route("/api/site/<site_id>/yaml")
def site_yaml(site_id):
reql = rr.table("sites").get(site_id)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
site_ = reql.run()
return app.response_class(
yaml.dump(site_, default_flow_style=False), mimetype="application/yaml"
@ -180,7 +182,7 @@ def site_yaml(site_id):
@app.route("/api/stats/<bucket>")
def stats(bucket):
reql = rr.table("stats").get(bucket)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
stats_ = reql.run()
return flask.jsonify(stats_)
@ -193,7 +195,7 @@ def sites(job_id):
except ValueError:
jid = job_id
reql = rr.table("sites").get_all(jid, index="job_id")
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
sites_ = list(reql.run())
# TypeError: <binary, 7168 bytes, '53 51 4c 69 74 65...'> is not JSON serializable
for s in sites_:
@ -206,7 +208,7 @@ def sites(job_id):
def jobless_sites():
# XXX inefficient (unindexed) query
reql = rr.table("sites").filter(~r.row.has_fields("job_id"))
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
sites_ = list(reql.run())
# TypeError: <binary, 7168 bytes, '53 51 4c 69 74 65...'> is not JSON serializable
for s in sites_:
@ -223,7 +225,7 @@ def job(job_id):
except ValueError:
jid = job_id
reql = rr.table("jobs").get(jid)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
job_ = reql.run()
return flask.jsonify(job_)
@ -236,7 +238,7 @@ def job_yaml(job_id):
except ValueError:
jid = job_id
reql = rr.table("jobs").get(jid)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
job_ = reql.run()
return app.response_class(
yaml.dump(job_, default_flow_style=False), mimetype="application/yaml"
@ -258,7 +260,7 @@ def services():
@app.route("/api/jobs")
def jobs():
reql = rr.table("jobs").order_by(r.desc("id"))
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
jobs_ = list(reql.run())
return flask.jsonify(jobs=jobs_)
@ -313,13 +315,13 @@ try:
return self.application
def run(**options):
logging.info("running brozzler-dashboard using gunicorn")
logger.info("running brozzler-dashboard using gunicorn")
GunicornBrozzlerDashboard(app, options).run()
except ImportError:
def run():
logging.info("running brozzler-dashboard using simple flask app.run")
logger.info("running brozzler-dashboard using simple flask app.run")
app.run(host=SETTINGS["DASHBOARD_INTERFACE"], port=SETTINGS["DASHBOARD_PORT"])

View File

@ -18,8 +18,8 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
import structlog
import sys
import logging
try:
import warcprox
@ -30,11 +30,11 @@ try:
import wsgiref.handlers
import brozzler.dashboard
except ImportError as e:
logging.critical(
structlog.get_logger().critical(
'%s: %s\n\nYou might need to run "pip install '
'brozzler[easy]".\nSee README.rst for more information.',
type(e).__name__,
e,
exc_info=True,
)
sys.exit(1)
import argparse
@ -156,7 +156,7 @@ class ThreadingWSGIServer(
class BrozzlerEasyController:
logger = logging.getLogger(__module__ + "." + __qualname__)
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
def __init__(self, args):
self.stop = threading.Event()
@ -238,11 +238,14 @@ class BrozzlerEasyController:
self.logger.info("starting brozzler-worker")
self.brozzler_worker.start()
self.logger.info("starting pywb at %s:%s", *self.pywb_httpd.server_address)
self.logger.info(
"starting pywb", address="%s:%s" % self.pywb_httpd.server_address
)
threading.Thread(target=self.pywb_httpd.serve_forever).start()
self.logger.info(
"starting brozzler-dashboard at %s:%s", *self.dashboard_httpd.server_address
"starting brozzler-dashboard",
address="%s:%s" % self.dashboard_httpd.server_address,
)
threading.Thread(target=self.dashboard_httpd.serve_forever).start()
@ -307,8 +310,8 @@ class BrozzlerEasyController:
state_strs.append(str(th))
stack = traceback.format_stack(sys._current_frames()[th.ident])
state_strs.append("".join(stack))
logging.warning(
"dumping state (caught signal {})\n{}".format(signum, "\n".join(state_strs))
structlog.get_logger().warning(
"dumping state (caught signal)", signal=signum, state="\n".join(state_strs)
)

View File

@ -16,12 +16,12 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
import logging
import brozzler
import random
import time
import datetime
import rethinkdb as rdb
import structlog
import doublethink
import urlcanon
@ -33,7 +33,7 @@ class UnexpectedDbResult(Exception):
class RethinkDbFrontier:
logger = logging.getLogger(__module__ + "." + __qualname__)
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
def __init__(self, rr, shards=None, replicas=None):
self.rr = rr
@ -42,15 +42,15 @@ class RethinkDbFrontier:
self._ensure_db()
def _ensure_db(self):
db_logger = self.logger.bind(dbname=self.rr.dbname)
dbs = self.rr.db_list().run()
if not self.rr.dbname in dbs:
self.logger.info("creating rethinkdb database %r", self.rr.dbname)
db_logger.info("creating rethinkdb database")
self.rr.db_create(self.rr.dbname).run()
tables = self.rr.table_list().run()
if not "sites" in tables:
self.logger.info(
"creating rethinkdb table 'sites' in database %r", self.rr.dbname
)
db_logger.info("creating rethinkdb table 'sites' in database")
self.rr.table_create(
"sites", shards=self.shards, replicas=self.replicas
).run()
@ -59,9 +59,7 @@ class RethinkDbFrontier:
).run()
self.rr.table("sites").index_create("job_id").run()
if not "pages" in tables:
self.logger.info(
"creating rethinkdb table 'pages' in database %r", self.rr.dbname
)
db_logger.info("creating rethinkdb table 'pages' in database")
self.rr.table_create(
"pages", shards=self.shards, replicas=self.replicas
).run()
@ -81,9 +79,7 @@ class RethinkDbFrontier:
[r.row["site_id"], r.row["brozzle_count"], r.row["hops_from_seed"]],
).run()
if not "jobs" in tables:
self.logger.info(
"creating rethinkdb table 'jobs' in database %r", self.rr.dbname
)
db_logger.info("creating rethinkdb table 'jobs' in database")
self.rr.table_create(
"jobs", shards=self.shards, replicas=self.replicas
).run()
@ -108,7 +104,7 @@ class RethinkDbFrontier:
)
def claim_sites(self, n=1):
self.logger.trace("claiming up to %s sites to brozzle", n)
self.logger.debug("claiming up to %s sites to brozzle", n)
result = (
self.rr.table("sites")
.get_all(
@ -186,10 +182,10 @@ class RethinkDbFrontier:
if result["changes"][i]["old_val"]["claimed"]:
self.logger.warning(
"re-claimed site that was still marked 'claimed' "
"because it was last claimed a long time ago "
"at %s, and presumably some error stopped it from "
"because it was last claimed a long time ago, "
"and presumably some error stopped it from "
"being disclaimed",
result["changes"][i]["old_val"]["last_claimed"],
last_claimed=result["changes"][i]["old_val"]["last_claimed"],
)
site = brozzler.Site(self.rr, result["changes"][i]["new_val"])
sites.append(site)
@ -205,10 +201,10 @@ class RethinkDbFrontier:
"""
if site.time_limit and site.time_limit > 0 and site.elapsed() > site.time_limit:
self.logger.debug(
"site FINISHED_TIME_LIMIT! time_limit=%s " "elapsed=%s %s",
site.time_limit,
site.elapsed(),
site,
"site FINISHED_TIME_LIMIT!",
time_limit=site.time_limit,
elapsed=site.elapsed(),
site=site,
)
raise brozzler.ReachedTimeLimit
@ -273,7 +269,7 @@ class RethinkDbFrontier:
"""Raises brozzler.CrawlStopped if stop has been requested."""
site.refresh()
if site.stop_requested and site.stop_requested <= doublethink.utcnow():
self.logger.info("stop requested for site %s", site.id)
self.logger.info("stop requested for site", site_id=site.id)
raise brozzler.CrawlStopped
if site.job_id:
@ -283,7 +279,7 @@ class RethinkDbFrontier:
and job.stop_requested
and job.stop_requested <= doublethink.utcnow()
):
self.logger.info("stop requested for job %s", site.job_id)
self.logger.info("stop requested for job", job_id=site.job_id)
raise brozzler.CrawlStopped
def _maybe_finish_job(self, job_id):
@ -304,7 +300,7 @@ class RethinkDbFrontier:
return False
n += 1
self.logger.info("all %s sites finished, job %s is FINISHED!", n, job.id)
self.logger.info("all %s sites finished, job is FINISHED!", n, job_id=job.id)
job.finish()
job.save()
return True
@ -320,7 +316,7 @@ class RethinkDbFrontier:
self._maybe_finish_job(site.job_id)
def disclaim_site(self, site, page=None):
self.logger.info("disclaiming %s", site)
self.logger.info("disclaiming", site=site)
site.claimed = False
site.last_disclaimed = doublethink.utcnow()
if not page and not self.has_outstanding_pages(site):
@ -468,17 +464,16 @@ class RethinkDbFrontier:
try:
self.logger.debug("inserting/replacing batch of %s pages", len(batch))
reql = self.rr.table("pages").insert(batch, conflict="replace")
self.logger.trace(
self.logger.debug(
'running query self.rr.table("pages").insert(%r, '
'conflict="replace")',
batch,
)
result = reql.run()
except Exception as e:
self.logger.error(
self.logger.exception(
"problem inserting/replacing batch of %s pages",
len(batch),
exc_info=True,
)
parent_page.outlinks = {}
@ -497,7 +492,7 @@ class RethinkDbFrontier:
)
def reached_limit(self, site, e):
self.logger.info("reached_limit site=%s e=%s", site, e)
self.logger.info("reached_limit", site=site, e=e)
assert isinstance(e, brozzler.ReachedLimit)
if (
site.reached_limit
@ -530,7 +525,7 @@ class RethinkDbFrontier:
)
pages = list(results)
if len(pages) > 1:
self.logger.warning("more than one seed page for site_id %s ?", site_id)
self.logger.warning("more than one seed page?", site_id=site_id)
if len(pages) < 1:
return None
return brozzler.Page(self.rr, pages[0])
@ -550,8 +545,8 @@ class RethinkDbFrontier:
[site_id, 0 if brozzled is False else r.maxval, r.maxval, r.maxval],
index="priority_by_site",
)
self.logger.trace("running query: %r", query)
self.logger.debug("running query", query=query)
results = query.run()
for result in results:
self.logger.trace("yielding result: %r", result)
self.logger.debug("yielding result", result=result)
yield brozzler.Page(self.rr, result)

View File

@ -25,9 +25,9 @@ import datetime
import doublethink
import hashlib
import json
import logging
import os
import re
import structlog
import time
import urlcanon
import urllib
@ -37,6 +37,8 @@ import zlib
from enum import Enum
from typing import Optional
logger = structlog.get_logger()
def load_schema():
schema_file = os.path.join(os.path.dirname(__file__), "job_schema.yaml")
@ -84,7 +86,7 @@ def merge(a, b):
def new_job_file(frontier, job_conf_file):
"""Returns new Job."""
logging.info("loading %s", job_conf_file)
logger.info("loading", job_conf_file=job_conf_file)
with open(job_conf_file) as f:
job_conf = yaml.safe_load(f)
return new_job(frontier, job_conf)
@ -120,12 +122,12 @@ def new_job(frontier, job_conf):
# insert in batches to avoid this error
# rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:
for batch in (pages[i : i + 500] for i in range(0, len(pages), 500)):
logging.info("inserting batch of %s pages", len(batch))
logger.info("inserting batch of %s pages", len(batch))
result = frontier.rr.table("pages").insert(batch).run()
for batch in (sites[i : i + 100] for i in range(0, len(sites), 100)):
logging.info("inserting batch of %s sites", len(batch))
logger.info("inserting batch of %s sites", len(batch))
result = frontier.rr.table("sites").insert(batch).run()
logging.info("job %s fully started", job.id)
logger.info("job fully started", job_id=job.id)
return job
@ -154,7 +156,7 @@ def new_seed_page(frontier, site):
def new_site(frontier, site):
logging.info("new site %s", site)
logger.info("new site", site=site)
site.id = site.id or str(uuid.uuid4())
# insert the Page into the database before the Site, to avoid situation
# where a brozzler worker immediately claims the site, finds no pages
@ -162,7 +164,7 @@ def new_site(frontier, site):
try:
page = new_seed_page(frontier, site)
page.save()
logging.info("queued page %s", page)
logger.info("queued page", page=page)
finally:
# finally block because we want to insert the Site no matter what
site.save()
@ -195,7 +197,7 @@ class ElapsedMixIn(object):
class Job(doublethink.Document, ElapsedMixIn):
logger = logging.getLogger(__module__ + "." + __qualname__)
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
table = "jobs"
def populate_defaults(self):
@ -217,9 +219,9 @@ class Job(doublethink.Document, ElapsedMixIn):
def finish(self):
if self.status == "FINISHED" or self.starts_and_stops[-1]["stop"]:
self.logger.error(
"job is already finished status=%s " "starts_and_stops[-1]['stop']=%s",
self.status,
self.starts_and_stops[-1]["stop"],
"job is already finished",
status=self.status,
stop=self.starts_and_stops[-1]["stop"],
)
self.status = "FINISHED"
self.starts_and_stops[-1]["stop"] = doublethink.utcnow()
@ -246,7 +248,7 @@ class VideoCaptureOptions(Enum):
class Site(doublethink.Document, ElapsedMixIn):
logger = logging.getLogger(__module__ + "." + __qualname__)
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
table = "sites"
def populate_defaults(self):
@ -304,7 +306,7 @@ class Site(doublethink.Document, ElapsedMixIn):
if set(rule.keys()) == {"ssurt"}
)
if not any(ssurt.startswith(ss) for ss in simple_rule_ssurts):
self.logger.info("adding ssurt %s to scope accept rules", ssurt)
self.logger.info("adding ssurt to scope accept rules", ssurt=ssurt)
self.scope["accepts"].append({"ssurt": ssurt})
def note_seed_redirect(self, url):
@ -402,7 +404,7 @@ class Site(doublethink.Document, ElapsedMixIn):
class Page(doublethink.Document):
logger = logging.getLogger(__module__ + "." + __qualname__)
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
table = "pages"
@staticmethod

View File

@ -19,7 +19,9 @@ limitations under the License.
"""
import sys
import logging
import structlog
logger = structlog.get_logger()
try:
import pywb.apps.cli
@ -30,7 +32,7 @@ try:
import pywb.framework.basehandlers
import pywb.rewrite.wburl
except ImportError as e:
logging.critical(
logger.critical(
'%s: %s\n\nYou might need to run "pip install '
'brozzler[easy]".\nSee README.rst for more information.',
type(e).__name__,
@ -111,7 +113,7 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
)
if cdx_query.limit:
reql = reql.limit(cdx_query.limit)
logging.debug("rethinkdb query: %s", reql)
logger.debug("rethinkdb query", query=reql)
results = reql.run()
return results

View File

@ -23,12 +23,12 @@ limitations under the License.
"""
import json
import logging
import brozzler
import reppy
import reppy.cache
import reppy.parser
import requests
import structlog
__all__ = ["is_permitted_by_robots"]
@ -119,10 +119,9 @@ def is_permitted_by_robots(site, url, proxy=None):
# reppy has wrapped an exception that we want to bubble up
raise brozzler.ProxyError(e)
else:
logging.warning(
"returning true (permitted) after problem fetching "
"robots.txt for %r: %r",
url,
e,
structlog.get_logger().warning(
"returning true (permitted) after problem fetching " "robots.txt",
url=url,
exception=e,
)
return True

View File

@ -18,7 +18,6 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
import logging
import brozzler
import brozzler.browser
from brozzler.model import VideoCaptureOptions
@ -32,6 +31,7 @@ import io
import socket
import random
import requests
import structlog
import urllib3
from urllib3.exceptions import TimeoutError, ProxyError
import doublethink
@ -46,7 +46,7 @@ r = rdb.RethinkDB()
class BrozzlerWorker:
logger = logging.getLogger(__module__ + "." + __qualname__)
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
# 3⅓ min heartbeat interval => 10 min ttl
# This is kind of a long time, because `frontier.claim_sites()`, which runs
@ -125,7 +125,7 @@ class BrozzlerWorker:
self._metrics_port, self._registry_url, self._env
)
else:
logging.warning(
self.logger.warning(
"not starting prometheus scrape endpoint: metrics_port is undefined"
)
@ -173,9 +173,9 @@ class BrozzlerWorker:
site.proxy = "%s:%s" % (svc["host"], svc["port"])
site.save()
self.logger.info(
"chose warcprox instance %r from service registry for %r",
site.proxy,
site,
"chose warcprox instance from service registry",
instance=site.proxy,
registry=site,
)
return site.proxy
return None
@ -189,7 +189,7 @@ class BrozzlerWorker:
self._proxy_is_warcprox = status["role"] == "warcprox"
except Exception as e:
self._proxy_is_warcprox = False
logging.info(
self.logger.info(
"%s %s warcprox",
self._proxy,
"IS" if self._proxy_is_warcprox else "IS NOT",
@ -227,18 +227,18 @@ class BrozzlerWorker:
with urllib.request.urlopen(request, timeout=900) as response:
if response.getcode() != 204:
self.logger.warning(
'got "%s %s" response on warcprox '
"got unexpected response on warcprox "
"WARCPROX_WRITE_RECORD request (expected 204)",
response.getcode(),
response.reason,
code=response.getcode(),
reason=response.reason,
)
return request, response
except urllib.error.HTTPError as e:
self.logger.warning(
'got "%s %s" response on warcprox '
"got unexpected response on warcprox "
"WARCPROX_WRITE_RECORD request (expected 204)",
e.getcode(),
e.info(),
code=e.getcode(),
reason=e.info(),
)
return request, None
except urllib.error.URLError as e:
@ -271,26 +271,27 @@ class BrozzlerWorker:
on_request=None,
enable_youtube_dl=True,
):
self.logger.info("brozzling {}".format(page))
page_logger = self.logger.bind(page=page)
page_logger.info("brozzling")
outlinks = set()
page_headers = self._get_page_headers(site, page)
if not self._needs_browsing(page_headers):
self.logger.info("needs fetch: %s", page)
page_logger.info("needs fetch")
if site.pdfs_only and not self._is_pdf(page_headers):
self.logger.info("skipping non-PDF content: PDFs only option enabled")
elif site.video_capture in [
VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value,
] and self._is_video_type(page_headers):
self.logger.info(
page_logger.info(
"skipping video content: video MIME type capture disabled for site"
)
else:
self._fetch_url(site, page=page)
else:
self.logger.info("needs browsing: %s", page)
page_logger.info("needs browsing")
try:
browser_outlinks = self._browse_page(
browser, site, page, on_screenshot, on_request
@ -300,7 +301,7 @@ class BrozzlerWorker:
if status_code in [502, 504]:
raise brozzler.PageConnectionError()
except brozzler.PageInterstitialShown:
self.logger.info("page interstitial shown (http auth): %s", page)
page_logger.info("page interstitial shown (http auth)")
if enable_youtube_dl and ydl.should_ytdlp(site, page, status_code):
try:
@ -316,10 +317,7 @@ class BrozzlerWorker:
except brozzler.ProxyError:
raise
except brozzler.VideoExtractorError as e:
logging.error(
"error extracting video info: %s",
e,
)
self.logger.exception("error extracting video info")
except Exception as e:
if (
hasattr(e, "exc_info")
@ -328,26 +326,25 @@ class BrozzlerWorker:
and e.exc_info[1].code == 430
):
self.logger.info(
"youtube-dl got %s %s processing %s",
e.exc_info[1].code,
e.exc_info[1].msg,
page.url,
"youtube-dl encountered an error",
code=e.exc_info[1].code,
message=e.exc_info[1].msg,
url=page.url,
)
else:
self.logger.error(
"youtube_dl raised exception on %s", page, exc_info=True
)
self.logger.exception("youtube_dl raised exception", page=page)
return outlinks
@metrics.brozzler_header_processing_duration_seconds.time()
@metrics.brozzler_in_progress_headers.track_inprogress()
def _get_page_headers(self, site, page):
url_logger = self.logger.bind(url=page.url)
# bypassing warcprox, requests' stream=True defers downloading the body of the response
# see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
try:
user_agent = site.get("user_agent")
headers = {"User-Agent": user_agent} if user_agent else {}
self.logger.info("getting page headers for %s", page.url)
url_logger.info("getting page headers")
with requests.get(
page.url,
stream=True,
@ -357,11 +354,9 @@ class BrozzlerWorker:
) as r:
return r.headers
except requests.exceptions.Timeout as e:
self.logger.warning(
"Timed out trying to get headers for %s: %s", page.url, e
)
url_logger.warning("Timed out trying to get headers", exc_info=True)
except requests.exceptions.RequestException as e:
self.logger.warning("Failed to get headers for %s: %s", page.url, e)
url_logger.warning("Failed to get headers", exc_info=True)
return {}
def _needs_browsing(self, page_headers) -> bool:
@ -402,10 +397,9 @@ class BrozzlerWorker:
on_screenshot(screenshot_jpeg)
if self._using_warcprox(site):
self.logger.info(
"sending WARCPROX_WRITE_RECORD request to %s with "
"screenshot for %s",
self._proxy_for(site),
page,
"sending WARCPROX_WRITE_RECORD request",
proxy=self._proxy_for(site),
screenshot_for_page=page,
)
thumbnail_jpeg = self.thumb_jpeg(screenshot_jpeg)
self._warcprox_write_record(
@ -452,7 +446,7 @@ class BrozzlerWorker:
video["content-length"] = int(response_headers["content-length"])
if "content-range" in response_headers:
video["content-range"] = response_headers["content-range"]
logging.debug("embedded video %s", video)
self.logger.debug("embedded video", video=video)
if not "videos" in page:
page.videos = []
page.videos.append(video)
@ -461,11 +455,11 @@ class BrozzlerWorker:
def _on_service_worker_version_updated(chrome_msg):
# https://github.com/internetarchive/brozzler/issues/140
self.logger.trace("%r", chrome_msg)
self.logger.debug("service worker updated", chrome_msg=chrome_msg)
if chrome_msg.get("params", {}).get("versions"):
url = chrome_msg.get("params", {}).get("versions")[0].get("scriptURL")
if url and url.startswith("http") and url not in sw_fetched:
self.logger.info("fetching service worker script %s", url)
self.logger.info("fetching service worker script", url=url)
self._fetch_url(site, url=url)
sw_fetched.add(url)
@ -520,7 +514,7 @@ class BrozzlerWorker:
headers = {"User-Agent": user_agent} if user_agent else {}
headers.update(site.extra_headers(page))
self.logger.info("fetching url %s", url)
self.logger.info("fetching url", url=url)
try:
# response is ignored
http.request(
@ -530,17 +524,18 @@ class BrozzlerWorker:
timeout=self.FETCH_URL_TIMEOUT,
retries=False,
)
self.logger.info("Completed fetching url %s", url)
self.logger.info("Completed fetching url", url=url)
except TimeoutError as e:
self.logger.warning("Timed out fetching %s", url)
self.logger.warning("Timed out fetching url", url=url)
raise brozzler.PageConnectionError() from e
except ProxyError as e:
raise brozzler.ProxyError("proxy error fetching %s" % url) from e
except urllib3.exceptions.RequestError as e:
self.logger.warning("Failed to fetch url %s: %s", url, e)
self.logger.warning("Failed to fetch url", url=url, exc_info=True)
raise brozzler.PageConnectionError() from e
def brozzle_site(self, browser, site):
site_logger = self.logger.bind(site=site)
try:
site.last_claimed_by = "%s:%s" % (socket.gethostname(), browser.chrome.port)
site.save()
@ -550,9 +545,7 @@ class BrozzlerWorker:
self._frontier.honor_stop_request(site)
# _proxy_for() call in log statement can raise brozzler.ProxyError
# which is why we honor time limit and stop request first☝🏻
self.logger.info(
"brozzling site (proxy=%r) %s", self._proxy_for(site), site
)
site_logger.info("brozzling site", proxy=self._proxy_for(site))
while time.time() - start < self.SITE_SESSION_MINUTES * 60:
site.refresh()
self._frontier.enforce_time_limit(site)
@ -564,7 +557,7 @@ class BrozzlerWorker:
if page.needs_robots_check and not brozzler.is_permitted_by_robots(
site, page.url, self._proxy_for(site)
):
logging.warning("page %s is blocked by robots.txt", page.url)
self.logger.warning("page is blocked by robots.txt", url=page.url)
page.blocked_by_robots = True
self._frontier.completed_page(site, page)
else:
@ -580,7 +573,7 @@ class BrozzlerWorker:
except brozzler.ShutdownRequested:
self.logger.info("shutdown requested")
except brozzler.NothingToClaim:
self.logger.info("no pages left for site %s", site)
site_logger.info("no pages left for site")
except brozzler.ReachedLimit as e:
self._frontier.reached_limit(site, e)
except brozzler.ReachedTimeLimit as e:
@ -591,29 +584,24 @@ class BrozzlerWorker:
# self.logger.info("{} shut down".format(browser))
except brozzler.ProxyError as e:
if self._warcprox_auto:
logging.error(
"proxy error (site.proxy=%s), will try to choose a "
"healthy instance next time site is brozzled: %s",
site.proxy,
e,
self.logger.exception(
"proxy error, will try to choose a "
"healthy instance next time site is brozzled",
site_proxy=site.proxy,
)
site.proxy = None
else:
# using brozzler-worker --proxy, nothing to do but try the
# same proxy again next time
logging.error("proxy error (self._proxy=%r)", self._proxy, exc_info=1)
self.logger.exception("proxy error", self_proxy=self._proxy)
except (brozzler.PageConnectionError, Exception) as e:
if isinstance(e, brozzler.PageConnectionError):
self.logger.error(
"Page status code possibly indicates connection failure between host and warcprox: site=%r page=%r",
site,
page,
exc_info=True,
site_logger.exception(
"Page status code possibly indicates connection failure between host and warcprox",
page=page,
)
else:
self.logger.error(
"unexpected exception site=%r page=%r", site, page, exc_info=True
)
site_logger.exception("unexpected exception", page=page)
if page:
# Calculate backoff in seconds based on number of failed attempts.
# Minimum of 60, max of 135 giving delays of 60, 90, 135, 135...
@ -624,10 +612,10 @@ class BrozzlerWorker:
page.failed_attempts = (page.failed_attempts or 0) + 1
if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES:
self.logger.info(
'marking page "completed" after %s unexpected '
"exceptions attempting to brozzle %s",
page.failed_attempts,
page,
'marking page "completed" after several unexpected '
"exceptions attempting to brozzle",
failed_attempts=page.failed_attempts,
page=page,
)
self._frontier.completed_page(site, page)
page = None
@ -665,13 +653,11 @@ class BrozzlerWorker:
try:
self.status_info = self._service_registry.heartbeat(status_info)
self.logger.trace("status in service registry: %s", self.status_info)
self.logger.debug("status in service registry", status=self.status_info)
except r.ReqlError as e:
self.logger.error(
"failed to send heartbeat and update service registry "
"with info %s: %s",
status_info,
e,
self.logger.exception(
"failed to send heartbeat and update service registry",
info=status_info,
)
def _service_heartbeat_if_due(self):
@ -719,9 +705,7 @@ class BrozzlerWorker:
self._browser_pool.release(browsers[i])
def run(self):
self.logger.notice(
"brozzler %s - brozzler-worker starting", brozzler.__version__
)
self.logger.warn("brozzler %s - brozzler-worker starting", brozzler.__version__)
last_nothing_to_claim = 0
try:
while not self._shutdown.is_set():
@ -730,20 +714,20 @@ class BrozzlerWorker:
try:
self._start_browsing_some_sites()
except brozzler.browser.NoBrowsersAvailable:
logging.trace("all %s browsers are in use", self._max_browsers)
self.logger.debug(
"all browsers are in use", max_browsers=self._max_browsers
)
except brozzler.NothingToClaim:
last_nothing_to_claim = time.time()
logging.trace(
self.logger.debug(
"nothing to claim, all available active sites "
"are already claimed by a brozzler worker"
)
time.sleep(0.5)
self.logger.notice("shutdown requested")
self.logger.warn("shutdown requested")
except r.ReqlError as e:
self.logger.error(
"caught rethinkdb exception, will try to proceed", exc_info=True
)
self.logger.exception("caught rethinkdb exception, will try to proceed")
except brozzler.ShutdownRequested:
self.logger.info("shutdown requested")
except:
@ -755,12 +739,11 @@ class BrozzlerWorker:
try:
self._service_registry.unregister(self.status_info["id"])
except:
self.logger.error(
"failed to unregister from service registry", exc_info=True
)
self.logger.exception("failed to unregister from service registry")
self.logger.info(
"shutting down %s brozzling threads", len(self._browsing_threads)
"shutting down brozzling threads",
thread_count=len(self._browsing_threads),
)
with self._browsing_threads_lock:
for th in self._browsing_threads:
@ -780,6 +763,7 @@ class BrozzlerWorker:
)
return
self._thread = threading.Thread(target=self.run, name="BrozzlerWorker")
self.logger = self.logger.bind(thread=self._thread)
self._thread.start()
def shutdown_now(self):

View File

@ -16,7 +16,6 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
import logging
import yt_dlp
from yt_dlp.utils import match_filter_func, ExtractorError
import brozzler
@ -31,6 +30,7 @@ import datetime
from . import metrics
import random
import structlog
import threading
import traceback
import doublethink
@ -44,17 +44,20 @@ YTDLP_WAIT = 10
YTDLP_MAX_REDIRECTS = 5
logger = structlog.get_logger()
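# A minimal sketch of the two structlog call styles seen in this module:
# keyword arguments become structured event fields, while %-style positional
# arguments are interpolated into the event string by recent structlog
# versions. Example (hypothetical values):
#   logger.info("skipping ytdlp: non-200 page status", page_status=503)
#   logger.info("skipping ytdlp: non-200 page status %s", 503)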
def should_ytdlp(site, page, page_status):
# called only after we've passed needs_browsing() check
if page_status != 200:
logging.info("skipping ytdlp: non-200 page status %s", page_status)
logger.info("skipping ytdlp: non-200 page status %s", page_status)
return False
if site.video_capture in [
VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value,
]:
logging.info("skipping ytdlp: site has video capture disabled")
logger.info("skipping ytdlp: site has video capture disabled")
return False
ytdlp_url = page.redirect_url if page.redirect_url else page.url
@ -104,7 +107,13 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
"""
class _YoutubeDL(yt_dlp.YoutubeDL):
logger = logging.getLogger(__module__ + "." + __qualname__)
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
def __init__(self, url, params=None, auto_init=True):
super().__init__(params, auto_init)
self.url = url
self.logger = self.logger.bind(url=url)
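# Binding url here means every subsequent log call from this instance carries
# it automatically; a minimal structlog sketch (hypothetical URL, not brozzler
# code):
#   log = structlog.get_logger().bind(url="https://example.com/watch")
#   log.info("found a download")  # event includes url=https://example.com/watch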
def process_ie_result(self, ie_result, download=True, extra_info=None):
if extra_info is None:
@ -114,7 +123,9 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
if result_type in ("url", "url_transparent"):
if "extraction_depth" in extra_info:
self.logger.info(
f"Following redirect URL: {ie_result['url']} extraction_depth: {extra_info['extraction_depth']}"
f"Following redirect",
redirect_url=ie_result["url"],
extraction_depth=extra_info["extraction_depth"],
)
extra_info["extraction_depth"] = 1 + extra_info.get(
"extraction_depth", 0
@ -131,8 +142,9 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
def add_default_extra_info(self, ie_result, ie, url):
# hook in some logging
super().add_default_extra_info(ie_result, ie, url)
extract_context = self.logger.bind(extractor=ie.IE_NAME)
if ie_result.get("_type") == "playlist":
self.logger.info("extractor %r found playlist in %s", ie.IE_NAME, url)
extract_context.info("found playlist")
if ie.IE_NAME in {
"youtube:playlist",
"youtube:tab",
@ -147,22 +159,20 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
try:
ie_result["entries_no_dl"] = list(ie_result["entries"])
except Exception as e:
self.logger.warning(
"failed to unroll ie_result['entries']? for %s, %s; exception %s",
ie.IE_NAME,
url,
e,
extract_context.warning(
"failed to unroll entries ie_result['entries']?",
exc_info=True,
)
ie_result["entries_no_dl"] = []
ie_result["entries"] = []
self.logger.info(
"not downloading %s media files from this "
"not downloading media files from this "
"playlist because we expect to capture them from "
"individual watch/track/detail pages",
len(ie_result["entries_no_dl"]),
media_file_count=len(ie_result["entries_no_dl"]),
)
else:
self.logger.info("extractor %r found a download in %s", ie.IE_NAME, url)
extract_context.info("found a download")
def _push_video_to_warcprox(self, site, info_dict, postprocessor):
# 220211 update: does yt-dlp supply content-type? no, not as such
@ -180,7 +190,11 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
mimetype = magic.from_file(info_dict["filepath"], mime=True)
except ImportError as e:
mimetype = "video/%s" % info_dict["ext"]
self.logger.warning("guessing mimetype %s because %r", mimetype, e)
self.logger.warning(
"guessing mimetype due to error",
mimetype=mimetype,
exc_info=True,
)
# youtube watch page postprocessor is MoveFiles
@ -198,12 +212,11 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
size = os.path.getsize(info_dict["filepath"])
self.logger.info(
"pushing %r video as %s (%s bytes) to " "warcprox at %s with url %s",
info_dict["format"],
mimetype,
size,
worker._proxy_for(site),
url,
"pushing video to warcprox",
format=info_dict["format"],
mimetype=mimetype,
size=size,
warcprox=worker._proxy_for(site),
)
with open(info_dict["filepath"], "rb") as f:
# include content-length header to avoid chunked
@ -240,23 +253,23 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
):
worker.logger.debug(
"heartbeating site.last_claimed to prevent another "
"brozzler-worker claiming this site id=%r",
site.id,
"brozzler-worker claiming this site",
id=site.id,
)
site.last_claimed = doublethink.utcnow()
site.save()
except:
worker.logger.debug(
"problem heartbeating site.last_claimed site id=%r",
site.id,
"problem heartbeating site.last_claimed site",
id=site.id,
exc_info=True,
)
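# yt-dlp invokes hooks like the one below after each postprocessing step,
# passing a dict with "status" and "postprocessor" keys; a sketch of wiring
# one up via yt-dlp's postprocessor_hooks option (how this module actually
# registers it is outside this hunk):
#   ydl_opts = {"postprocessor_hooks": [ydl_postprocess_hook]}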
def ydl_postprocess_hook(d):
if d["status"] == "finished":
worker.logger.info("[ydl_postprocess_hook] Finished postprocessing")
worker.logger.info(
"[ydl_postprocess_hook] postprocessor: {}".format(d["postprocessor"])
"[ydl_postprocess_hook] Finished postprocessing",
postprocessor=d["postprocessor"],
)
is_youtube_host = isyoutubehost(d["info_dict"]["webpage_url"])
@ -290,7 +303,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
# --cache-dir local or..
# this looked like a problem with nsf-mounted homedir, maybe not a problem for brozzler on focal?
"cache_dir": "/home/archiveit",
"logger": logging.getLogger("yt_dlp"),
"logger": logger,
"verbose": False,
"quiet": False,
# recommended to avoid bot detection
@ -306,17 +319,16 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
ytdlp_proxy_for_logs = (
ydl_opts["proxy"].split("@")[1] if "@" in ydl_opts["proxy"] else "@@@"
)
logging.info("using yt-dlp proxy ... %s", ytdlp_proxy_for_logs)
logger.info("using yt-dlp proxy ...", proxy=ytdlp_proxy_for_logs)
# skip warcprox proxying yt-dlp v.2023.07.06: youtube extractor using ranges
# if worker._proxy_for(site):
# ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site))
ydl = _YoutubeDL(ydl_opts)
ydl = _YoutubeDL(ytdlp_url, params=ydl_opts)
if site.extra_headers():
ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page)))
ydl.pushed_videos = []
ydl.url = ytdlp_url
ydl.is_youtube_host = is_youtube_host
return ydl
@ -336,7 +348,7 @@ def _remember_videos(page, pushed_videos=None):
"content-type": pushed_video["content-type"],
"content-length": pushed_video["content-length"],
}
logging.debug("pushed video %s", video)
logger.debug("pushed video", video=video)
page.videos.append(video)
@ -345,7 +357,7 @@ def _try_youtube_dl(worker, ydl, site, page):
attempt = 0
while attempt < max_attempts:
try:
logging.info("trying yt-dlp on %s", ydl.url)
logger.info("trying yt-dlp", url=ydl.url)
# should_download_vid = not ydl.is_youtube_host
# then
# ydl.extract_info(str(urlcanon.whatwg(ydl.url)), download=should_download_vid)
@ -386,15 +398,18 @@ def _try_youtube_dl(worker, ydl, site, page):
# and others...
attempt += 1
if attempt == max_attempts:
logging.warning(
"Failed after %s attempt(s). Error: %s", max_attempts, e
logger.warning(
"Failed after %s attempt(s)",
max_attempts,
attempts=max_attempts,
exc_info=True,
)
raise brozzler.VideoExtractorError(
"yt-dlp hit error extracting info for %s" % ydl.url
)
else:
retry_wait = min(60, YTDLP_WAIT * (1.5 ** (attempt - 1)))
logging.info(
logger.info(
"Attempt %s failed. Retrying in %s seconds...",
attempt,
retry_wait,
@ -405,15 +420,14 @@ def _try_youtube_dl(worker, ydl, site, page):
"yt-dlp hit unknown error extracting info for %s" % ydl.url
)
logging.info("ytdlp completed successfully")
logger.info("ytdlp completed successfully")
_remember_videos(page, ydl.pushed_videos)
if worker._using_warcprox(site):
info_json = json.dumps(ie_result, sort_keys=True, indent=4)
logging.info(
"sending WARCPROX_WRITE_RECORD request to warcprox "
"with yt-dlp json for %s",
ydl.url,
logger.info(
"sending WARCPROX_WRITE_RECORD request to warcprox " "with yt-dlp json",
url=ydl.url,
)
worker._warcprox_write_record(
warcprox_address=worker._proxy_for(site),
@ -444,7 +458,7 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints):
with tempfile.TemporaryDirectory(
prefix="brzl-ydl-", dir=worker._ytdlp_tmpdir
) as tempdir:
logging.info("tempdir for yt-dlp: %s", tempdir)
logger.info("tempdir for yt-dlp", tempdir=tempdir)
ydl = _build_youtube_dl(worker, tempdir, site, page, ytdlp_proxy_endpoints)
ie_result = _try_youtube_dl(worker, ydl, site, page)
outlinks = set()

View File

@ -76,6 +76,7 @@ setuptools.setup(
"cryptography>=2.3",
"python-magic>=0.4.15",
"prometheus-client>=0.20.0",
"structlog>=25.1.0",
],
extras_require={
"yt-dlp": ["yt-dlp>=2024.7.25"],

View File

@ -31,11 +31,14 @@ import datetime
import requests
import subprocess
import http.server
import logging
import structlog
import sys
import warcprox
logger = structlog.get_logger()
# https://stackoverflow.com/questions/166506/finding-local-ip-addresses-using-pythons-stdlib
def _local_address():
import socket
@ -70,11 +73,11 @@ def stop_service(service):
def httpd(request):
class RequestHandler(http.server.SimpleHTTPRequestHandler):
def do_POST(self):
logging.info("\n%s\n%s", self.requestline, self.headers)
logger.info("\n%s\n%s", self.requestline, self.headers)
self.do_GET()
def do_GET(self):
logging.info("\n%s\n%s", self.requestline, self.headers)
logger.info("\n%s\n%s", self.requestline, self.headers)
if self.path == "/site5/redirect/":
self.send_response(303, "See other")
self.send_header("Connection", "close")
@ -270,7 +273,7 @@ def test_proxy_non_warcprox(httpd):
def do_HEAD(self):
if not hasattr(self.server, "requests"):
self.server.requests = []
logging.info("%s %s", self.command, self.path)
logger.info("%s %s", self.command, self.path)
self.server.requests.append("%s %s" % (self.command, self.path))
response = urllib.request.urlopen(self.path)
self.wfile.write(
@ -292,7 +295,7 @@ def test_proxy_non_warcprox(httpd):
def do_WARCPROX_WRITE_RECORD(self):
if not hasattr(self.server, "requests"):
self.server.requests = []
logging.info("%s %s", self.command, self.path)
logger.info("%s %s", self.command, self.path)
self.send_error(400)
proxy = http.server.HTTPServer(("localhost", 0), DumbProxyRequestHandler)
@ -826,7 +829,7 @@ def test_warcprox_outage_resiliency(httpd):
try:
stop_service("warcprox")
except Exception as e:
logging.warning("problem stopping warcprox service: %s", e)
logger.warning("problem stopping warcprox service: %s", exc_info=True)
# queue the site for brozzling
brozzler.new_site(frontier, site)

View File

@ -24,7 +24,6 @@ import os
import brozzler
import brozzler.chrome
import brozzler.ydl
import logging
import yaml
import datetime
import requests
@ -36,15 +35,6 @@ import sys
import threading
from unittest import mock
logging.basicConfig(
stream=sys.stderr,
level=logging.INFO,
format=(
"%(asctime)s %(process)d %(levelname)s %(threadName)s "
"%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s"
),
)
@pytest.fixture(scope="module")
def httpd(request):