diff --git a/brozzler/__init__.py b/brozzler/__init__.py
index 9309116..ed74e1a 100644
--- a/brozzler/__init__.py
+++ b/brozzler/__init__.py
@@ -18,6 +18,7 @@ limitations under the License.
 """

 import logging
+import structlog
 from pkg_resources import get_distribution as _get_distribution

 __version__ = _get_distribution("brozzler").version
@@ -79,32 +80,6 @@ class ReachedLimit(Exception):
         return self.__repr__()


-# monkey-patch log levels TRACE and NOTICE
-logging.TRACE = (logging.NOTSET + logging.DEBUG) // 2
-
-
-def _logger_trace(self, msg, *args, **kwargs):
-    if self.isEnabledFor(logging.TRACE):
-        self._log(logging.TRACE, msg, args, **kwargs)
-
-
-logging.Logger.trace = _logger_trace
-logging.trace = logging.root.trace
-logging.addLevelName(logging.TRACE, "TRACE")
-
-logging.NOTICE = (logging.INFO + logging.WARN) // 2
-
-
-def _logger_notice(self, msg, *args, **kwargs):
-    if self.isEnabledFor(logging.NOTICE):
-        self._log(logging.NOTICE, msg, args, **kwargs)
-
-
-logging.Logger.notice = _logger_notice
-logging.notice = logging.root.notice
-logging.addLevelName(logging.NOTICE, "NOTICE")
-
-
 # see https://github.com/internetarchive/brozzler/issues/91
 def _logging_handler_handle(self, record):
     rv = self.filter(record)
@@ -146,7 +121,9 @@ def behavior_script(url, template_parameters=None, behaviors_dir=None):
     """
     Returns the javascript behavior string populated with template_parameters.
     """
-    import re, logging, json
+    import re, json
+
+    logger = structlog.get_logger()

     for behavior in behaviors(behaviors_dir=behaviors_dir):
         if re.match(behavior["url_regex"], url):
@@ -159,18 +136,18 @@ def behavior_script(url, template_parameters=None, behaviors_dir=None):
                 behavior["behavior_js_template"]
             )
             script = template.render(parameters)
-            logging.info(
-                "using template=%r populated with parameters=%r for %r",
-                behavior["behavior_js_template"],
-                json.dumps(parameters),
-                url,
+            logger.info(
+                "rendering template",
+                template=behavior["behavior_js_template"],
+                parameters=json.dumps(parameters),
+                url=url,
             )
             return script
     return None


 class ThreadExceptionGate:
-    logger = logging.getLogger(__module__ + "." + __qualname__)
+    logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)

     def __init__(self, thread):
         self.thread = thread
@@ -181,7 +158,9 @@ class ThreadExceptionGate:
     def __enter__(self):
         assert self.thread == threading.current_thread()
         if self.pending_exception:
-            self.logger.info("raising pending exception %s", self.pending_exception)
+            self.logger.info(
+                "raising pending exception", pending_exception=self.pending_exception
+            )
             tmp = self.pending_exception
             self.pending_exception = None
             raise tmp
@@ -198,10 +177,10 @@ class ThreadExceptionGate:
         with self.lock:
             if self.pending_exception:
                 self.logger.warning(
-                    "%r already pending for thread %r, discarding %r",
-                    self.pending_exception,
-                    self.thread,
-                    e,
+                    "exception already pending for thread, discarding",
+                    pending_exception=self.pending_exception,
+                    thread=self.thread,
+                    exception=e,
                 )
             else:
                 self.pending_exception = e
@@ -266,7 +245,9 @@ def thread_raise(thread, exctype):
         TypeError if `exctype` is not a class
         ValueError, SystemError in case of unexpected problems
     """
-    import ctypes, inspect, threading, logging
+    import ctypes, inspect, threading, structlog
+
+    logger = structlog.get_logger(exctype=exctype, thread=thread)

     if not inspect.isclass(exctype):
         raise TypeError(
@@ -278,7 +259,7 @@ def thread_raise(thread, exctype):
     with gate.lock:
         if gate.ok_to_raise.is_set() and thread.is_alive():
             gate.ok_to_raise.clear()
-            logging.info("raising %s in thread %s", exctype, thread)
+            logger.info("raising exception in thread")
             res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
                 ctypes.c_long(thread.ident), ctypes.py_object(exctype)
             )
@@ -290,7 +271,7 @@ def thread_raise(thread, exctype):
                 ctypes.pythonapi.PyThreadState_SetAsyncExc(thread.ident, 0)
                 raise SystemError("PyThreadState_SetAsyncExc failed")
         else:
-            logging.info("queueing %s for thread %s", exctype, thread)
+            logger.info("queueing exception for thread")
             gate.queue_exception(exctype)
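Aside: the deleted block above monkey-patched custom TRACE and NOTICE levels into the stdlib `logging` module. structlog's filtering wrapper works with the standard level numbers, so the diff maps the old levels onto their nearest standard neighbors instead of re-creating them: TRACE call sites become `debug()`, NOTICE call sites become `info()` or `warning()`. A minimal sketch of the replacement behavior, using a hypothetical `brozzler.demo` logger name:

```python
import logging
import structlog

# mirror the wrapper_class used by configure_logging() later in this diff
structlog.configure(
    wrapper_class=structlog.make_filtering_bound_logger(logging.INFO),
)

logger = structlog.get_logger(logger_name="brozzler.demo")

# old monkey-patched levels              -> replacements used in this diff
# logging.TRACE  = (NOTSET+DEBUG)//2 = 5 -> logger.debug(...)
# logging.NOTICE = (INFO+WARN)//2 = 25   -> logger.info(...) / logger.warning(...)
logger.debug("was logging.trace(); filtered out at INFO level")
logger.info("was logging.info(); rendered")
logger.warning("was logging.notice(); rendered")
```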


diff --git a/brozzler/browser.py b/brozzler/browser.py
index 99f076e..f636662 100644
--- a/brozzler/browser.py
+++ b/brozzler/browser.py
@@ -31,6 +31,7 @@ import base64
 from ipaddress import AddressValueError
 from brozzler.chrome import Chrome
 import socket
+import structlog
 import urlcanon

 MAX_UNMATCHED_INVALID_CHECKS = 5
@@ -53,7 +54,7 @@ class BrowserPool:
     debugging protocol.
     """

-    logger = logging.getLogger(__module__ + "." + __qualname__)
+    logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)

     def __init__(self, size=3, **kwargs):
         """
@@ -144,7 +145,7 @@ class BrowserPool:


 class WebsockReceiverThread(threading.Thread):
-    logger = logging.getLogger(__module__ + "." + __qualname__)
+    logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)

     def __init__(self, websock, name=None, daemon=True):
         super().__init__(name=name, daemon=daemon)
@@ -194,7 +195,7 @@ class WebsockReceiverThread(threading.Thread):
         ):
             self.logger.error("websocket closed, did chrome die?")
         else:
-            self.logger.error("exception from websocket receiver thread", exc_info=1)
+            self.logger.exception("exception from websocket receiver thread")
         brozzler.thread_raise(self.calling_thread, BrowsingException)

     def run(self):
@@ -214,10 +215,9 @@ class WebsockReceiverThread(threading.Thread):
             try:
                 self._handle_message(websock, message)
             except:
-                self.logger.error(
-                    "uncaught exception in _handle_message message=%s",
-                    message,
-                    exc_info=True,
+                self.logger.exception(
+                    "uncaught exception in _handle_message",
+                    message=message,
                 )

     def _network_response_received(self, message):
@@ -232,7 +232,7 @@ class WebsockReceiverThread(threading.Thread):
                 ]
             )
             self.reached_limit = brozzler.ReachedLimit(warcprox_meta=warcprox_meta)
-            self.logger.info("reached limit %s", self.reached_limit)
+            self.logger.info("reached limit", limit=self.reached_limit)
             brozzler.thread_raise(self.calling_thread, brozzler.ReachedLimit)
         else:
             self.logger.info(
@@ -246,7 +246,7 @@ class WebsockReceiverThread(threading.Thread):
             self.page_status = status

     def _javascript_dialog_opening(self, message):
-        self.logger.info("javascript dialog opened: %s", message)
+        self.logger.info("javascript dialog opened", message=message)
         if message["params"]["type"] == "alert":
             accept = True
         else:
@@ -293,7 +293,7 @@ class WebsockReceiverThread(threading.Thread):
                 message["params"]["message"]["text"],
             )
         elif message["method"] == "Runtime.exceptionThrown":
-            self.logger.debug("uncaught exception: %s", message)
+            self.logger.debug("uncaught exception", exception=message)
         elif message["method"] == "Page.javascriptDialogOpening":
             self._javascript_dialog_opening(message)
         elif (
@@ -323,7 +323,7 @@ class Browser:
     Manages an instance of Chrome for browsing pages.
     """

-    logger = logging.getLogger(__module__ + "." + __qualname__)
+    logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)

     def __init__(self, **kwargs):
         """
@@ -366,11 +366,10 @@ class Browser:
         msg_id = next(self._command_id)
         kwargs["id"] = msg_id
         msg = json.dumps(kwargs, separators=",:")
-        logging.log(
-            logging.TRACE if suppress_logging else logging.DEBUG,
-            "sending message to %s: %s",
-            self.websock,
-            msg,
+        self.logger.debug(
+            "sending message",
+            websock=self.websock,
+            message=msg,
         )
         self.websock.send(msg)
         return msg_id
@@ -398,7 +397,7 @@ class Browser:
         # Enable Console & Runtime output only when debugging.
         # After all, we just print these events with debug(), we don't use
         # them in Brozzler logic.
-        if self.logger.isEnabledFor(logging.DEBUG):
+        if self.logger.is_enabled_for(logging.DEBUG):
             self.send_to_chrome(method="Console.enable")
             self.send_to_chrome(method="Runtime.enable")
         self.send_to_chrome(method="ServiceWorker.enable")
@@ -433,8 +432,8 @@ class Browser:
             try:
                 self.websock.close()
             except BaseException as e:
-                self.logger.error(
-                    "exception closing websocket %s - %s", self.websock, e
+                self.logger.exception(
+                    "exception closing websocket", websocket=self.websock
                 )
             self.chrome.stop()
@@ -461,7 +460,7 @@ class Browser:
                 self.websock_url = None

         except:
-            self.logger.error("problem stopping", exc_info=True)
+            self.logger.exception("problem stopping")

     def is_running(self):
         return self.websock_url is not None
@@ -567,7 +566,7 @@ class Browser:
                     # if login redirected us, return to page_url
                     if page_url != self.url().split("#")[0]:
                         self.logger.debug(
-                            "login navigated away from %s; returning!", page_url
+                            "login navigated away; returning!", page_url=page_url
                         )
                         self.navigate_to_page(page_url, timeout=page_timeout)
                 # If the target page HTTP status is 4xx/5xx, there is no point
@@ -611,7 +610,7 @@ class Browser:
                 # more information, raise that one
                 raise self.websock_thread.reached_limit
         except websocket.WebSocketConnectionClosedException as e:
-            self.logger.error("websocket closed, did chrome die?")
+            self.logger.exception("websocket closed, did chrome die?")
             raise BrowsingException(e)
         finally:
             self.is_browsing = False
@@ -633,7 +632,7 @@ class Browser:
                     on_screenshot(jpeg_bytes)
                 return
             except BrowsingTimeout as e:
-                logging.error("attempt %s/3: %s", i + 1, e)
+                self.logger.exception("attempt %s/3", i + 1)

     def visit_hashtags(self, page_url, hashtags, outlinks):
         _hashtags = set(hashtags or [])
@@ -647,7 +646,7 @@ class Browser:
         # out which hashtags were visited already and skip those
         for hashtag in _hashtags:
             # navigate_to_hashtag (nothing to wait for so no timeout?)
- self.logger.debug("navigating to hashtag %s", hashtag) + self.logger.debug("navigating to hashtag", hashtag=hashtag) url = urlcanon.whatwg(page_url) url.hash_sign = b"#" url.fragment = hashtag[1:].encode("utf-8") @@ -687,7 +686,7 @@ class Browser: ) def navigate_to_page(self, page_url, timeout=300): - self.logger.info("navigating to page %s", page_url) + self.logger.info("navigating to page", page_url=page_url) self.websock_thread.got_page_load_event = None self.websock_thread.page_status = None self.send_to_chrome(method="Page.navigate", params={"url": page_url}) @@ -715,15 +714,13 @@ class Browser: try: out.append(str(urlcanon.whatwg(link))) except AddressValueError: - self.logger.warning("skip invalid outlink: %s", link) + self.logger.warning("skip invalid outlink", outlink=link) return frozenset(out) else: # no links found return frozenset() else: - self.logger.error( - "problem extracting outlinks, result message: %s", message - ) + self.logger.error("problem extracting outlinks", message=message) return frozenset() def screenshot(self, full_page=False, timeout=45): @@ -797,11 +794,11 @@ class Browser: elapsed = time.time() - start if elapsed > timeout: logging.info( - "behavior reached hard timeout after %.1fs and %s valid checks, and %s invalid checks, for url %s", + "behavior reached hard timeout after %.1fs and %s valid checks, and %s invalid checks", elapsed, valid_behavior_checks, invalid_behavior_checks, - page_url, + page_url=page_url, ) return diff --git a/brozzler/chrome.py b/brozzler/chrome.py index cf107fd..a23c3e5 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -16,7 +16,6 @@ See the License for the specific language governing permissions and limitations under the License. """ -import logging import urllib.request import time import threading @@ -27,6 +26,7 @@ import select import re import signal import sqlite3 +import structlog import json import tempfile import sys @@ -65,7 +65,7 @@ def check_version(chrome_exe): class Chrome: - logger = logging.getLogger(__module__ + "." + __qualname__) + logger = structlog.get_logger(logger_name=__module__ + "." 


diff --git a/brozzler/chrome.py b/brozzler/chrome.py
index cf107fd..a23c3e5 100644
--- a/brozzler/chrome.py
+++ b/brozzler/chrome.py
@@ -16,7 +16,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """

-import logging
 import urllib.request
 import time
 import threading
@@ -27,6 +26,7 @@ import select
 import re
 import signal
 import sqlite3
+import structlog
 import json
 import tempfile
 import sys
@@ -65,7 +65,7 @@ def check_version(chrome_exe):


 class Chrome:
-    logger = logging.getLogger(__module__ + "." + __qualname__)
+    logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)

     def __init__(self, chrome_exe, port=9222, ignore_cert_errors=False):
         """
@@ -97,22 +97,22 @@ class Chrome:
     def _init_cookie_db(self, cookie_db):
         cookie_dir = os.path.join(self._chrome_user_data_dir, "Default")
         cookie_location = os.path.join(cookie_dir, "Cookies")
-        self.logger.debug("cookie DB provided, writing to %s", cookie_location)
+        cookie_logger = self.logger.bind(cookie_location=cookie_location)
+
+        cookie_logger.debug("cookie DB provided, writing file")
         os.makedirs(cookie_dir, exist_ok=True)
         try:
             with open(cookie_location, "wb") as cookie_file:
                 cookie_file.write(cookie_db)
         except OSError:
-            self.logger.error(
-                "exception writing cookie file at %s", cookie_location, exc_info=True
-            )
+            cookie_logger.exception("exception writing cookie file")

     def persist_and_read_cookie_db(self):
         cookie_location = os.path.join(self._chrome_user_data_dir, "Default", "Cookies")
-        self.logger.debug(
-            "marking cookies persistent then reading file into memory: %s",
-            cookie_location,
+        cookie_logger = self.logger.bind(cookie_location=cookie_location)
+        cookie_logger.debug(
+            "marking cookies persistent then reading file into memory",
         )
         try:
             with sqlite3.connect(cookie_location) as conn:
@@ -125,20 +125,14 @@ class Chrome:
                 cur = conn.cursor()
                 cur.execute("UPDATE cookies SET persistent = 1")
         except sqlite3.Error:
-            self.logger.error(
-                "exception updating cookie DB %s", cookie_location, exc_info=True
-            )
+            cookie_logger.exception("exception updating cookie DB")

         cookie_db = None
         try:
             with open(cookie_location, "rb") as cookie_file:
                 cookie_db = cookie_file.read()
         except OSError:
-            self.logger.error(
-                "exception reading from cookie DB file %s",
-                cookie_location,
-                exc_info=True,
-            )
+            cookie_logger.exception("exception reading from cookie DB file")
         return cookie_db

     def start(
@@ -228,7 +222,7 @@ class Chrome:
         if proxy:
             chrome_args.append("--proxy-server=%s" % proxy)
         chrome_args.append("about:blank")
-        self.logger.info("running: %r", subprocess.list2cmdline(chrome_args))
+        self.logger.info("running", chrome_args=subprocess.list2cmdline(chrome_args))
         # start_new_session - new process group so we can kill the whole group
         self.chrome_process = subprocess.Popen(
             chrome_args,
@@ -244,12 +238,13 @@ class Chrome:
             daemon=True,
         )
         self._out_reader_thread.start()
-        self.logger.info("chrome running, pid %s" % self.chrome_process.pid)
+        self.logger.info("chrome running", pid=self.chrome_process.pid)

         return self._websocket_url(timeout_sec=websocket_timeout)

     def _websocket_url(self, timeout_sec=60):
         json_url = "http://localhost:%s/json" % self.port
+        url_logger = self.logger.bind(json_url=json_url)
         # make this a member variable so that kill -QUIT reports it
         self._start = time.time()
         self._last_warning = self._start
@@ -260,24 +255,21 @@ class Chrome:
                 debug_info = [x for x in all_debug_info if x["url"] == "about:blank"]

                 if debug_info and "webSocketDebuggerUrl" in debug_info[0]:
-                    self.logger.debug("%s returned %s", json_url, raw_json)
+                    url_logger.debug("webSocketDebuggerUrl returned", raw_json=raw_json)
                     url = debug_info[0]["webSocketDebuggerUrl"]
-                    self.logger.info(
-                        "got chrome window websocket debug url %s from %s",
-                        url,
-                        json_url,
+                    url_logger.info(
+                        "got chrome window websocket debug url",
+                        debug_url=url,
                     )
                     return url
             except brozzler.ShutdownRequested:
                 raise
             except Exception as e:
                 if time.time() - self._last_warning > 30:
-                    self.logger.warning(
-                        "problem with %s (will keep trying until timeout "
-                        "of %d seconds): %s",
-                        json_url,
-                        timeout_sec,
-                        e,
+                    url_logger.warning(
+                        "problem accessing url (will keep trying until timeout)",
+                        timeout=timeout_sec,
+                        exc_info=True,
                     )
                     self._last_warning = time.time()
             finally:
@@ -323,26 +315,28 @@ class Chrome:
             while not self._shutdown.is_set():
                 buf = readline_nonblock(self.chrome_process.stdout)
                 if buf:
-                    self.logger.trace(
+                    self.logger.debug(
                         "chrome pid %s STDOUT %s", self.chrome_process.pid, buf
                     )

                 buf = readline_nonblock(self.chrome_process.stderr)
                 if buf:
-                    self.logger.trace(
+                    self.logger.debug(
                         "chrome pid %s STDERR %s", self.chrome_process.pid, buf
                     )
         except:
-            self.logger.error("unexpected exception", exc_info=True)
+            self.logger.exception("unexpected exception")

     def stop(self):
         if not self.chrome_process or self._shutdown.is_set():
             return
         self._shutdown.set()

+        pid_logger = self.logger.bind(pid=self.chrome_process.pid)
+
         timeout_sec = 300
         if self.chrome_process.poll() is None:
-            self.logger.info("terminating chrome pgid %s", self.chrome_process.pid)
+            pid_logger.info("terminating chrome")

             os.killpg(self.chrome_process.pid, signal.SIGTERM)
         t0 = time.time()
@@ -352,14 +346,11 @@ class Chrome:
                 status = self.chrome_process.poll()
                 if status is not None:
                     if status == 0:
-                        self.logger.info(
-                            "chrome pid %s exited normally", self.chrome_process.pid
-                        )
+                        pid_logger.info("chrome exited normally")
                     else:
-                        self.logger.warning(
-                            "chrome pid %s exited with nonzero status %s",
-                            self.chrome_process.pid,
-                            status,
+                        pid_logger.warning(
+                            "chrome exited with nonzero status",
+                            status=status,
                         )

                     # XXX I would like to forcefully kill the process group
@@ -369,18 +360,16 @@ class Chrome:
                     return
                 time.sleep(0.5)

-            self.logger.warning(
-                "chrome pid %s still alive %.1f seconds after sending "
-                "SIGTERM, sending SIGKILL",
-                self.chrome_process.pid,
+            pid_logger.warning(
+                "chrome still alive %.1f seconds after sending SIGTERM, "
+                "sending SIGKILL",
                 time.time() - t0,
             )
             os.killpg(self.chrome_process.pid, signal.SIGKILL)
             status = self.chrome_process.wait()
-            self.logger.warning(
-                "chrome pid %s reaped (status=%s) after killing with "
-                "SIGKILL",
-                self.chrome_process.pid,
-                status,
+            pid_logger.warning(
+                "chrome reaped after killing with SIGKILL",
+                status=status,
             )

         finally:
@@ -389,8 +378,8 @@ class Chrome:
             try:
                 self._home_tmpdir.cleanup()
             except:
-                self.logger.error(
-                    "exception deleting %s", self._home_tmpdir, exc_info=True
-                )
+                self.logger.exception(
+                    "exception deleting self._home_tmpdir", tmpdir=self._home_tmpdir
+                )
             self._out_reader_thread.join()
             self.chrome_process = None
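Aside: chrome.py introduces the `bind()` pattern used throughout the rest of this diff (`cookie_logger`, `url_logger`, `pid_logger`): attach a key/value pair to a logger once, and every event logged through it carries that field, instead of repeating a format argument at each call site. A minimal sketch with an assumed cookie path:

```python
import structlog

logger = structlog.get_logger(logger_name="brozzler.chrome.demo")

cookie_location = "/tmp/chrome-profile/Default/Cookies"  # assumed path
cookie_logger = logger.bind(cookie_location=cookie_location)

cookie_logger.debug("cookie DB provided, writing file")
try:
    with open(cookie_location, "wb") as f:
        f.write(b"")
except OSError:
    # cookie_location rides along on this event too, no extra argument needed
    cookie_logger.exception("exception writing cookie file")
```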
event_dict.get("logger_name") + if old_name is None: + return event_dict + + try: + filename = event_dict.pop("filename") + func_name = event_dict.pop("func_name") + lineno = event_dict.pop("lineno") + except KeyError: + return event_dict + new_name = f"{old_name}.{func_name}({filename}:{lineno})" + event_dict["logger_name"] = new_name + + return event_dict + + def configure_logging(args): + structlog.configure( + processors=[ + structlog.contextvars.merge_contextvars, + structlog.processors.add_log_level, + structlog.processors.StackInfoRenderer(), + structlog.dev.set_exc_info, + structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S", utc=False), + structlog.processors.CallsiteParameterAdder( + [ + structlog.processors.CallsiteParameter.FILENAME, + structlog.processors.CallsiteParameter.FUNC_NAME, + structlog.processors.CallsiteParameter.LINENO, + ], + ), + decorate_logger_name, + structlog.dev.ConsoleRenderer(), + ], + wrapper_class=structlog.make_filtering_bound_logger(args.log_level), + context_class=dict, + logger_factory=structlog.PrintLoggerFactory(), + cache_logger_on_first_use=False, + ) + + # We still configure logging for now because its handlers + # are used for the gunicorn spawned by the brozzler dashboard. logging.basicConfig( stream=sys.stderr, level=args.log_level, @@ -126,8 +173,36 @@ def configure_logging(args): ) -def suggest_default_chrome_exe(): - # mac os x application executable paths +def mdfind(identifier): + try: + result = subprocess.check_output( + ["mdfind", f"kMDItemCFBundleIdentifier == {identifier}"], text=True + ) + # Just treat any errors as "couldn't find app" + except subprocess.CalledProcessError: + return None + + if result: + return result.rstrip("\n") + + +def suggest_default_chrome_exe_mac(): + path = None + # Try Chromium first, then Chrome + result = mdfind("org.chromium.Chromium") + if result is not None: + path = f"{result}/Contents/MacOS/Chromium" + + result = mdfind("com.google.Chrome") + if result is not None: + path = f"{result}/Contents/MacOS/Google Chrome" + + if path is not None and os.path.exists(path): + return path + + # Fall back to default paths if mdfind couldn't find it + # (mdfind might fail to find them even in their default paths + # if the system has Spotlight disabled.) 
@@ -126,8 +173,36 @@ def configure_logging(args):
     )


-def suggest_default_chrome_exe():
-    # mac os x application executable paths
+def mdfind(identifier):
+    try:
+        result = subprocess.check_output(
+            ["mdfind", f"kMDItemCFBundleIdentifier == {identifier}"], text=True
+        )
+    # Just treat any errors as "couldn't find app"
+    except subprocess.CalledProcessError:
+        return None
+
+    if result:
+        return result.rstrip("\n")
+
+
+def suggest_default_chrome_exe_mac():
+    path = None
+    # Try Chromium first, then Chrome
+    result = mdfind("org.chromium.Chromium")
+    if result is not None:
+        path = f"{result}/Contents/MacOS/Chromium"
+
+    result = mdfind("com.google.Chrome")
+    if result is not None:
+        path = f"{result}/Contents/MacOS/Google Chrome"
+
+    if path is not None and os.path.exists(path):
+        return path
+
+    # Fall back to default paths if mdfind couldn't find it
+    # (mdfind might fail to find them even in their default paths
+    # if the system has Spotlight disabled.)
     for path in [
         "/Applications/Thorium.app/Contents/MacOS/Thorium",
         "/Applications/Chromium.app/Contents/MacOS/Chromium",
@@ -136,6 +211,14 @@ def suggest_default_chrome_exe_mac():
         if os.path.exists(path):
             return path

+
+def suggest_default_chrome_exe():
+    # First ask mdfind, which lets us find it in non-default paths
+    if sys.platform == "darwin":
+        path = suggest_default_chrome_exe_mac()
+        if path is not None:
+            return path
+
     # "chromium-browser" is the executable on ubuntu trusty
     # https://github.com/internetarchive/brozzler/pull/6/files uses "chromium"
     # google chrome executable names taken from these packages:
@@ -319,7 +402,7 @@ def brozzle_page(argv=None):
         )
         with open(filename, "wb") as f:
             f.write(screenshot_jpeg)
-        logging.info("wrote screenshot to %s", filename)
+        logger.info("wrote screenshot", filename=filename)

     browser = brozzler.Browser(chrome_exe=args.chrome_exe)
     try:
@@ -335,11 +418,11 @@ def brozzle_page(argv=None):
             on_screenshot=on_screenshot,
             enable_youtube_dl=not args.skip_youtube_dl,
         )
-        logging.info("outlinks: \n\t%s", "\n\t".join(sorted(outlinks)))
+        logger.info("outlinks: \n\t%s", "\n\t".join(sorted(outlinks)))
     except brozzler.ReachedLimit as e:
-        logging.error("reached limit %s", e)
+        logger.exception("reached limit")
     except brozzler.PageInterstitialShown as e:
-        logging.error("page interstitial shown %s", e)
+        logger.exception("page interstitial shown")
     finally:
         browser.stop()
@@ -597,11 +680,11 @@ def brozzler_worker(argv=None):
                     state_strs.append("<???:%s>" % ident)
                 stack = traceback.format_stack(frames[ident])
                 state_strs.append("".join(stack))
-            logging.info(
-                "dumping state (caught signal %s)\n%s" % (signum, "\n".join(state_strs))
+            logger.info(
+                "dumping state (caught signal)", signal=signum, state="\n".join(state_strs)
             )
         except BaseException as e:
-            logging.error("exception dumping state: %s" % e)
+            logger.exception("exception dumping state")
         finally:
             signal.signal(signal.SIGQUIT, dump_state)
@@ -612,13 +695,13 @@ def brozzler_worker(argv=None):
             with open(YTDLP_PROXY_ENDPOINTS_FILE) as endpoints:
                 ytdlp_proxy_endpoints = [l for l in endpoints.readlines()]
                 if ytdlp_proxy_endpoints:
-                    logging.info(
-                        "running with ytdlp proxy endpoints file %s"
-                        % YTDLP_PROXY_ENDPOINTS_FILE
+                    logger.info(
+                        "running with ytdlp proxy endpoints file",
+                        ytdlp_proxy_endpoints=YTDLP_PROXY_ENDPOINTS_FILE,
                     )
         except Exception as e:
             ytdlp_proxy_endpoints = []
-            logging.info("running with empty proxy endpoints file")
+            logger.info("running with empty proxy endpoints file")

         return ytdlp_proxy_endpoints

     rr = rethinker(args)
@@ -650,7 +733,7 @@ def brozzler_worker(argv=None):
     th = threading.Thread(target=worker.run, name="BrozzlerWorkerThread")
     th.start()
     th.join()
-    logging.info("brozzler-worker is all done, exiting")
+    logger.info("brozzler-worker is all done, exiting")


 def brozzler_ensure_tables(argv=None):
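Aside: the converted call sites use two styles, sometimes mixed. Recent structlog's `FilteringBoundLogger` %-interpolates positional arguments into the event string, so printf-style calls like `logger.info("inserting batch of %s pages", len(batch))` keep working; keyword arguments instead stay separate, machine-readable fields. A small illustration, assuming structlog's default filtering wrapper:

```python
import structlog

logger = structlog.get_logger(logger_name="brozzler.cli.demo")

# positional args are %-interpolated into the event text
logger.info("inserting batch of %s pages", 500)
# -> event reads "inserting batch of 500 pages"

# keyword args remain discrete fields alongside the event text
logger.info("inserting batch of pages", batch_size=500)
# -> event reads "inserting batch of pages" with batch_size=500 attached
```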
@@ -724,18 +807,18 @@ def brozzler_list_jobs(argv=None):
         except ValueError:
             job_id = args.job
         reql = rr.table("jobs").get(job_id)
-        logging.debug("querying rethinkdb: %s", reql)
+        logger.debug("querying rethinkdb", query=reql)
         result = reql.run()
         if result:
             results = [reql.run()]
         else:
-            logging.error("no such job with id %r", job_id)
+            logger.error("no such job with id", job_id=job_id)
             sys.exit(1)
     else:
         reql = rr.table("jobs").order_by("id")
         if args.active:
             reql = reql.filter({"status": "ACTIVE"})
-        logging.debug("querying rethinkdb: %s", reql)
+        logger.debug("querying rethinkdb", query=reql)
         results = reql.run()
     if args.yaml:
         yaml.dump_all(
@@ -800,7 +883,7 @@ def brozzler_list_sites(argv=None):
         )
     elif args.site:
         reql = reql.get_all(args.site)
-    logging.debug("querying rethinkdb: %s", reql)
+    logger.debug("querying rethinkdb", query=reql)
     results = reql.run()
     if args.yaml:
         yaml.dump_all(
@@ -868,7 +951,7 @@ def brozzler_list_pages(argv=None):
         except ValueError:
             job_id = args.job
         reql = rr.table("sites").get_all(job_id, index="job_id")["id"]
-        logging.debug("querying rethinkb: %s", reql)
+        logger.debug("querying rethinkdb", query=reql)
         site_ids = reql.run()
     elif args.site:
         try:
@@ -897,7 +980,7 @@ def brozzler_list_pages(argv=None):
             reql = reql.order_by(index="least_hops")
         if args.claimed:
             reql = reql.filter({"claimed": True})
-        logging.debug("querying rethinkb: %s", reql)
+        logger.debug("querying rethinkdb", query=reql)
         results = reql.run()
     if args.yaml:
         yaml.dump_all(
@@ -963,20 +1046,20 @@ def brozzler_purge(argv=None):
             job_id = args.job
         job = brozzler.Job.load(rr, job_id)
         if not job:
-            logging.fatal("no such job %r", job_id)
+            logger.fatal("no such job", job_id=job_id)
             sys.exit(1)
         if job.status == "ACTIVE":
             if args.force:
-                logging.warning(
-                    "job %s has status ACTIVE, purging anyway because "
+                logger.warning(
+                    "job has status ACTIVE, purging anyway because "
                     "--force was supplied",
-                    job_id,
+                    job_id=job_id,
                 )
             else:
-                logging.fatal(
-                    "refusing to purge job %s because status is ACTIVE "
+                logger.fatal(
+                    "refusing to purge job because status is ACTIVE "
                     "(override with --force)",
-                    job_id,
+                    job_id=job_id,
                 )
                 sys.exit(1)
         _purge_job(rr, job_id)
@@ -984,20 +1067,20 @@ def brozzler_purge(argv=None):
         site_id = args.site
         site = brozzler.Site.load(rr, site_id)
         if not site:
-            logging.fatal("no such job %r", job_id)
+            logger.fatal("no such site", site_id=site_id)
             sys.exit(1)
         if site.status == "ACTIVE":
             if args.force:
-                logging.warning(
-                    "site %s has status ACTIVE, purging anyway because "
+                logger.warning(
+                    "site has status ACTIVE, purging anyway because "
                     "--force was supplied",
-                    site_id,
+                    site_id=site_id,
                 )
             else:
-                logging.fatal(
-                    "refusing to purge site %s because status is ACTIVE "
+                logger.fatal(
+                    "refusing to purge site because status is ACTIVE "
                    "(override with --force)",
-                    site_id,
+                    site_id=site_id,
                 )
                 sys.exit(1)
         _purge_site(rr, site_id)
@@ -1016,7 +1099,7 @@ def brozzler_purge(argv=None):
                 .lt(finished_before)
             )
         )
-        logging.debug("retrieving jobs older than %s: %s", finished_before, reql)
+        logger.debug("retrieving jobs older than %s", finished_before, query=reql)
         for job in reql.run():
             # logging.info('job %s finished=%s starts_and_stops[-1]["stop"]=%s',
             #        job['id'], job.get('finished'),
@@ -1034,27 +1117,31 @@ def _purge_site(rr, site_id):
         )
         .delete()
     )
-    logging.debug("purging pages for site %s: %s", site_id, reql)
+    site_logger = logger.bind(site_id=site_id)
+
+    site_logger.debug("purging pages for site", query=reql)
     result = reql.run()
-    logging.info("purged pages for site %s: %s", site_id, result)
+    site_logger.info("purged pages for site", result=result)
     reql = rr.table("sites").get(site_id).delete()
-    logging.debug("purging site %s: %s", site_id, reql)
+    site_logger.debug("purging site", query=reql)
     result = reql.run()
-    logging.info("purged site %s: %s", site_id, result)
+    site_logger.info("purged site", result=result)


 def _purge_job(rr, job_id):
+    job_logger = logger.bind(job_id=job_id)
+
     reql = rr.table("sites").get_all(job_id, index="job_id").get_field("id")
-    logging.debug("querying rethinkdb: %s", reql)
+    job_logger.debug("querying rethinkdb", query=reql)
     site_ids = list(reql.run())
     for site_id in site_ids:
         _purge_site(rr, site_id)
rr.table("jobs").get(job_id).delete() - logging.debug("purging job %s: %s", job_id, reql) + job_logger.debug("purging job", query=reql) result = reql.run() - logging.info("purged job %s: %s", job_id, result) + job_logger.info("purged job", result=result) def brozzler_list_captures(argv=None): @@ -1101,7 +1188,7 @@ def brozzler_list_captures(argv=None): if args.url_or_sha1[:5] == "sha1:": if args.prefix: - logging.warning( + logger.warning( "ignoring supplied --prefix option which does not apply " "to lookup by sha1" ) @@ -1112,7 +1199,7 @@ def brozzler_list_captures(argv=None): [sha1base32, r.maxval, r.maxval], index="sha1_warc_type", ) - logging.debug("querying rethinkdb: %s", reql) + logger.debug("querying rethinkdb", query=reql) results = reql.run() else: key = urlcanon.semantic(args.url_or_sha1).surt().decode("ascii") @@ -1135,7 +1222,7 @@ def brozzler_list_captures(argv=None): lambda capture: (capture["canon_surt"] >= key) & (capture["canon_surt"] <= end_key) ) - logging.debug("querying rethinkdb: %s", reql) + logger.debug("querying rethinkdb", query=reql) results = reql.run() if args.yaml: @@ -1180,7 +1267,7 @@ def brozzler_stop_crawl(argv=None): job_id = args.job_id job = brozzler.Job.load(rr, job_id) if not job: - logging.fatal("job not found with id=%r", job_id) + logger.fatal("job not found with", id=job_id) sys.exit(1) job.stop_requested = doublethink.utcnow() job.save() @@ -1191,7 +1278,7 @@ def brozzler_stop_crawl(argv=None): site_id = args.site_id site = brozzler.Site.load(rr, site_id) if not site: - logging.fatal("site not found with id=%r", site_id) + logger.fatal("site not found with", id=site_id) sys.exit(1) site.stop_requested = doublethink.utcnow() site.save() diff --git a/brozzler/dashboard/__init__.py b/brozzler/dashboard/__init__.py index 16e34c1..e53d363 100644 --- a/brozzler/dashboard/__init__.py +++ b/brozzler/dashboard/__init__.py @@ -17,13 +17,15 @@ See the License for the specific language governing permissions and limitations under the License. 
""" -import logging +import structlog import sys +logger = structlog.get_logger() + try: import flask except ImportError as e: - logging.critical( + logger.critical( '%s: %s\n\nYou might need to run "pip install ' 'brozzler[dashboard]".\nSee README.rst for more information.', type(e).__name__, @@ -77,7 +79,7 @@ def queued_count(site_id): ) .count() ) - logging.debug("querying rethinkdb: %s", reql) + logger.debug("querying rethinkdb", query=reql) count = reql.run() return flask.jsonify(count=count) @@ -85,7 +87,7 @@ def queued_count(site_id): @app.route("/api/sites//queue") @app.route("/api/site//queue") def queue(site_id): - logging.debug("flask.request.args=%s", flask.request.args) + logger.debug("flask.request.args", args=flask.request.args) start = flask.request.args.get("start", 0) end = flask.request.args.get("end", start + 90) reql = rr.table("pages").between( @@ -93,7 +95,7 @@ def queue(site_id): [site_id, 0, False, r.maxval], index="priority_by_site", )[start:end] - logging.debug("querying rethinkdb: %s", reql) + logger.debug("querying rethinkdb", query=reql) queue_ = reql.run() return flask.jsonify(queue_=list(queue_)) @@ -112,7 +114,7 @@ def page_count(site_id): ) .count() ) - logging.debug("querying rethinkdb: %s", reql) + logger.debug("querying rethinkdb", query=reql) count = reql.run() return flask.jsonify(count=count) @@ -130,7 +132,7 @@ def pages(site_id): ) .order_by(index="least_hops")[start:end] ) - logging.debug("querying rethinkdb: %s", reql) + logger.debug("querying rethinkdb", query=reql) pages_ = reql.run() return flask.jsonify(pages=list(pages_)) @@ -139,7 +141,7 @@ def pages(site_id): @app.route("/api/page/") def page(page_id): reql = rr.table("pages").get(page_id) - logging.debug("querying rethinkdb: %s", reql) + logger.debug("querying rethinkdb", query=reql) page_ = reql.run() return flask.jsonify(page_) @@ -148,7 +150,7 @@ def page(page_id): @app.route("/api/page//yaml") def page_yaml(page_id): reql = rr.table("pages").get(page_id) - logging.debug("querying rethinkdb: %s", reql) + logger.debug("querying rethinkdb", query=reql) page_ = reql.run() return app.response_class( yaml.dump(page_, default_flow_style=False), mimetype="application/yaml" @@ -159,7 +161,7 @@ def page_yaml(page_id): @app.route("/api/site/") def site(site_id): reql = rr.table("sites").get(site_id) - logging.debug("querying rethinkdb: %s", reql) + logger.debug("querying rethinkdb", query=reql) s = reql.run() if "cookie_db" in s: s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii") @@ -170,7 +172,7 @@ def site(site_id): @app.route("/api/site//yaml") def site_yaml(site_id): reql = rr.table("sites").get(site_id) - logging.debug("querying rethinkdb: %s", reql) + logger.debug("querying rethinkdb", query=reql) site_ = reql.run() return app.response_class( yaml.dump(site_, default_flow_style=False), mimetype="application/yaml" @@ -180,7 +182,7 @@ def site_yaml(site_id): @app.route("/api/stats/") def stats(bucket): reql = rr.table("stats").get(bucket) - logging.debug("querying rethinkdb: %s", reql) + logger.debug("querying rethinkdb", query=reql) stats_ = reql.run() return flask.jsonify(stats_) @@ -193,7 +195,7 @@ def sites(job_id): except ValueError: jid = job_id reql = rr.table("sites").get_all(jid, index="job_id") - logging.debug("querying rethinkdb: %s", reql) + logger.debug("querying rethinkdb", query=reql) sites_ = list(reql.run()) # TypeError: is not JSON serializable for s in sites_: @@ -206,7 +208,7 @@ def sites(job_id): def jobless_sites(): # XXX inefficient (unindexed) query 
reql = rr.table("sites").filter(~r.row.has_fields("job_id")) - logging.debug("querying rethinkdb: %s", reql) + logger.debug("querying rethinkdb", query=reql) sites_ = list(reql.run()) # TypeError: is not JSON serializable for s in sites_: @@ -223,7 +225,7 @@ def job(job_id): except ValueError: jid = job_id reql = rr.table("jobs").get(jid) - logging.debug("querying rethinkdb: %s", reql) + logger.debug("querying rethinkdb", query=reql) job_ = reql.run() return flask.jsonify(job_) @@ -236,7 +238,7 @@ def job_yaml(job_id): except ValueError: jid = job_id reql = rr.table("jobs").get(jid) - logging.debug("querying rethinkdb: %s", reql) + logger.debug("querying rethinkdb", query=reql) job_ = reql.run() return app.response_class( yaml.dump(job_, default_flow_style=False), mimetype="application/yaml" @@ -258,7 +260,7 @@ def services(): @app.route("/api/jobs") def jobs(): reql = rr.table("jobs").order_by(r.desc("id")) - logging.debug("querying rethinkdb: %s", reql) + logger.debug("querying rethinkdb", query=reql) jobs_ = list(reql.run()) return flask.jsonify(jobs=jobs_) @@ -313,13 +315,13 @@ try: return self.application def run(**options): - logging.info("running brozzler-dashboard using gunicorn") + logger.info("running brozzler-dashboard using gunicorn") GunicornBrozzlerDashboard(app, options).run() except ImportError: def run(): - logging.info("running brozzler-dashboard using simple flask app.run") + logger.info("running brozzler-dashboard using simple flask app.run") app.run(host=SETTINGS["DASHBOARD_INTERFACE"], port=SETTINGS["DASHBOARD_PORT"]) diff --git a/brozzler/easy.py b/brozzler/easy.py index f8392f6..2834aa4 100644 --- a/brozzler/easy.py +++ b/brozzler/easy.py @@ -18,8 +18,8 @@ See the License for the specific language governing permissions and limitations under the License. """ +import structlog import sys -import logging try: import warcprox @@ -30,11 +30,11 @@ try: import wsgiref.handlers import brozzler.dashboard except ImportError as e: - logging.critical( + structlog.get_logger().critical( '%s: %s\n\nYou might need to run "pip install ' 'brozzler[easy]".\nSee README.rst for more information.', type(e).__name__, - e, + exc_info=True, ) sys.exit(1) import argparse @@ -156,7 +156,7 @@ class ThreadingWSGIServer( class BrozzlerEasyController: - logger = logging.getLogger(__module__ + "." + __qualname__) + logger = structlog.get_logger(logger_name=__module__ + "." 


diff --git a/brozzler/easy.py b/brozzler/easy.py
index f8392f6..2834aa4 100644
--- a/brozzler/easy.py
+++ b/brozzler/easy.py
@@ -18,8 +18,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """

+import structlog
 import sys
-import logging

 try:
     import warcprox
@@ -30,11 +30,11 @@ try:
     import wsgiref.handlers
     import brozzler.dashboard
 except ImportError as e:
-    logging.critical(
+    structlog.get_logger().critical(
         '%s: %s\n\nYou might need to run "pip install '
         'brozzler[easy]".\nSee README.rst for more information.',
         type(e).__name__,
         e,
+        exc_info=True,
     )
     sys.exit(1)
 import argparse
@@ -156,7 +156,7 @@ class ThreadingWSGIServer(


 class BrozzlerEasyController:
-    logger = logging.getLogger(__module__ + "." + __qualname__)
+    logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)

     def __init__(self, args):
         self.stop = threading.Event()
@@ -238,11 +238,14 @@ class BrozzlerEasyController:
         self.logger.info("starting brozzler-worker")
         self.brozzler_worker.start()

-        self.logger.info("starting pywb at %s:%s", *self.pywb_httpd.server_address)
+        self.logger.info(
+            "starting pywb", address="%s:%s" % self.pywb_httpd.server_address
+        )
         threading.Thread(target=self.pywb_httpd.serve_forever).start()

         self.logger.info(
-            "starting brozzler-dashboard at %s:%s", *self.dashboard_httpd.server_address
+            "starting brozzler-dashboard",
+            address="%s:%s" % self.dashboard_httpd.server_address,
         )
         threading.Thread(target=self.dashboard_httpd.serve_forever).start()
@@ -307,8 +310,8 @@ class BrozzlerEasyController:
             state_strs.append(str(th))
             stack = traceback.format_stack(sys._current_frames()[th.ident])
             state_strs.append("".join(stack))
-        logging.warning(
-            "dumping state (caught signal {})\n{}".format(signum, "\n".join(state_strs))
+        structlog.get_logger().warning(
+            "dumping state (caught signal)", signal=signum, state="\n".join(state_strs)
         )


diff --git a/brozzler/frontier.py b/brozzler/frontier.py
index c6ac971..0ef3924 100644
--- a/brozzler/frontier.py
+++ b/brozzler/frontier.py
@@ -16,12 +16,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """

-import logging
 import brozzler
 import random
 import time
 import datetime
 import rethinkdb as rdb
+import structlog
 import doublethink
 import urlcanon

@@ -33,7 +33,7 @@ class UnexpectedDbResult(Exception):


 class RethinkDbFrontier:
-    logger = logging.getLogger(__module__ + "." + __qualname__)
+    logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)

     def __init__(self, rr, shards=None, replicas=None):
         self.rr = rr
@@ -42,15 +42,15 @@ class RethinkDbFrontier:
         self._ensure_db()

     def _ensure_db(self):
+        db_logger = self.logger.bind(dbname=self.rr.dbname)
+
         dbs = self.rr.db_list().run()
         if not self.rr.dbname in dbs:
-            self.logger.info("creating rethinkdb database %r", self.rr.dbname)
+            db_logger.info("creating rethinkdb database")
             self.rr.db_create(self.rr.dbname).run()
         tables = self.rr.table_list().run()
         if not "sites" in tables:
-            self.logger.info(
-                "creating rethinkdb table 'sites' in database %r", self.rr.dbname
-            )
+            db_logger.info("creating rethinkdb table 'sites' in database")
             self.rr.table_create(
                 "sites", shards=self.shards, replicas=self.replicas
             ).run()
@@ -59,9 +59,7 @@ class RethinkDbFrontier:
             ).run()
             self.rr.table("sites").index_create("job_id").run()
         if not "pages" in tables:
-            self.logger.info(
-                "creating rethinkdb table 'pages' in database %r", self.rr.dbname
-            )
+            db_logger.info("creating rethinkdb table 'pages' in database")
             self.rr.table_create(
                 "pages", shards=self.shards, replicas=self.replicas
             ).run()
@@ -81,9 +79,7 @@ class RethinkDbFrontier:
                 [r.row["site_id"], r.row["brozzle_count"], r.row["hops_from_seed"]],
             ).run()
         if not "jobs" in tables:
-            self.logger.info(
-                "creating rethinkdb table 'jobs' in database %r", self.rr.dbname
-            )
+            db_logger.info("creating rethinkdb table 'jobs' in database")
             self.rr.table_create(
                 "jobs", shards=self.shards, replicas=self.replicas
             ).run()
@@ -108,7 +104,7 @@ class RethinkDbFrontier:
         )

     def claim_sites(self, n=1):
-        self.logger.trace("claiming up to %s sites to brozzle", n)
+        self.logger.debug("claiming up to %s sites to brozzle", n)
         result = (
             self.rr.table("sites")
             .get_all(
@@ -186,10 +182,10 @@ class RethinkDbFrontier:
                 if result["changes"][i]["old_val"]["claimed"]:
                     self.logger.warning(
                         "re-claimed site that was still marked 'claimed' "
-                        "because it was last claimed a long time ago "
-                        "at %s, and presumably some error stopped it from "
+                        "because it was last claimed a long time ago, "
+                        "and presumably some error stopped it from "
                         "being disclaimed",
-                        result["changes"][i]["old_val"]["last_claimed"],
+                        last_claimed=result["changes"][i]["old_val"]["last_claimed"],
                     )
                 site = brozzler.Site(self.rr, result["changes"][i]["new_val"])
                 sites.append(site)
@@ -205,10 +201,10 @@ class RethinkDbFrontier:
         """
         if site.time_limit and site.time_limit > 0 and site.elapsed() > site.time_limit:
             self.logger.debug(
-                "site FINISHED_TIME_LIMIT! time_limit=%s " "elapsed=%s %s",
-                site.time_limit,
-                site.elapsed(),
-                site,
+                "site FINISHED_TIME_LIMIT!",
+                time_limit=site.time_limit,
+                elapsed=site.elapsed(),
+                site=site,
             )
             raise brozzler.ReachedTimeLimit

@@ -273,7 +269,7 @@ class RethinkDbFrontier:
         """Raises brozzler.CrawlStopped if stop has been requested."""
         site.refresh()
         if site.stop_requested and site.stop_requested <= doublethink.utcnow():
-            self.logger.info("stop requested for site %s", site.id)
+            self.logger.info("stop requested for site", site_id=site.id)
             raise brozzler.CrawlStopped

         if site.job_id:
@@ -283,7 +279,7 @@ class RethinkDbFrontier:
                 and job.stop_requested
                 and job.stop_requested <= doublethink.utcnow()
             ):
-                self.logger.info("stop requested for job %s", site.job_id)
+                self.logger.info("stop requested for job", job_id=site.job_id)
                 raise brozzler.CrawlStopped

     def _maybe_finish_job(self, job_id):
@@ -304,7 +300,7 @@ class RethinkDbFrontier:
                 return False
             n += 1

-        self.logger.info("all %s sites finished, job %s is FINISHED!", n, job.id)
+        self.logger.info("all %s sites finished, job is FINISHED!", n, job_id=job.id)
         job.finish()
         job.save()
         return True
@@ -320,7 +316,7 @@ class RethinkDbFrontier:
             self._maybe_finish_job(site.job_id)

     def disclaim_site(self, site, page=None):
-        self.logger.info("disclaiming %s", site)
+        self.logger.info("disclaiming", site=site)
         site.claimed = False
         site.last_disclaimed = doublethink.utcnow()
         if not page and not self.has_outstanding_pages(site):
@@ -468,17 +464,16 @@ class RethinkDbFrontier:
             try:
                 self.logger.debug("inserting/replacing batch of %s pages", len(batch))
                 reql = self.rr.table("pages").insert(batch, conflict="replace")
-                self.logger.trace(
+                self.logger.debug(
                     'running query self.rr.table("pages").insert(%r, '
                     'conflict="replace")',
                     batch,
                 )
                 result = reql.run()
             except Exception as e:
-                self.logger.error(
+                self.logger.exception(
                     "problem inserting/replacing batch of %s pages",
                     len(batch),
-                    exc_info=True,
                 )

         parent_page.outlinks = {}
@@ -497,7 +492,7 @@ class RethinkDbFrontier:
         )

     def reached_limit(self, site, e):
-        self.logger.info("reached_limit site=%s e=%s", site, e)
+        self.logger.info("reached_limit", site=site, e=e)
         assert isinstance(e, brozzler.ReachedLimit)
         if (
             site.reached_limit
@@ -530,7 +525,7 @@ class RethinkDbFrontier:
         )
         pages = list(results)
         if len(pages) > 1:
-            self.logger.warning("more than one seed page for site_id %s ?", site_id)
+            self.logger.warning("more than one seed page?", site_id=site_id)
         if len(pages) < 1:
             return None
         return brozzler.Page(self.rr, pages[0])
@@ -550,8 +545,8 @@ class RethinkDbFrontier:
             [site_id, 0 if brozzled is False else r.maxval, r.maxval, r.maxval],
             index="priority_by_site",
         )
-        self.logger.trace("running query: %r", query)
+        self.logger.debug("running query", query=query)
         results = query.run()
         for result in results:
-            self.logger.trace("yielding result: %r", result)
+            self.logger.debug("yielding result", result=result)
             yield brozzler.Page(self.rr, result)

diff --git a/brozzler/model.py b/brozzler/model.py
index 608847c..271d64e 100644
--- a/brozzler/model.py
+++ b/brozzler/model.py
@@ -25,9 +25,9 @@ import datetime
 import doublethink
 import hashlib
 import json
-import logging
 import os
 import re
+import structlog
 import time
 import urlcanon
 import urllib
@@ -37,6 +37,8 @@ import zlib
 from enum import Enum
 from typing import Optional

+logger = structlog.get_logger()
+

 def load_schema():
     schema_file = os.path.join(os.path.dirname(__file__), "job_schema.yaml")
@@ -84,7 +86,7 @@ def merge(a, b):

 def new_job_file(frontier, job_conf_file):
     """Returns new Job."""
-    logging.info("loading %s", job_conf_file)
+    logger.info("loading", job_conf_file=job_conf_file)
     with open(job_conf_file) as f:
         job_conf = yaml.safe_load(f)
         return new_job(frontier, job_conf)
@@ -120,12 +122,12 @@ def new_job(frontier, job_conf):
     # insert in batches to avoid this error
     # rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:
     for batch in (pages[i : i + 500] for i in range(0, len(pages), 500)):
-        logging.info("inserting batch of %s pages", len(batch))
+        logger.info("inserting batch of %s pages", len(batch))
         result = frontier.rr.table("pages").insert(batch).run()
     for batch in (sites[i : i + 100] for i in range(0, len(sites), 100)):
-        logging.info("inserting batch of %s sites", len(batch))
+        logger.info("inserting batch of %s sites", len(batch))
         result = frontier.rr.table("sites").insert(batch).run()
-    logging.info("job %s fully started", job.id)
+    logger.info("job fully started", job_id=job.id)

     return job

@@ -154,7 +156,7 @@ def new_seed_page(frontier, site):


 def new_site(frontier, site):
-    logging.info("new site %s", site)
+    logger.info("new site", site=site)
     site.id = site.id or str(uuid.uuid4())
     # insert the Page into the database before the Site, to avoid situation
     # where a brozzler worker immediately claims the site, finds no pages
@@ -162,7 +164,7 @@ def new_site(frontier, site):
     try:
         page = new_seed_page(frontier, site)
         page.save()
-        logging.info("queued page %s", page)
+        logger.info("queued page", page=page)
     finally:
         # finally block because we want to insert the Site no matter what
         site.save()
@@ -195,7 +197,7 @@ class ElapsedMixIn(object):


 class Job(doublethink.Document, ElapsedMixIn):
-    logger = logging.getLogger(__module__ + "." + __qualname__)
+    logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
     table = "jobs"

     def populate_defaults(self):
@@ -217,9 +219,9 @@ class Job(doublethink.Document, ElapsedMixIn):
     def finish(self):
         if self.status == "FINISHED" or self.starts_and_stops[-1]["stop"]:
             self.logger.error(
-                "job is already finished status=%s " "starts_and_stops[-1]['stop']=%s",
-                self.status,
-                self.starts_and_stops[-1]["stop"],
+                "job is already finished",
+                status=self.status,
+                stop=self.starts_and_stops[-1]["stop"],
             )
         self.status = "FINISHED"
         self.starts_and_stops[-1]["stop"] = doublethink.utcnow()
@@ -246,7 +248,7 @@ class VideoCaptureOptions(Enum):


 class Site(doublethink.Document, ElapsedMixIn):
-    logger = logging.getLogger(__module__ + "." + __qualname__)
+    logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
     table = "sites"

     def populate_defaults(self):
@@ -304,7 +306,7 @@ class Site(doublethink.Document, ElapsedMixIn):
             if set(rule.keys()) == {"ssurt"}
         )
         if not any(ssurt.startswith(ss) for ss in simple_rule_ssurts):
-            self.logger.info("adding ssurt %s to scope accept rules", ssurt)
+            self.logger.info("adding ssurt to scope accept rules", ssurt=ssurt)
             self.scope["accepts"].append({"ssurt": ssurt})

     def note_seed_redirect(self, url):
@@ -402,7 +404,7 @@ class Site(doublethink.Document, ElapsedMixIn):


 class Page(doublethink.Document):
-    logger = logging.getLogger(__module__ + "." + __qualname__)
+    logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
     table = "pages"

     @staticmethod
diff --git a/brozzler/pywb.py b/brozzler/pywb.py
index 8b61841..993e807 100644
--- a/brozzler/pywb.py
+++ b/brozzler/pywb.py
@@ -19,7 +19,9 @@ limitations under the License.
 """

 import sys
-import logging
+import structlog
+
+logger = structlog.get_logger()

 try:
     import pywb.apps.cli
@@ -30,7 +32,7 @@ try:
     import pywb.framework.basehandlers
     import pywb.rewrite.wburl
 except ImportError as e:
-    logging.critical(
+    logger.critical(
         '%s: %s\n\nYou might need to run "pip install '
         'brozzler[easy]".\nSee README.rst for more information.',
         type(e).__name__,
@@ -111,7 +113,7 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
         )
         if cdx_query.limit:
             reql = reql.limit(cdx_query.limit)
-        logging.debug("rethinkdb query: %s", reql)
+        logger.debug("rethinkdb query", query=reql)
         results = reql.run()
         return results

diff --git a/brozzler/robots.py b/brozzler/robots.py
index 744c996..e20a9c7 100644
--- a/brozzler/robots.py
+++ b/brozzler/robots.py
@@ -23,12 +23,12 @@ limitations under the License.
 """

 import json
-import logging
 import brozzler
 import reppy
 import reppy.cache
 import reppy.parser
 import requests
+import structlog

 __all__ = ["is_permitted_by_robots"]

@@ -119,10 +119,9 @@ def is_permitted_by_robots(site, url, proxy=None):
             # reppy has wrapped an exception that we want to bubble up
             raise brozzler.ProxyError(e)
         else:
-            logging.warning(
-                "returning true (permitted) after problem fetching "
-                "robots.txt for %r: %r",
-                url,
-                e,
+            structlog.get_logger().warning(
+                "returning true (permitted) after problem fetching robots.txt",
+                url=url,
+                exception=e,
             )
             return True
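Aside: robots.py passes the caught exception as a regular field (`exception=e`), which records only its repr, while most other call sites in this diff use `exc_info=True` or `.exception()` to capture a full traceback. Both are valid structlog usage; the difference is what ends up in the event:

```python
import structlog

logger = structlog.get_logger(logger_name="brozzler.robots.demo")

try:
    raise ConnectionError("robots.txt fetch failed")
except Exception as e:
    # style used in robots.py: the exception becomes a plain field (repr only)
    logger.warning("problem fetching robots.txt", url="https://example.com/", exception=e)
    # style used elsewhere in the diff: full traceback attached to the event
    logger.warning("problem fetching robots.txt", url="https://example.com/", exc_info=True)
```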
diff --git a/brozzler/worker.py b/brozzler/worker.py
index 6d44956..5d4b03e 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -18,7 +18,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """

-import logging
 import brozzler
 import brozzler.browser
 from brozzler.model import VideoCaptureOptions
@@ -32,6 +31,7 @@ import io
 import socket
 import random
 import requests
+import structlog
 import urllib3
 from urllib3.exceptions import TimeoutError, ProxyError
 import doublethink
@@ -46,7 +46,7 @@ r = rdb.RethinkDB()


 class BrozzlerWorker:
-    logger = logging.getLogger(__module__ + "." + __qualname__)
+    logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)

     # 3⅓ min heartbeat interval => 10 min ttl
     # This is kind of a long time, because `frontier.claim_sites()`, which runs
@@ -125,7 +125,7 @@ class BrozzlerWorker:
                 self._metrics_port, self._registry_url, self._env
             )
         else:
-            logging.warning(
+            self.logger.warning(
                 "not starting prometheus scrape endpoint: metrics_port is undefined"
             )
@@ -173,9 +173,9 @@ class BrozzlerWorker:
                 site.proxy = "%s:%s" % (svc["host"], svc["port"])
                 site.save()
                 self.logger.info(
-                    "chose warcprox instance %r from service registry for %r",
-                    site.proxy,
-                    site,
+                    "chose warcprox instance from service registry",
+                    instance=site.proxy,
+                    site=site,
                 )
                 return site.proxy
         return None
@@ -189,7 +189,7 @@ class BrozzlerWorker:
                 self._proxy_is_warcprox = status["role"] == "warcprox"
             except Exception as e:
                 self._proxy_is_warcprox = False
-            logging.info(
+            self.logger.info(
                 "%s %s warcprox",
                 self._proxy,
                 "IS" if self._proxy_is_warcprox else "IS NOT",
@@ -227,18 +227,18 @@ class BrozzlerWorker:
             with urllib.request.urlopen(request, timeout=900) as response:
                 if response.getcode() != 204:
                     self.logger.warning(
-                        'got "%s %s" response on warcprox '
+                        "got unexpected response on warcprox "
                         "WARCPROX_WRITE_RECORD request (expected 204)",
-                        response.getcode(),
-                        response.reason,
+                        code=response.getcode(),
+                        reason=response.reason,
                     )
                 return request, response
         except urllib.error.HTTPError as e:
             self.logger.warning(
-                'got "%s %s" response on warcprox '
+                "got unexpected response on warcprox "
                 "WARCPROX_WRITE_RECORD request (expected 204)",
-                e.getcode(),
-                e.info(),
+                code=e.getcode(),
+                reason=e.info(),
             )
             return request, None
         except urllib.error.URLError as e:
@@ -271,26 +271,27 @@ class BrozzlerWorker:
         on_request=None,
         enable_youtube_dl=True,
     ):
-        self.logger.info("brozzling {}".format(page))
+        page_logger = self.logger.bind(page=page)
+        page_logger.info("brozzling")
         outlinks = set()
         page_headers = self._get_page_headers(site, page)

         if not self._needs_browsing(page_headers):
-            self.logger.info("needs fetch: %s", page)
+            page_logger.info("needs fetch")
             if site.pdfs_only and not self._is_pdf(page_headers):
                 self.logger.info("skipping non-PDF content: PDFs only option enabled")
             elif site.video_capture in [
                 VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
                 VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value,
             ] and self._is_video_type(page_headers):
-                self.logger.info(
+                page_logger.info(
                     "skipping video content: video MIME type capture disabled for site"
                 )
             else:
                 self._fetch_url(site, page=page)
         else:
-            self.logger.info("needs browsing: %s", page)
+            page_logger.info("needs browsing")
             try:
                 browser_outlinks = self._browse_page(
                     browser, site, page, on_screenshot, on_request
@@ -300,7 +301,7 @@ class BrozzlerWorker:
                 if status_code in [502, 504]:
                     raise brozzler.PageConnectionError()
             except brozzler.PageInterstitialShown:
-                self.logger.info("page interstitial shown (http auth): %s", page)
+                page_logger.info("page interstitial shown (http auth)")

             if enable_youtube_dl and ydl.should_ytdlp(site, page, status_code):
                 try:
@@ -316,10 +317,7 @@ class BrozzlerWorker:
                 except brozzler.ProxyError:
                     raise
                 except brozzler.VideoExtractorError as e:
-                    logging.error(
-                        "error extracting video info: %s",
-                        e,
-                    )
+                    self.logger.exception("error extracting video info")
                 except Exception as e:
                     if (
                         hasattr(e, "exc_info")
@@ -328,26 +326,25 @@ class BrozzlerWorker:
                         and e.exc_info[1].code == 430
                     ):
                         self.logger.info(
-                            "youtube-dl got %s %s processing %s",
-                            e.exc_info[1].code,
-                            e.exc_info[1].msg,
-                            page.url,
+                            "youtube-dl encountered an error",
+                            code=e.exc_info[1].code,
+                            message=e.exc_info[1].msg,
+                            url=page.url,
                         )
                     else:
-                        self.logger.error(
-                            "youtube_dl raised exception on %s", page, exc_info=True
-                        )
+                        self.logger.exception("youtube_dl raised exception", page=page)
         return outlinks

     @metrics.brozzler_header_processing_duration_seconds.time()
     @metrics.brozzler_in_progress_headers.track_inprogress()
     def _get_page_headers(self, site, page):
+        url_logger = self.logger.bind(url=page.url)
         # bypassing warcprox, requests' stream=True defers downloading the body of the response
         # see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
         try:
             user_agent = site.get("user_agent")
             headers = {"User-Agent": user_agent} if user_agent else {}
-            self.logger.info("getting page headers for %s", page.url)
+            url_logger.info("getting page headers")
             with requests.get(
                 page.url,
                 stream=True,
@@ -357,11 +354,9 @@ class BrozzlerWorker:
             ) as r:
                 return r.headers
         except requests.exceptions.Timeout as e:
-            self.logger.warning(
-                "Timed out trying to get headers for %s: %s", page.url, e
-            )
+            url_logger.warning("Timed out trying to get headers", exc_info=True)
         except requests.exceptions.RequestException as e:
-            self.logger.warning("Failed to get headers for %s: %s", page.url, e)
+            url_logger.warning("Failed to get headers", exc_info=True)
         return {}

     def _needs_browsing(self, page_headers) -> bool:
@@ -402,10 +397,9 @@ class BrozzlerWorker:
                 on_screenshot(screenshot_jpeg)
             if self._using_warcprox(site):
                 self.logger.info(
-                    "sending WARCPROX_WRITE_RECORD request to %s with "
-                    "screenshot for %s",
-                    self._proxy_for(site),
-                    page,
+                    "sending WARCPROX_WRITE_RECORD request",
+                    proxy=self._proxy_for(site),
+                    screenshot_for_page=page,
                 )
                 thumbnail_jpeg = self.thumb_jpeg(screenshot_jpeg)
                 self._warcprox_write_record(
@@ -452,7 +446,7 @@ class BrozzlerWorker:
                     video["content-length"] = int(response_headers["content-length"])
                 if "content-range" in response_headers:
                     video["content-range"] = response_headers["content-range"]
-                logging.debug("embedded video %s", video)
+                self.logger.debug("embedded video", video=video)
                 if not "videos" in page:
                     page.videos = []
                 page.videos.append(video)
@@ -461,11 +455,11 @@ class BrozzlerWorker:

         def _on_service_worker_version_updated(chrome_msg):
             # https://github.com/internetarchive/brozzler/issues/140
-            self.logger.trace("%r", chrome_msg)
+            self.logger.debug("service worker updated", chrome_msg=chrome_msg)
             if chrome_msg.get("params", {}).get("versions"):
                 url = chrome_msg.get("params", {}).get("versions")[0].get("scriptURL")
                 if url and url.startswith("http") and url not in sw_fetched:
-                    self.logger.info("fetching service worker script %s", url)
+                    self.logger.info("fetching service worker script", url=url)
                     self._fetch_url(site, url=url)
                     sw_fetched.add(url)

@@ -520,7 +514,7 @@ class BrozzlerWorker:
         headers = {"User-Agent": user_agent} if user_agent else {}
         headers.update(site.extra_headers(page))

-        self.logger.info("fetching url %s", url)
+        self.logger.info("fetching url", url=url)
         try:
             # response is ignored
             http.request(
@@ -530,17 +524,18 @@ class BrozzlerWorker:
                 timeout=self.FETCH_URL_TIMEOUT,
                 retries=False,
             )
-            self.logger.info("Completed fetching url %s", url)
+            self.logger.info("Completed fetching url", url=url)
         except TimeoutError as e:
-            self.logger.warning("Timed out fetching %s", url)
+            self.logger.warning("Timed out fetching url", url=url)
             raise brozzler.PageConnectionError() from e
         except ProxyError as e:
             raise brozzler.ProxyError("proxy error fetching %s" % url) from e
         except urllib3.exceptions.RequestError as e:
-            self.logger.warning("Failed to fetch url %s: %s", url, e)
+            self.logger.warning("Failed to fetch url", url=url, exc_info=True)
             raise brozzler.PageConnectionError() from e

     def brozzle_site(self, browser, site):
+        site_logger = self.logger.bind(site=site)
         try:
             site.last_claimed_by = "%s:%s" % (socket.gethostname(), browser.chrome.port)
             site.save()
@@ -550,9 +545,7 @@ class BrozzlerWorker:
             self._frontier.honor_stop_request(site)
             # _proxy_for() call in log statement can raise brozzler.ProxyError
             # which is why we honor time limit and stop request first☝🏻
-            self.logger.info(
-                "brozzling site (proxy=%r) %s", self._proxy_for(site), site
-            )
+            site_logger.info("brozzling site", proxy=self._proxy_for(site))
             while time.time() - start < self.SITE_SESSION_MINUTES * 60:
                 site.refresh()
                 self._frontier.enforce_time_limit(site)
@@ -564,7 +557,7 @@ class BrozzlerWorker:
                 if page.needs_robots_check and not brozzler.is_permitted_by_robots(
                     site, page.url, self._proxy_for(site)
                 ):
-                    logging.warning("page %s is blocked by robots.txt", page.url)
+                    self.logger.warning("page is blocked by robots.txt", url=page.url)
                     page.blocked_by_robots = True
                     self._frontier.completed_page(site, page)
                 else:
@@ -580,7 +573,7 @@ class BrozzlerWorker:
         except brozzler.ShutdownRequested:
             self.logger.info("shutdown requested")
         except brozzler.NothingToClaim:
-            self.logger.info("no pages left for site %s", site)
+            site_logger.info("no pages left for site")
         except brozzler.ReachedLimit as e:
             self._frontier.reached_limit(site, e)
         except brozzler.ReachedTimeLimit as e:
@@ -591,29 +584,24 @@ class BrozzlerWorker:
             # self.logger.info("{} shut down".format(browser))
         except brozzler.ProxyError as e:
             if self._warcprox_auto:
-                logging.error(
-                    "proxy error (site.proxy=%s), will try to choose a "
-                    "healthy instance next time site is brozzled: %s",
-                    site.proxy,
-                    e,
+                self.logger.exception(
+                    "proxy error, will try to choose a "
+                    "healthy instance next time site is brozzled",
+                    site_proxy=site.proxy,
                 )
                 site.proxy = None
             else:
                 # using brozzler-worker --proxy, nothing to do but try the
                 # same proxy again next time
-                logging.error("proxy error (self._proxy=%r)", self._proxy, exc_info=1)
+                self.logger.exception("proxy error", self_proxy=self._proxy)
         except (brozzler.PageConnectionError, Exception) as e:
             if isinstance(e, brozzler.PageConnectionError):
-                self.logger.error(
-                    "Page status code possibly indicates connection failure between host and warcprox: site=%r page=%r",
-                    site,
-                    page,
-                    exc_info=True,
+                site_logger.exception(
+                    "Page status code possibly indicates connection failure between host and warcprox",
+                    page=page,
                 )
             else:
-                self.logger.error(
-                    "unexpected exception site=%r page=%r", site, page, exc_info=True
-                )
+                site_logger.exception("unexpected exception", page=page)
             if page:
                 # Calculate backoff in seconds based on number of failed attempts.
                 # Minimum of 60, max of 135 giving delays of 60, 90, 135, 135...
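Aside: the backoff expression itself sits outside this excerpt; a capped geometric backoff such as `min(60 * 1.5**n, 135)` reproduces the 60, 90, 135, 135… sequence the comment quotes (an assumption, since the diff does not show the actual formula):

```python
# hypothetical reconstruction of the elided backoff calculation
def retry_delay(failed_attempts: int) -> float:
    return min(60 * 1.5**failed_attempts, 135)

assert [retry_delay(n) for n in range(4)] == [60, 90, 135, 135]
```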
@@ -624,10 +612,10 @@ class BrozzlerWorker:
                 page.failed_attempts = (page.failed_attempts or 0) + 1
                 if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES:
                     self.logger.info(
-                        'marking page "completed" after %s unexpected '
-                        "exceptions attempting to brozzle %s",
-                        page.failed_attempts,
-                        page,
+                        'marking page "completed" after several unexpected '
+                        "exceptions attempting to brozzle",
+                        failed_attempts=page.failed_attempts,
+                        page=page,
                     )
                     self._frontier.completed_page(site, page)
                     page = None
@@ -665,13 +653,11 @@ class BrozzlerWorker:
 
         try:
             self.status_info = self._service_registry.heartbeat(status_info)
-            self.logger.trace("status in service registry: %s", self.status_info)
+            self.logger.debug("status in service registry", status=self.status_info)
         except r.ReqlError as e:
-            self.logger.error(
-                "failed to send heartbeat and update service registry "
-                "with info %s: %s",
-                status_info,
-                e,
+            self.logger.exception(
+                "failed to send heartbeat and update service registry",
+                info=status_info,
             )
 
     def _service_heartbeat_if_due(self):
@@ -719,9 +705,7 @@ class BrozzlerWorker:
                     self._browser_pool.release(browsers[i])
 
     def run(self):
-        self.logger.notice(
-            "brozzler %s - brozzler-worker starting", brozzler.__version__
-        )
+        self.logger.warning("brozzler-worker starting", version=brozzler.__version__)
         last_nothing_to_claim = 0
         try:
             while not self._shutdown.is_set():
@@ -730,20 +714,20 @@
                 try:
                     self._start_browsing_some_sites()
                 except brozzler.browser.NoBrowsersAvailable:
-                    logging.trace("all %s browsers are in use", self._max_browsers)
+                    self.logger.debug(
+                        "all browsers are in use", max_browsers=self._max_browsers
+                    )
                 except brozzler.NothingToClaim:
                     last_nothing_to_claim = time.time()
-                    logging.trace(
+                    self.logger.debug(
                         "nothing to claim, all available active sites "
                         "are already claimed by a brozzler worker"
                     )
                 time.sleep(0.5)
 
-            self.logger.notice("shutdown requested")
+            self.logger.warning("shutdown requested")
         except r.ReqlError as e:
-            self.logger.error(
-                "caught rethinkdb exception, will try to proceed", exc_info=True
-            )
+            self.logger.exception("caught rethinkdb exception, will try to proceed")
         except brozzler.ShutdownRequested:
             self.logger.info("shutdown requested")
         except:
@@ -755,12 +739,11 @@
             try:
                 self._service_registry.unregister(self.status_info["id"])
             except:
-                self.logger.error(
-                    "failed to unregister from service registry", exc_info=True
-                )
+                self.logger.exception("failed to unregister from service registry")
 
         self.logger.info(
-            "shutting down %s brozzling threads", len(self._browsing_threads)
+            "shutting down brozzling threads",
+            thread_count=len(self._browsing_threads),
         )
         with self._browsing_threads_lock:
             for th in self._browsing_threads:
@@ -780,6 +763,7 @@
             )
             return
         self._thread = threading.Thread(target=self.run, name="BrozzlerWorker")
+        self.logger = self.logger.bind(thread=self._thread)
         self._thread.start()
 
     def shutdown_now(self):
diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index ead919e..12ac370 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -16,7 +16,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
-import logging
 import yt_dlp
 from yt_dlp.utils import match_filter_func, ExtractorError
 import brozzler
@@ -31,6 +30,7 @@ import datetime
 from . import metrics
 import random
+import structlog
 import threading
 import traceback
 import doublethink
@@ -44,17 +44,20 @@
 YTDLP_WAIT = 10
 YTDLP_MAX_REDIRECTS = 5
 
 
+logger = structlog.get_logger()
+
+
 def should_ytdlp(site, page, page_status):
     # called only after we've passed needs_browsing() check
     if page_status != 200:
-        logging.info("skipping ytdlp: non-200 page status %s", page_status)
+        logger.info("skipping ytdlp: non-200 page status", page_status=page_status)
         return False
     if site.video_capture in [
         VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
         VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value,
     ]:
-        logging.info("skipping ytdlp: site has video capture disabled")
+        logger.info("skipping ytdlp: site has video capture disabled")
         return False
 
     ytdlp_url = page.redirect_url if page.redirect_url else page.url
@@ -104,7 +107,13 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
     """
 
     class _YoutubeDL(yt_dlp.YoutubeDL):
-        logger = logging.getLogger(__module__ + "." + __qualname__)
+        logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
+
+        def __init__(self, url, params=None, auto_init=True):
+            super().__init__(params, auto_init)
+
+            self.url = url
+            self.logger = self.logger.bind(url=url)
 
         def process_ie_result(self, ie_result, download=True, extra_info=None):
             if extra_info is None:
@@ -114,7 +123,9 @@
             if result_type in ("url", "url_transparent"):
                 if "extraction_depth" in extra_info:
                     self.logger.info(
-                        f"Following redirect URL: {ie_result['url']} extraction_depth: {extra_info['extraction_depth']}"
+                        "Following redirect",
+                        redirect_url=ie_result["url"],
+                        extraction_depth=extra_info["extraction_depth"],
                     )
                     extra_info["extraction_depth"] = 1 + extra_info.get(
                         "extraction_depth", 0
@@ -131,8 +142,9 @@
         def add_default_extra_info(self, ie_result, ie, url):
             # hook in some logging
             super().add_default_extra_info(ie_result, ie, url)
+            extract_context = self.logger.bind(extractor=ie.IE_NAME)
             if ie_result.get("_type") == "playlist":
-                self.logger.info("extractor %r found playlist in %s", ie.IE_NAME, url)
+                extract_context.info("found playlist")
                 if ie.IE_NAME in {
                     "youtube:playlist",
                     "youtube:tab",
@@ -147,22 +159,20 @@
                     try:
                         ie_result["entries_no_dl"] = list(ie_result["entries"])
                     except Exception as e:
-                        self.logger.warning(
-                            "failed to unroll ie_result['entries']? for %s, %s; exception %s",
-                            ie.IE_NAME,
-                            url,
-                            e,
+                        extract_context.warning(
+                            "failed to unroll ie_result['entries']",
+                            exc_info=True,
                         )
                         ie_result["entries_no_dl"] = []
                     ie_result["entries"] = []
 
                     self.logger.info(
-                        "not downloading %s media files from this "
+                        "not downloading media files from this "
                         "playlist because we expect to capture them from "
                         "individual watch/track/detail pages",
-                        len(ie_result["entries_no_dl"]),
+                        media_file_count=len(ie_result["entries_no_dl"]),
                     )
             else:
-                self.logger.info("extractor %r found a download in %s", ie.IE_NAME, url)
+                extract_context.info("found a download")
 
         def _push_video_to_warcprox(self, site, info_dict, postprocessor):
             # 220211 update: does yt-dlp supply content-type? no, not as such
@@ -180,7 +190,11 @@
                     mimetype = magic.from_file(info_dict["filepath"], mime=True)
                 except ImportError as e:
                     mimetype = "video/%s" % info_dict["ext"]
-                    self.logger.warning("guessing mimetype %s because %r", mimetype, e)
+                    self.logger.warning(
+                        "guessing mimetype due to error",
+                        mimetype=mimetype,
+                        exc_info=True,
+                    )
 
             # youtube watch page postprocessor is MoveFiles
 
@@ -198,12 +212,11 @@
                 size = os.path.getsize(info_dict["filepath"])
 
                 self.logger.info(
-                    "pushing %r video as %s (%s bytes) to " "warcprox at %s with url %s",
-                    info_dict["format"],
-                    mimetype,
-                    size,
-                    worker._proxy_for(site),
-                    url,
+                    "pushing video to warcprox",
+                    format=info_dict["format"],
+                    mimetype=mimetype,
+                    size=size,
+                    warcprox=worker._proxy_for(site),
                 )
                 with open(info_dict["filepath"], "rb") as f:
                     # include content-length header to avoid chunked
@@ -240,23 +253,23 @@
             ):
                 worker.logger.debug(
                     "heartbeating site.last_claimed to prevent another "
-                    "brozzler-worker claiming this site id=%r",
-                    site.id,
+                    "brozzler-worker claiming this site",
+                    id=site.id,
                 )
                 site.last_claimed = doublethink.utcnow()
                 site.save()
         except:
             worker.logger.debug(
-                "problem heartbeating site.last_claimed site id=%r",
-                site.id,
+                "problem heartbeating site.last_claimed",
+                id=site.id,
                 exc_info=True,
             )
 
     def ydl_postprocess_hook(d):
         if d["status"] == "finished":
-            worker.logger.info("[ydl_postprocess_hook] Finished postprocessing")
             worker.logger.info(
-                "[ydl_postprocess_hook] postprocessor: {}".format(d["postprocessor"])
+                "[ydl_postprocess_hook] Finished postprocessing",
+                postprocessor=d["postprocessor"],
             )
 
             is_youtube_host = isyoutubehost(d["info_dict"]["webpage_url"])
@@ -290,7 +303,7 @@
         # --cache-dir local or..
         # this looked like a problem with nsf-mounted homedir, maybe not a problem for brozzler on focal?
         "cache_dir": "/home/archiveit",
-        "logger": logging.getLogger("yt_dlp"),
+        "logger": logger,
         "verbose": False,
         "quiet": False,
         # recommended to avoid bot detection
@@ -306,17 +319,16 @@
         ytdlp_proxy_for_logs = (
             ydl_opts["proxy"].split("@")[1] if "@" in ydl_opts["proxy"] else "@@@"
         )
-        logging.info("using yt-dlp proxy ... %s", ytdlp_proxy_for_logs)
%s", ytdlp_proxy_for_logs) + logger.info("using yt-dlp proxy ...", proxy=ytdlp_proxy_for_logs) # skip warcprox proxying yt-dlp v.2023.07.06: youtube extractor using ranges # if worker._proxy_for(site): # ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site)) - ydl = _YoutubeDL(ydl_opts) + ydl = _YoutubeDL(ytdlp_url, params=ydl_opts) if site.extra_headers(): ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page))) ydl.pushed_videos = [] - ydl.url = ytdlp_url ydl.is_youtube_host = is_youtube_host return ydl @@ -336,7 +348,7 @@ def _remember_videos(page, pushed_videos=None): "content-type": pushed_video["content-type"], "content-length": pushed_video["content-length"], } - logging.debug("pushed video %s", video) + logger.debug("pushed video", video=video) page.videos.append(video) @@ -345,7 +357,7 @@ def _try_youtube_dl(worker, ydl, site, page): attempt = 0 while attempt < max_attempts: try: - logging.info("trying yt-dlp on %s", ydl.url) + logger.info("trying yt-dlp", url=ydl.url) # should_download_vid = not ydl.is_youtube_host # then # ydl.extract_info(str(urlcanon.whatwg(ydl.url)), download=should_download_vid) @@ -386,15 +398,18 @@ def _try_youtube_dl(worker, ydl, site, page): # and others... attempt += 1 if attempt == max_attempts: - logging.warning( - "Failed after %s attempt(s). Error: %s", max_attempts, e + logger.warning( + "Failed after %s attempt(s)", + max_attempts, + attempts=max_attempts, + exc_info=True, ) raise brozzler.VideoExtractorError( "yt-dlp hit error extracting info for %s" % ydl.url ) else: retry_wait = min(60, YTDLP_WAIT * (1.5 ** (attempt - 1))) - logging.info( + logger.info( "Attempt %s failed. Retrying in %s seconds...", attempt, retry_wait, @@ -405,15 +420,14 @@ def _try_youtube_dl(worker, ydl, site, page): "yt-dlp hit unknown error extracting info for %s" % ydl.url ) - logging.info("ytdlp completed successfully") + logger.info("ytdlp completed successfully") _remember_videos(page, ydl.pushed_videos) if worker._using_warcprox(site): info_json = json.dumps(ie_result, sort_keys=True, indent=4) - logging.info( - "sending WARCPROX_WRITE_RECORD request to warcprox " - "with yt-dlp json for %s", - ydl.url, + logger.info( + "sending WARCPROX_WRITE_RECORD request to warcprox " "with yt-dlp json", + url=ydl.url, ) worker._warcprox_write_record( warcprox_address=worker._proxy_for(site), @@ -444,7 +458,7 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints): with tempfile.TemporaryDirectory( prefix="brzl-ydl-", dir=worker._ytdlp_tmpdir ) as tempdir: - logging.info("tempdir for yt-dlp: %s", tempdir) + logger.info("tempdir for yt-dlp", tempdir=tempdir) ydl = _build_youtube_dl(worker, tempdir, site, page, ytdlp_proxy_endpoints) ie_result = _try_youtube_dl(worker, ydl, site, page) outlinks = set() diff --git a/setup.py b/setup.py index a0fce8f..b0da7c5 100644 --- a/setup.py +++ b/setup.py @@ -76,6 +76,7 @@ setuptools.setup( "cryptography>=2.3", "python-magic>=0.4.15", "prometheus-client>=0.20.0", + "structlog>=25.1.0", ], extras_require={ "yt-dlp": ["yt-dlp>=2024.7.25"], diff --git a/tests/test_cluster.py b/tests/test_cluster.py index 51d78e3..979898f 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -31,11 +31,14 @@ import datetime import requests import subprocess import http.server -import logging +import structlog import sys import warcprox +logger = structlog.get_logger() + + # https://stackoverflow.com/questions/166506/finding-local-ip-addresses-using-pythons-stdlib def _local_address(): import socket @@ -70,11 +73,11 @@ def 
 def httpd(request):
     class RequestHandler(http.server.SimpleHTTPRequestHandler):
         def do_POST(self):
-            logging.info("\n%s\n%s", self.requestline, self.headers)
+            logger.info("request", requestline=self.requestline, headers=str(self.headers))
             self.do_GET()
 
         def do_GET(self):
-            logging.info("\n%s\n%s", self.requestline, self.headers)
+            logger.info("request", requestline=self.requestline, headers=str(self.headers))
             if self.path == "/site5/redirect/":
                 self.send_response(303, "See other")
                 self.send_header("Connection", "close")
@@ -270,7 +273,7 @@ def test_proxy_non_warcprox(httpd):
         def do_HEAD(self):
             if not hasattr(self.server, "requests"):
                 self.server.requests = []
-            logging.info("%s %s", self.command, self.path)
+            logger.info("request", command=self.command, path=self.path)
             self.server.requests.append("%s %s" % (self.command, self.path))
             response = urllib.request.urlopen(self.path)
             self.wfile.write(
@@ -292,7 +295,7 @@ def test_proxy_non_warcprox(httpd):
         def do_WARCPROX_WRITE_RECORD(self):
             if not hasattr(self.server, "requests"):
                 self.server.requests = []
-            logging.info("%s %s", self.command, self.path)
+            logger.info("request", command=self.command, path=self.path)
             self.send_error(400)
 
     proxy = http.server.HTTPServer(("localhost", 0), DumbProxyRequestHandler)
@@ -826,7 +829,7 @@ def test_warcprox_outage_resiliency(httpd):
     try:
         stop_service("warcprox")
     except Exception as e:
-        logging.warning("problem stopping warcprox service: %s", e)
+        logger.warning("problem stopping warcprox service", exc_info=True)
 
     # queue the site for brozzling
     brozzler.new_site(frontier, site)
diff --git a/tests/test_units.py b/tests/test_units.py
index b7a785f..27a89a7 100644
--- a/tests/test_units.py
+++ b/tests/test_units.py
@@ -24,7 +24,6 @@ import os
 import brozzler
 import brozzler.chrome
 import brozzler.ydl
-import logging
 import yaml
 import datetime
 import requests
@@ -36,15 +35,6 @@ import sys
 import threading
 from unittest import mock
 
-logging.basicConfig(
-    stream=sys.stderr,
-    level=logging.INFO,
-    format=(
-        "%(asctime)s %(process)d %(levelname)s %(threadName)s "
-        "%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s"
-    ),
-)
-
 
 @pytest.fixture(scope="module")
 def httpd(request):
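
Note: this diff swaps every logging.getLogger() call for structlog.get_logger() but, in the hunks shown here, never configures structlog, and it deletes the logging.basicConfig() call from tests/test_units.py. The call sites above rely on a few structlog features: bind() for per-site/per-url context, exc_info=True for tracebacks, and get_logger() keyword arguments (such as logger_name=) that surface as event fields. A minimal configuration sketch that would support those conventions follows; where it lives (e.g. brozzler/cli.py) and the exact processor choices are assumptions, not part of this diff:

    # hypothetical configuration module -- not part of this diff
    import logging
    import sys

    import structlog

    structlog.configure(
        processors=[
            structlog.contextvars.merge_contextvars,
            structlog.processors.add_log_level,           # adds a "level" field
            structlog.processors.TimeStamper(fmt="iso"),  # adds a "timestamp" field
            # restores the filename/lineno detail lost with logging.basicConfig()
            structlog.processors.CallsiteParameterAdder(
                [
                    structlog.processors.CallsiteParameter.FILENAME,
                    structlog.processors.CallsiteParameter.LINENO,
                ]
            ),
            # ConsoleRenderer formats exc_info=True tracebacks itself
            structlog.dev.ConsoleRenderer(),
        ],
        wrapper_class=structlog.make_filtering_bound_logger(logging.INFO),
        logger_factory=structlog.PrintLoggerFactory(sys.stderr),
        cache_logger_on_first_use=True,
    )

Under a configuration like this, the logger_name= keyword passed to structlog.get_logger() is simply bound as an initial event field; it is not a stdlib logger name.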
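
Under that assumed configuration, the bind()-based conventions used throughout the diff render roughly as follows (all names and values here are illustrative stand-ins, not real brozzler objects):

    import structlog

    log = structlog.get_logger(logger_name="brozzler.worker.BrozzlerWorker")
    site_logger = log.bind(site="site-001")  # like brozzle_site()'s site_logger

    site_logger.info("brozzling site", proxy="localhost:8000")
    # approximate output:
    #   2025-01-01T00:00:00Z [info] brozzling site
    #       logger_name=brozzler.worker.BrozzlerWorker site=site-001 proxy=localhost:8000

    try:
        raise OSError("connection refused")
    except OSError:
        # exc_info=True replaces the old "%s" interpolation of the exception
        # object; the configured renderer appends the traceback to the event
        site_logger.warning("problem stopping warcprox service", exc_info=True)

Recent structlog versions do support %-interpolation of positional arguments on the default filtering bound logger, but positional and key-value styles mix poorly, which is why key-value events are the convention throughout this diff.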
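
One behavior worth noting from the retry logic in _try_youtube_dl() above: with YTDLP_WAIT = 10 and retry_wait = min(60, YTDLP_WAIT * (1.5 ** (attempt - 1))), the waits grow geometrically and cap at 60 seconds:

    # attempt:   1     2     3      4      5       6+
    # wait (s): 10.0  15.0  22.5  33.75  50.625  60 (capped)
    YTDLP_WAIT = 10
    for attempt in range(1, 8):
        print(attempt, min(60, YTDLP_WAIT * (1.5 ** (attempt - 1))))

This is distinct from the page-level backoff in brozzle_site(), which its own comment describes as delays of 60, 90, 135 capped at 135 seconds.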