browser: convert to structlog

This commit is contained in:
Misty De Méo 2025-02-18 12:44:22 -08:00
parent a7e915b35f
commit 97f225d54c

View file

@ -31,6 +31,7 @@ import base64
from ipaddress import AddressValueError from ipaddress import AddressValueError
from brozzler.chrome import Chrome from brozzler.chrome import Chrome
import socket import socket
import structlog
import urlcanon import urlcanon
@ -52,7 +53,7 @@ class BrowserPool:
debugging protocol. debugging protocol.
""" """
logger = logging.getLogger(__module__ + "." + __qualname__) logger = structlog.get_logger(__module__ + "." + __qualname__)
def __init__(self, size=3, **kwargs): def __init__(self, size=3, **kwargs):
""" """
@ -143,7 +144,7 @@ class BrowserPool:
class WebsockReceiverThread(threading.Thread): class WebsockReceiverThread(threading.Thread):
logger = logging.getLogger(__module__ + "." + __qualname__) logger = structlog.get_logger(__module__ + "." + __qualname__)
def __init__(self, websock, name=None, daemon=True): def __init__(self, websock, name=None, daemon=True):
super().__init__(name=name, daemon=daemon) super().__init__(name=name, daemon=daemon)
@ -193,7 +194,7 @@ class WebsockReceiverThread(threading.Thread):
): ):
self.logger.error("websocket closed, did chrome die?") self.logger.error("websocket closed, did chrome die?")
else: else:
self.logger.error("exception from websocket receiver thread", exc_info=1) self.logger.exception("exception from websocket receiver thread")
brozzler.thread_raise(self.calling_thread, BrowsingException) brozzler.thread_raise(self.calling_thread, BrowsingException)
def run(self): def run(self):
@ -213,10 +214,9 @@ class WebsockReceiverThread(threading.Thread):
try: try:
self._handle_message(websock, message) self._handle_message(websock, message)
except: except:
self.logger.error( self.logger.exception(
"uncaught exception in _handle_message message=%s", "uncaught exception in _handle_message",
message, message=message,
exc_info=True,
) )
def _network_response_received(self, message): def _network_response_received(self, message):
@ -231,7 +231,7 @@ class WebsockReceiverThread(threading.Thread):
] ]
) )
self.reached_limit = brozzler.ReachedLimit(warcprox_meta=warcprox_meta) self.reached_limit = brozzler.ReachedLimit(warcprox_meta=warcprox_meta)
self.logger.info("reached limit %s", self.reached_limit) self.logger.info("reached limit", limit=self.reached_limit)
brozzler.thread_raise(self.calling_thread, brozzler.ReachedLimit) brozzler.thread_raise(self.calling_thread, brozzler.ReachedLimit)
else: else:
self.logger.info( self.logger.info(
@ -245,7 +245,7 @@ class WebsockReceiverThread(threading.Thread):
self.page_status = status self.page_status = status
def _javascript_dialog_opening(self, message): def _javascript_dialog_opening(self, message):
self.logger.info("javascript dialog opened: %s", message) self.logger.info("javascript dialog opened", message=message)
if message["params"]["type"] == "alert": if message["params"]["type"] == "alert":
accept = True accept = True
else: else:
@ -292,7 +292,7 @@ class WebsockReceiverThread(threading.Thread):
message["params"]["message"]["text"], message["params"]["message"]["text"],
) )
elif message["method"] == "Runtime.exceptionThrown": elif message["method"] == "Runtime.exceptionThrown":
self.logger.debug("uncaught exception: %s", message) self.logger.debug("uncaught exception", exception=message)
elif message["method"] == "Page.javascriptDialogOpening": elif message["method"] == "Page.javascriptDialogOpening":
self._javascript_dialog_opening(message) self._javascript_dialog_opening(message)
elif ( elif (
@ -322,7 +322,7 @@ class Browser:
Manages an instance of Chrome for browsing pages. Manages an instance of Chrome for browsing pages.
""" """
logger = logging.getLogger(__module__ + "." + __qualname__) logger = structlog.get_logger(__module__ + "." + __qualname__)
def __init__(self, **kwargs): def __init__(self, **kwargs):
""" """
@ -365,11 +365,10 @@ class Browser:
msg_id = next(self._command_id) msg_id = next(self._command_id)
kwargs["id"] = msg_id kwargs["id"] = msg_id
msg = json.dumps(kwargs, separators=",:") msg = json.dumps(kwargs, separators=",:")
logging.log( self.logger.debug(
logging.TRACE if suppress_logging else logging.DEBUG, "sending message",
"sending message to %s: %s", websock=self.websock,
self.websock, message=msg,
msg,
) )
self.websock.send(msg) self.websock.send(msg)
return msg_id return msg_id
@ -397,7 +396,7 @@ class Browser:
# Enable Console & Runtime output only when debugging. # Enable Console & Runtime output only when debugging.
# After all, we just print these events with debug(), we don't use # After all, we just print these events with debug(), we don't use
# them in Brozzler logic. # them in Brozzler logic.
if self.logger.isEnabledFor(logging.DEBUG): if self.logger.is_enabled_for(logging.DEBUG):
self.send_to_chrome(method="Console.enable") self.send_to_chrome(method="Console.enable")
self.send_to_chrome(method="Runtime.enable") self.send_to_chrome(method="Runtime.enable")
self.send_to_chrome(method="ServiceWorker.enable") self.send_to_chrome(method="ServiceWorker.enable")
@ -432,8 +431,8 @@ class Browser:
try: try:
self.websock.close() self.websock.close()
except BaseException as e: except BaseException as e:
self.logger.error( self.logger.exception(
"exception closing websocket %s - %s", self.websock, e "exception closing websocket", websocket=self.websock
) )
self.chrome.stop() self.chrome.stop()
@ -460,7 +459,7 @@ class Browser:
self.websock_url = None self.websock_url = None
except: except:
self.logger.error("problem stopping", exc_info=True) self.logger.exception("problem stopping")
def is_running(self): def is_running(self):
return self.websock_url is not None return self.websock_url is not None
@ -566,7 +565,7 @@ class Browser:
# if login redirected us, return to page_url # if login redirected us, return to page_url
if page_url != self.url().split("#")[0]: if page_url != self.url().split("#")[0]:
self.logger.debug( self.logger.debug(
"login navigated away from %s; returning!", page_url "login navigated away; returning!", page_url=page_url
) )
self.navigate_to_page(page_url, timeout=page_timeout) self.navigate_to_page(page_url, timeout=page_timeout)
# If the target page HTTP status is 4xx/5xx, there is no point # If the target page HTTP status is 4xx/5xx, there is no point
@ -608,7 +607,7 @@ class Browser:
# more information, raise that one # more information, raise that one
raise self.websock_thread.reached_limit raise self.websock_thread.reached_limit
except websocket.WebSocketConnectionClosedException as e: except websocket.WebSocketConnectionClosedException as e:
self.logger.error("websocket closed, did chrome die?") self.logger.exception("websocket closed, did chrome die?")
raise BrowsingException(e) raise BrowsingException(e)
finally: finally:
self.is_browsing = False self.is_browsing = False
@ -630,7 +629,7 @@ class Browser:
on_screenshot(jpeg_bytes) on_screenshot(jpeg_bytes)
return return
except BrowsingTimeout as e: except BrowsingTimeout as e:
logging.error("attempt %s/3: %s", i + 1, e) self.logger.exception("attempt %s/3", i + 1)
def visit_hashtags(self, page_url, hashtags, outlinks): def visit_hashtags(self, page_url, hashtags, outlinks):
_hashtags = set(hashtags or []) _hashtags = set(hashtags or [])
@ -644,7 +643,7 @@ class Browser:
# out which hashtags were visited already and skip those # out which hashtags were visited already and skip those
for hashtag in _hashtags: for hashtag in _hashtags:
# navigate_to_hashtag (nothing to wait for so no timeout?) # navigate_to_hashtag (nothing to wait for so no timeout?)
self.logger.debug("navigating to hashtag %s", hashtag) self.logger.debug("navigating to hashtag", hashtag=hashtag)
url = urlcanon.whatwg(page_url) url = urlcanon.whatwg(page_url)
url.hash_sign = b"#" url.hash_sign = b"#"
url.fragment = hashtag[1:].encode("utf-8") url.fragment = hashtag[1:].encode("utf-8")
@ -684,7 +683,7 @@ class Browser:
) )
def navigate_to_page(self, page_url, timeout=300): def navigate_to_page(self, page_url, timeout=300):
self.logger.info("navigating to page %s", page_url) self.logger.info("navigating to page", page_url=page_url)
self.websock_thread.got_page_load_event = None self.websock_thread.got_page_load_event = None
self.websock_thread.page_status = None self.websock_thread.page_status = None
self.send_to_chrome(method="Page.navigate", params={"url": page_url}) self.send_to_chrome(method="Page.navigate", params={"url": page_url})
@ -712,14 +711,14 @@ class Browser:
try: try:
out.append(str(urlcanon.whatwg(link))) out.append(str(urlcanon.whatwg(link)))
except AddressValueError: except AddressValueError:
self.logger.warning("skip invalid outlink: %s", link) self.logger.warning("skip invalid outlink", outlink=link)
return frozenset(out) return frozenset(out)
else: else:
# no links found # no links found
return frozenset() return frozenset()
else: else:
self.logger.error( self.logger.error(
"problem extracting outlinks, result message: %s", message "problem extracting outlinks", message=message
) )
return frozenset() return frozenset()
@ -791,7 +790,7 @@ class Browser:
while True: while True:
elapsed = time.time() - start elapsed = time.time() - start
if elapsed > timeout: if elapsed > timeout:
logging.info("behavior reached hard timeout after %.1fs", elapsed) self.logger.info("behavior reached hard timeout", elapsed=elapsed)
return return
brozzler.sleep(check_interval) brozzler.sleep(check_interval)