mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
browser: convert to structlog
This commit is contained in:
parent
a7e915b35f
commit
97f225d54c
@ -31,6 +31,7 @@ import base64
|
||||
from ipaddress import AddressValueError
|
||||
from brozzler.chrome import Chrome
|
||||
import socket
|
||||
import structlog
|
||||
import urlcanon
|
||||
|
||||
|
||||
@ -52,7 +53,7 @@ class BrowserPool:
|
||||
debugging protocol.
|
||||
"""
|
||||
|
||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||
logger = structlog.get_logger(__module__ + "." + __qualname__)
|
||||
|
||||
def __init__(self, size=3, **kwargs):
|
||||
"""
|
||||
@ -143,7 +144,7 @@ class BrowserPool:
|
||||
|
||||
|
||||
class WebsockReceiverThread(threading.Thread):
|
||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||
logger = structlog.get_logger(__module__ + "." + __qualname__)
|
||||
|
||||
def __init__(self, websock, name=None, daemon=True):
|
||||
super().__init__(name=name, daemon=daemon)
|
||||
@ -193,7 +194,7 @@ class WebsockReceiverThread(threading.Thread):
|
||||
):
|
||||
self.logger.error("websocket closed, did chrome die?")
|
||||
else:
|
||||
self.logger.error("exception from websocket receiver thread", exc_info=1)
|
||||
self.logger.exception("exception from websocket receiver thread")
|
||||
brozzler.thread_raise(self.calling_thread, BrowsingException)
|
||||
|
||||
def run(self):
|
||||
@ -213,10 +214,9 @@ class WebsockReceiverThread(threading.Thread):
|
||||
try:
|
||||
self._handle_message(websock, message)
|
||||
except:
|
||||
self.logger.error(
|
||||
"uncaught exception in _handle_message message=%s",
|
||||
message,
|
||||
exc_info=True,
|
||||
self.logger.exception(
|
||||
"uncaught exception in _handle_message",
|
||||
message=message,
|
||||
)
|
||||
|
||||
def _network_response_received(self, message):
|
||||
@ -231,7 +231,7 @@ class WebsockReceiverThread(threading.Thread):
|
||||
]
|
||||
)
|
||||
self.reached_limit = brozzler.ReachedLimit(warcprox_meta=warcprox_meta)
|
||||
self.logger.info("reached limit %s", self.reached_limit)
|
||||
self.logger.info("reached limit", limit=self.reached_limit)
|
||||
brozzler.thread_raise(self.calling_thread, brozzler.ReachedLimit)
|
||||
else:
|
||||
self.logger.info(
|
||||
@ -245,7 +245,7 @@ class WebsockReceiverThread(threading.Thread):
|
||||
self.page_status = status
|
||||
|
||||
def _javascript_dialog_opening(self, message):
|
||||
self.logger.info("javascript dialog opened: %s", message)
|
||||
self.logger.info("javascript dialog opened", message=message)
|
||||
if message["params"]["type"] == "alert":
|
||||
accept = True
|
||||
else:
|
||||
@ -292,7 +292,7 @@ class WebsockReceiverThread(threading.Thread):
|
||||
message["params"]["message"]["text"],
|
||||
)
|
||||
elif message["method"] == "Runtime.exceptionThrown":
|
||||
self.logger.debug("uncaught exception: %s", message)
|
||||
self.logger.debug("uncaught exception", exception=message)
|
||||
elif message["method"] == "Page.javascriptDialogOpening":
|
||||
self._javascript_dialog_opening(message)
|
||||
elif (
|
||||
@ -322,7 +322,7 @@ class Browser:
|
||||
Manages an instance of Chrome for browsing pages.
|
||||
"""
|
||||
|
||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||
logger = structlog.get_logger(__module__ + "." + __qualname__)
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
"""
|
||||
@ -365,11 +365,10 @@ class Browser:
|
||||
msg_id = next(self._command_id)
|
||||
kwargs["id"] = msg_id
|
||||
msg = json.dumps(kwargs, separators=",:")
|
||||
logging.log(
|
||||
logging.TRACE if suppress_logging else logging.DEBUG,
|
||||
"sending message to %s: %s",
|
||||
self.websock,
|
||||
msg,
|
||||
self.logger.debug(
|
||||
"sending message",
|
||||
websock=self.websock,
|
||||
message=msg,
|
||||
)
|
||||
self.websock.send(msg)
|
||||
return msg_id
|
||||
@ -397,7 +396,7 @@ class Browser:
|
||||
# Enable Console & Runtime output only when debugging.
|
||||
# After all, we just print these events with debug(), we don't use
|
||||
# them in Brozzler logic.
|
||||
if self.logger.isEnabledFor(logging.DEBUG):
|
||||
if self.logger.is_enabled_for(logging.DEBUG):
|
||||
self.send_to_chrome(method="Console.enable")
|
||||
self.send_to_chrome(method="Runtime.enable")
|
||||
self.send_to_chrome(method="ServiceWorker.enable")
|
||||
@ -432,8 +431,8 @@ class Browser:
|
||||
try:
|
||||
self.websock.close()
|
||||
except BaseException as e:
|
||||
self.logger.error(
|
||||
"exception closing websocket %s - %s", self.websock, e
|
||||
self.logger.exception(
|
||||
"exception closing websocket", websocket=self.websock
|
||||
)
|
||||
|
||||
self.chrome.stop()
|
||||
@ -460,7 +459,7 @@ class Browser:
|
||||
|
||||
self.websock_url = None
|
||||
except:
|
||||
self.logger.error("problem stopping", exc_info=True)
|
||||
self.logger.exception("problem stopping")
|
||||
|
||||
def is_running(self):
|
||||
return self.websock_url is not None
|
||||
@ -566,7 +565,7 @@ class Browser:
|
||||
# if login redirected us, return to page_url
|
||||
if page_url != self.url().split("#")[0]:
|
||||
self.logger.debug(
|
||||
"login navigated away from %s; returning!", page_url
|
||||
"login navigated away; returning!", page_url=page_url
|
||||
)
|
||||
self.navigate_to_page(page_url, timeout=page_timeout)
|
||||
# If the target page HTTP status is 4xx/5xx, there is no point
|
||||
@ -608,7 +607,7 @@ class Browser:
|
||||
# more information, raise that one
|
||||
raise self.websock_thread.reached_limit
|
||||
except websocket.WebSocketConnectionClosedException as e:
|
||||
self.logger.error("websocket closed, did chrome die?")
|
||||
self.logger.exception("websocket closed, did chrome die?")
|
||||
raise BrowsingException(e)
|
||||
finally:
|
||||
self.is_browsing = False
|
||||
@ -630,7 +629,7 @@ class Browser:
|
||||
on_screenshot(jpeg_bytes)
|
||||
return
|
||||
except BrowsingTimeout as e:
|
||||
logging.error("attempt %s/3: %s", i + 1, e)
|
||||
self.logger.exception("attempt %s/3", i + 1)
|
||||
|
||||
def visit_hashtags(self, page_url, hashtags, outlinks):
|
||||
_hashtags = set(hashtags or [])
|
||||
@ -644,7 +643,7 @@ class Browser:
|
||||
# out which hashtags were visited already and skip those
|
||||
for hashtag in _hashtags:
|
||||
# navigate_to_hashtag (nothing to wait for so no timeout?)
|
||||
self.logger.debug("navigating to hashtag %s", hashtag)
|
||||
self.logger.debug("navigating to hashtag", hashtag=hashtag)
|
||||
url = urlcanon.whatwg(page_url)
|
||||
url.hash_sign = b"#"
|
||||
url.fragment = hashtag[1:].encode("utf-8")
|
||||
@ -684,7 +683,7 @@ class Browser:
|
||||
)
|
||||
|
||||
def navigate_to_page(self, page_url, timeout=300):
|
||||
self.logger.info("navigating to page %s", page_url)
|
||||
self.logger.info("navigating to page", page_url=page_url)
|
||||
self.websock_thread.got_page_load_event = None
|
||||
self.websock_thread.page_status = None
|
||||
self.send_to_chrome(method="Page.navigate", params={"url": page_url})
|
||||
@ -712,14 +711,14 @@ class Browser:
|
||||
try:
|
||||
out.append(str(urlcanon.whatwg(link)))
|
||||
except AddressValueError:
|
||||
self.logger.warning("skip invalid outlink: %s", link)
|
||||
self.logger.warning("skip invalid outlink", outlink=link)
|
||||
return frozenset(out)
|
||||
else:
|
||||
# no links found
|
||||
return frozenset()
|
||||
else:
|
||||
self.logger.error(
|
||||
"problem extracting outlinks, result message: %s", message
|
||||
"problem extracting outlinks", message=message
|
||||
)
|
||||
return frozenset()
|
||||
|
||||
@ -791,7 +790,7 @@ class Browser:
|
||||
while True:
|
||||
elapsed = time.time() - start
|
||||
if elapsed > timeout:
|
||||
logging.info("behavior reached hard timeout after %.1fs", elapsed)
|
||||
self.logger.info("behavior reached hard timeout", elapsed=elapsed)
|
||||
return
|
||||
|
||||
brozzler.sleep(check_interval)
|
||||
|
Loading…
x
Reference in New Issue
Block a user