mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-07-22 06:29:13 -04:00
browser: convert to structlog
This commit is contained in:
parent
a7e915b35f
commit
97f225d54c
1 changed file with 27 additions and 28 deletions
|
@@ -31,6 +31,7 @@ import base64
|
||||||
from ipaddress import AddressValueError
|
from ipaddress import AddressValueError
|
||||||
from brozzler.chrome import Chrome
|
from brozzler.chrome import Chrome
|
||||||
import socket
|
import socket
|
||||||
|
import structlog
|
||||||
import urlcanon
|
import urlcanon
|
||||||
|
|
||||||
|
|
||||||
|
@@ -52,7 +53,7 @@ class BrowserPool:
|
||||||
debugging protocol.
|
debugging protocol.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = structlog.get_logger(__module__ + "." + __qualname__)
|
||||||
|
|
||||||
def __init__(self, size=3, **kwargs):
|
def __init__(self, size=3, **kwargs):
|
||||||
"""
|
"""
|
||||||
|
@@ -143,7 +144,7 @@ class BrowserPool:
|
||||||
|
|
||||||
|
|
||||||
class WebsockReceiverThread(threading.Thread):
|
class WebsockReceiverThread(threading.Thread):
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = structlog.get_logger(__module__ + "." + __qualname__)
|
||||||
|
|
||||||
def __init__(self, websock, name=None, daemon=True):
|
def __init__(self, websock, name=None, daemon=True):
|
||||||
super().__init__(name=name, daemon=daemon)
|
super().__init__(name=name, daemon=daemon)
|
||||||
|
@@ -193,7 +194,7 @@ class WebsockReceiverThread(threading.Thread):
|
||||||
):
|
):
|
||||||
self.logger.error("websocket closed, did chrome die?")
|
self.logger.error("websocket closed, did chrome die?")
|
||||||
else:
|
else:
|
||||||
self.logger.error("exception from websocket receiver thread", exc_info=1)
|
self.logger.exception("exception from websocket receiver thread")
|
||||||
brozzler.thread_raise(self.calling_thread, BrowsingException)
|
brozzler.thread_raise(self.calling_thread, BrowsingException)
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
|
@@ -213,10 +214,9 @@ class WebsockReceiverThread(threading.Thread):
|
||||||
try:
|
try:
|
||||||
self._handle_message(websock, message)
|
self._handle_message(websock, message)
|
||||||
except:
|
except:
|
||||||
self.logger.error(
|
self.logger.exception(
|
||||||
"uncaught exception in _handle_message message=%s",
|
"uncaught exception in _handle_message",
|
||||||
message,
|
message=message,
|
||||||
exc_info=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def _network_response_received(self, message):
|
def _network_response_received(self, message):
|
||||||
|
@@ -231,7 +231,7 @@ class WebsockReceiverThread(threading.Thread):
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
self.reached_limit = brozzler.ReachedLimit(warcprox_meta=warcprox_meta)
|
self.reached_limit = brozzler.ReachedLimit(warcprox_meta=warcprox_meta)
|
||||||
self.logger.info("reached limit %s", self.reached_limit)
|
self.logger.info("reached limit", limit=self.reached_limit)
|
||||||
brozzler.thread_raise(self.calling_thread, brozzler.ReachedLimit)
|
brozzler.thread_raise(self.calling_thread, brozzler.ReachedLimit)
|
||||||
else:
|
else:
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
|
@@ -245,7 +245,7 @@ class WebsockReceiverThread(threading.Thread):
|
||||||
self.page_status = status
|
self.page_status = status
|
||||||
|
|
||||||
def _javascript_dialog_opening(self, message):
|
def _javascript_dialog_opening(self, message):
|
||||||
self.logger.info("javascript dialog opened: %s", message)
|
self.logger.info("javascript dialog opened", message=message)
|
||||||
if message["params"]["type"] == "alert":
|
if message["params"]["type"] == "alert":
|
||||||
accept = True
|
accept = True
|
||||||
else:
|
else:
|
||||||
|
@@ -292,7 +292,7 @@ class WebsockReceiverThread(threading.Thread):
|
||||||
message["params"]["message"]["text"],
|
message["params"]["message"]["text"],
|
||||||
)
|
)
|
||||||
elif message["method"] == "Runtime.exceptionThrown":
|
elif message["method"] == "Runtime.exceptionThrown":
|
||||||
self.logger.debug("uncaught exception: %s", message)
|
self.logger.debug("uncaught exception", exception=message)
|
||||||
elif message["method"] == "Page.javascriptDialogOpening":
|
elif message["method"] == "Page.javascriptDialogOpening":
|
||||||
self._javascript_dialog_opening(message)
|
self._javascript_dialog_opening(message)
|
||||||
elif (
|
elif (
|
||||||
|
@@ -322,7 +322,7 @@ class Browser:
|
||||||
Manages an instance of Chrome for browsing pages.
|
Manages an instance of Chrome for browsing pages.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = structlog.get_logger(__module__ + "." + __qualname__)
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
"""
|
"""
|
||||||
|
@@ -365,11 +365,10 @@ class Browser:
|
||||||
msg_id = next(self._command_id)
|
msg_id = next(self._command_id)
|
||||||
kwargs["id"] = msg_id
|
kwargs["id"] = msg_id
|
||||||
msg = json.dumps(kwargs, separators=",:")
|
msg = json.dumps(kwargs, separators=",:")
|
||||||
logging.log(
|
self.logger.debug(
|
||||||
logging.TRACE if suppress_logging else logging.DEBUG,
|
"sending message",
|
||||||
"sending message to %s: %s",
|
websock=self.websock,
|
||||||
self.websock,
|
message=msg,
|
||||||
msg,
|
|
||||||
)
|
)
|
||||||
self.websock.send(msg)
|
self.websock.send(msg)
|
||||||
return msg_id
|
return msg_id
|
||||||
|
@@ -397,7 +396,7 @@ class Browser:
|
||||||
# Enable Console & Runtime output only when debugging.
|
# Enable Console & Runtime output only when debugging.
|
||||||
# After all, we just print these events with debug(), we don't use
|
# After all, we just print these events with debug(), we don't use
|
||||||
# them in Brozzler logic.
|
# them in Brozzler logic.
|
||||||
if self.logger.isEnabledFor(logging.DEBUG):
|
if self.logger.is_enabled_for(logging.DEBUG):
|
||||||
self.send_to_chrome(method="Console.enable")
|
self.send_to_chrome(method="Console.enable")
|
||||||
self.send_to_chrome(method="Runtime.enable")
|
self.send_to_chrome(method="Runtime.enable")
|
||||||
self.send_to_chrome(method="ServiceWorker.enable")
|
self.send_to_chrome(method="ServiceWorker.enable")
|
||||||
|
@@ -432,8 +431,8 @@ class Browser:
|
||||||
try:
|
try:
|
||||||
self.websock.close()
|
self.websock.close()
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
self.logger.error(
|
self.logger.exception(
|
||||||
"exception closing websocket %s - %s", self.websock, e
|
"exception closing websocket", websocket=self.websock
|
||||||
)
|
)
|
||||||
|
|
||||||
self.chrome.stop()
|
self.chrome.stop()
|
||||||
|
@@ -460,7 +459,7 @@ class Browser:
|
||||||
|
|
||||||
self.websock_url = None
|
self.websock_url = None
|
||||||
except:
|
except:
|
||||||
self.logger.error("problem stopping", exc_info=True)
|
self.logger.exception("problem stopping")
|
||||||
|
|
||||||
def is_running(self):
|
def is_running(self):
|
||||||
return self.websock_url is not None
|
return self.websock_url is not None
|
||||||
|
@@ -566,7 +565,7 @@ class Browser:
|
||||||
# if login redirected us, return to page_url
|
# if login redirected us, return to page_url
|
||||||
if page_url != self.url().split("#")[0]:
|
if page_url != self.url().split("#")[0]:
|
||||||
self.logger.debug(
|
self.logger.debug(
|
||||||
"login navigated away from %s; returning!", page_url
|
"login navigated away; returning!", page_url=page_url
|
||||||
)
|
)
|
||||||
self.navigate_to_page(page_url, timeout=page_timeout)
|
self.navigate_to_page(page_url, timeout=page_timeout)
|
||||||
# If the target page HTTP status is 4xx/5xx, there is no point
|
# If the target page HTTP status is 4xx/5xx, there is no point
|
||||||
|
@@ -608,7 +607,7 @@ class Browser:
|
||||||
# more information, raise that one
|
# more information, raise that one
|
||||||
raise self.websock_thread.reached_limit
|
raise self.websock_thread.reached_limit
|
||||||
except websocket.WebSocketConnectionClosedException as e:
|
except websocket.WebSocketConnectionClosedException as e:
|
||||||
self.logger.error("websocket closed, did chrome die?")
|
self.logger.exception("websocket closed, did chrome die?")
|
||||||
raise BrowsingException(e)
|
raise BrowsingException(e)
|
||||||
finally:
|
finally:
|
||||||
self.is_browsing = False
|
self.is_browsing = False
|
||||||
|
@@ -630,7 +629,7 @@ class Browser:
|
||||||
on_screenshot(jpeg_bytes)
|
on_screenshot(jpeg_bytes)
|
||||||
return
|
return
|
||||||
except BrowsingTimeout as e:
|
except BrowsingTimeout as e:
|
||||||
logging.error("attempt %s/3: %s", i + 1, e)
|
self.logger.exception("attempt %s/3", i + 1)
|
||||||
|
|
||||||
def visit_hashtags(self, page_url, hashtags, outlinks):
|
def visit_hashtags(self, page_url, hashtags, outlinks):
|
||||||
_hashtags = set(hashtags or [])
|
_hashtags = set(hashtags or [])
|
||||||
|
@@ -644,7 +643,7 @@ class Browser:
|
||||||
# out which hashtags were visited already and skip those
|
# out which hashtags were visited already and skip those
|
||||||
for hashtag in _hashtags:
|
for hashtag in _hashtags:
|
||||||
# navigate_to_hashtag (nothing to wait for so no timeout?)
|
# navigate_to_hashtag (nothing to wait for so no timeout?)
|
||||||
self.logger.debug("navigating to hashtag %s", hashtag)
|
self.logger.debug("navigating to hashtag", hashtag=hashtag)
|
||||||
url = urlcanon.whatwg(page_url)
|
url = urlcanon.whatwg(page_url)
|
||||||
url.hash_sign = b"#"
|
url.hash_sign = b"#"
|
||||||
url.fragment = hashtag[1:].encode("utf-8")
|
url.fragment = hashtag[1:].encode("utf-8")
|
||||||
|
@@ -684,7 +683,7 @@ class Browser:
|
||||||
)
|
)
|
||||||
|
|
||||||
def navigate_to_page(self, page_url, timeout=300):
|
def navigate_to_page(self, page_url, timeout=300):
|
||||||
self.logger.info("navigating to page %s", page_url)
|
self.logger.info("navigating to page", page_url=page_url)
|
||||||
self.websock_thread.got_page_load_event = None
|
self.websock_thread.got_page_load_event = None
|
||||||
self.websock_thread.page_status = None
|
self.websock_thread.page_status = None
|
||||||
self.send_to_chrome(method="Page.navigate", params={"url": page_url})
|
self.send_to_chrome(method="Page.navigate", params={"url": page_url})
|
||||||
|
@@ -712,14 +711,14 @@ class Browser:
|
||||||
try:
|
try:
|
||||||
out.append(str(urlcanon.whatwg(link)))
|
out.append(str(urlcanon.whatwg(link)))
|
||||||
except AddressValueError:
|
except AddressValueError:
|
||||||
self.logger.warning("skip invalid outlink: %s", link)
|
self.logger.warning("skip invalid outlink", outlink=link)
|
||||||
return frozenset(out)
|
return frozenset(out)
|
||||||
else:
|
else:
|
||||||
# no links found
|
# no links found
|
||||||
return frozenset()
|
return frozenset()
|
||||||
else:
|
else:
|
||||||
self.logger.error(
|
self.logger.error(
|
||||||
"problem extracting outlinks, result message: %s", message
|
"problem extracting outlinks", message=message
|
||||||
)
|
)
|
||||||
return frozenset()
|
return frozenset()
|
||||||
|
|
||||||
|
@@ -791,7 +790,7 @@ class Browser:
|
||||||
while True:
|
while True:
|
||||||
elapsed = time.time() - start
|
elapsed = time.time() - start
|
||||||
if elapsed > timeout:
|
if elapsed > timeout:
|
||||||
logging.info("behavior reached hard timeout after %.1fs", elapsed)
|
self.logger.info("behavior reached hard timeout", elapsed=elapsed)
|
||||||
return
|
return
|
||||||
|
|
||||||
brozzler.sleep(check_interval)
|
brozzler.sleep(check_interval)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue