behaviors: allow extracting outlinks

Currently, our JavaScript outlink extraction happens purely via our
non-configurable extract-outlinks.js script. However, many sites
behave unpredictably and may need special handling, so it would be
great to be able to configure outlink extraction on a per-site
basis. Our behaviour system already gives us this kind of per-site
control over how we interact with sites; if we expand it to also
provide outlinks, we get a much more flexible way to handle complex
or special-case websites.

This extends the behaviour system so that a behaviour's
umbraBehaviorFinished() call can now return a JavaScript object
with information about the site instead of a bare boolean. That
object should contain at least the "finished" key, a boolean with
the same meaning as the simple boolean returned by older versions.
The object can additionally contain an "outlinks" key which, if
present, should be an array of links for brozzler to handle as
outlinks.

I've retained backwards compatibility by checking to see if the
returned object is a boolean and handling it like we did previously.
This commit is contained in:
Misty De Méo 2025-12-16 11:04:26 -08:00
parent 874163beec
commit b6078ceee7

View file

@ -629,11 +629,14 @@ class Browser:
):
run_behaviors = False
behavior_outlinks: frozenset[str] = frozenset()
if run_behaviors and behavior_timeout > 0:
behavior_script = brozzler.behavior_script(
page_url, behavior_parameters, behaviors_dir=behaviors_dir
)
self.run_behavior(behavior_script, timeout=behavior_timeout)
behavior_outlinks = self.run_behavior(
behavior_script, timeout=behavior_timeout
)
final_page_url = self.url()
if on_screenshot:
if simpler404:
@ -651,7 +654,7 @@ class Browser:
outlinks = self.extract_outlinks(timeout=extract_outlinks_timeout)
if run_behaviors and not skip_visit_hashtags:
self.visit_hashtags(final_page_url, hashtags, outlinks)
return final_page_url, outlinks
return final_page_url, outlinks.union(behavior_outlinks)
except brozzler.ReachedLimit:
# websock_thread has stashed the ReachedLimit exception with
# more information, raise that one
@ -869,7 +872,7 @@ class Browser:
message = self.websock_thread.pop_result(msg_id)
return message["result"]["result"]["value"]
def run_behavior(self, behavior_script, timeout=900):
def run_behavior(self, behavior_script, timeout=900) -> frozenset[str]:
self.send_to_chrome(
method="Runtime.evaluate",
suppress_logging=True,
@ -882,7 +885,7 @@ class Browser:
elapsed = time.time() - start
if elapsed > timeout:
self.logger.info("behavior reached hard timeout", elapsed=elapsed)
return
return frozenset()
brozzler.sleep(check_interval)
@ -890,7 +893,11 @@ class Browser:
msg_id = self.send_to_chrome(
method="Runtime.evaluate",
suppress_logging=True,
params={"expression": "umbraBehaviorFinished()"},
params={
"expression": "umbraBehaviorFinished()",
# returnByValue ensures we can return more complicated types like dicts
"returnByValue": True,
},
)
try:
self._wait_for(
@ -905,11 +912,18 @@ class Browser:
"wasThrown" in msg["result"] and msg["result"]["wasThrown"]
)
and "result" in msg["result"]
and isinstance(msg["result"]["result"]["value"], bool)
and msg["result"]["result"]["value"]
):
self.logger.info("behavior decided it has finished")
return
if isinstance(msg["result"]["result"]["value"], bool):
if msg["result"]["result"]["value"]:
self.logger.info("behavior decided it has finished")
return frozenset()
# new-style response dict that has more than just a finished bool
elif isinstance(msg["result"]["result"]["value"], dict):
response = msg["result"]["result"]["value"]
if response["finished"]:
self.logger.info("behavior decided it has finished")
outlinks = frozenset(response.get("outlinks", []))
return outlinks
except BrowsingTimeout:
pass