mirror of
https://github.com/internetarchive/brozzler.git
synced 2026-01-05 02:55:34 -05:00
behaviors: allow extracting outlinks
Currently, our JavaScript outlink extraction happens purely via our non-configurable extract-outlinks.js script. However, many sites behave unpredictably and may need special handling, so it would be great to be able to configure outlink extraction on a per-site basis. We already have a per-site mechanism for interacting with sites: the behavior system. If we expand it to also provide outlinks, we give ourselves a much more flexible way to handle complex or special-case websites. This commit extends the behavior system so that a behavior can now return a JavaScript object with information about the site. That object should contain at least the "finished" key, a boolean that works like the simple boolean returned by older versions. The object can additionally contain an "outlinks" key which, if present, should be an array of links for brozzler to handle as outlinks. I've retained backwards compatibility by checking whether the returned value is a plain boolean and, if so, handling it as we did previously.
This commit is contained in:
parent
874163beec
commit
b6078ceee7
1 changed file with 23 additions and 9 deletions
|
|
@ -629,11 +629,14 @@ class Browser:
|
|||
):
|
||||
run_behaviors = False
|
||||
|
||||
behavior_outlinks: frozenset[str] = frozenset()
|
||||
if run_behaviors and behavior_timeout > 0:
|
||||
behavior_script = brozzler.behavior_script(
|
||||
page_url, behavior_parameters, behaviors_dir=behaviors_dir
|
||||
)
|
||||
self.run_behavior(behavior_script, timeout=behavior_timeout)
|
||||
behavior_outlinks = self.run_behavior(
|
||||
behavior_script, timeout=behavior_timeout
|
||||
)
|
||||
final_page_url = self.url()
|
||||
if on_screenshot:
|
||||
if simpler404:
|
||||
|
|
@ -651,7 +654,7 @@ class Browser:
|
|||
outlinks = self.extract_outlinks(timeout=extract_outlinks_timeout)
|
||||
if run_behaviors and not skip_visit_hashtags:
|
||||
self.visit_hashtags(final_page_url, hashtags, outlinks)
|
||||
return final_page_url, outlinks
|
||||
return final_page_url, outlinks.union(behavior_outlinks)
|
||||
except brozzler.ReachedLimit:
|
||||
# websock_thread has stashed the ReachedLimit exception with
|
||||
# more information, raise that one
|
||||
|
|
@ -869,7 +872,7 @@ class Browser:
|
|||
message = self.websock_thread.pop_result(msg_id)
|
||||
return message["result"]["result"]["value"]
|
||||
|
||||
def run_behavior(self, behavior_script, timeout=900):
|
||||
def run_behavior(self, behavior_script, timeout=900) -> frozenset[str]:
|
||||
self.send_to_chrome(
|
||||
method="Runtime.evaluate",
|
||||
suppress_logging=True,
|
||||
|
|
@ -882,7 +885,7 @@ class Browser:
|
|||
elapsed = time.time() - start
|
||||
if elapsed > timeout:
|
||||
self.logger.info("behavior reached hard timeout", elapsed=elapsed)
|
||||
return
|
||||
return frozenset()
|
||||
|
||||
brozzler.sleep(check_interval)
|
||||
|
||||
|
|
@ -890,7 +893,11 @@ class Browser:
|
|||
msg_id = self.send_to_chrome(
|
||||
method="Runtime.evaluate",
|
||||
suppress_logging=True,
|
||||
params={"expression": "umbraBehaviorFinished()"},
|
||||
params={
|
||||
"expression": "umbraBehaviorFinished()",
|
||||
# returnByValue ensures we can return more complicated types like dicts
|
||||
"returnByValue": True,
|
||||
},
|
||||
)
|
||||
try:
|
||||
self._wait_for(
|
||||
|
|
@ -905,11 +912,18 @@ class Browser:
|
|||
"wasThrown" in msg["result"] and msg["result"]["wasThrown"]
|
||||
)
|
||||
and "result" in msg["result"]
|
||||
and isinstance(msg["result"]["result"]["value"], bool)
|
||||
and msg["result"]["result"]["value"]
|
||||
):
|
||||
self.logger.info("behavior decided it has finished")
|
||||
return
|
||||
if isinstance(msg["result"]["result"]["value"], bool):
|
||||
if msg["result"]["result"]["value"]:
|
||||
self.logger.info("behavior decided it has finished")
|
||||
return frozenset()
|
||||
# new-style response dict that has more than just a finished bool
|
||||
elif isinstance(msg["result"]["result"]["value"], dict):
|
||||
response = msg["result"]["result"]["value"]
|
||||
if response["finished"]:
|
||||
self.logger.info("behavior decided it has finished")
|
||||
outlinks = frozenset(response.get("outlinks", []))
|
||||
return outlinks
|
||||
except BrowsingTimeout:
|
||||
pass
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue