diff --git a/brozzler/browser.py b/brozzler/browser.py index 6b1c464..65765fd 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -629,11 +629,14 @@ class Browser: ): run_behaviors = False + behavior_outlinks: frozenset[str] = frozenset() if run_behaviors and behavior_timeout > 0: behavior_script = brozzler.behavior_script( page_url, behavior_parameters, behaviors_dir=behaviors_dir ) - self.run_behavior(behavior_script, timeout=behavior_timeout) + behavior_outlinks = self.run_behavior( + behavior_script, timeout=behavior_timeout + ) final_page_url = self.url() if on_screenshot: if simpler404: @@ -651,7 +654,7 @@ class Browser: outlinks = self.extract_outlinks(timeout=extract_outlinks_timeout) if run_behaviors and not skip_visit_hashtags: self.visit_hashtags(final_page_url, hashtags, outlinks) - return final_page_url, outlinks + return final_page_url, outlinks.union(behavior_outlinks) except brozzler.ReachedLimit: # websock_thread has stashed the ReachedLimit exception with # more information, raise that one @@ -869,7 +872,7 @@ class Browser: message = self.websock_thread.pop_result(msg_id) return message["result"]["result"]["value"] - def run_behavior(self, behavior_script, timeout=900): + def run_behavior(self, behavior_script, timeout=900) -> frozenset[str]: self.send_to_chrome( method="Runtime.evaluate", suppress_logging=True, @@ -882,7 +885,7 @@ class Browser: elapsed = time.time() - start if elapsed > timeout: self.logger.info("behavior reached hard timeout", elapsed=elapsed) - return + return frozenset() brozzler.sleep(check_interval) @@ -890,7 +893,11 @@ class Browser: msg_id = self.send_to_chrome( method="Runtime.evaluate", suppress_logging=True, - params={"expression": "umbraBehaviorFinished()"}, + params={ + "expression": "umbraBehaviorFinished()", + # returnByValue ensures we can return more complicated types like dicts + "returnByValue": True, + }, ) try: self._wait_for( @@ -905,11 +912,18 @@ class Browser: "wasThrown" in msg["result"] and msg["result"]["wasThrown"] ) and "result" in msg["result"] - and isinstance(msg["result"]["result"]["value"], bool) - and msg["result"]["result"]["value"] ): - self.logger.info("behavior decided it has finished") - return + if isinstance(msg["result"]["result"]["value"], bool): + if msg["result"]["result"]["value"]: + self.logger.info("behavior decided it has finished") + return frozenset() + # new-style response dict that has more than just a finished bool + elif isinstance(msg["result"]["result"]["value"], dict): + response = msg["result"]["result"]["value"] + if response["finished"]: + self.logger.info("behavior decided it has finished") + outlinks = frozenset(response.get("outlinks", [])) + return outlinks except BrowsingTimeout: pass