mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-12-24 04:41:05 -05:00
outlinks: simplify outlink parsing
The outlinks are collected as HTMLAnchorElement objects. The previous version stringified them by joining the entire set of objects into a single newline-delimited string, which was then split back up again in Python. It seems easier to just send back a JSON array of strings and have Python iterate over that array directly.
This commit is contained in:
parent
93bb1a9a35
commit
33fffdfefd
2 changed files with 8 additions and 4 deletions
|
|
@ -784,7 +784,11 @@ class Browser:
|
|||
# Now we actually do outlink extraction
|
||||
msg_id = self.send_to_chrome(
|
||||
method="Runtime.evaluate",
|
||||
params={"expression": "__brzl_outlinksString()"},
|
||||
params={
|
||||
"expression": "__brzl_extractOutlinks()",
|
||||
# returnByValue ensures we can receive an array response
|
||||
"returnByValue": True,
|
||||
},
|
||||
)
|
||||
self._wait_for(
|
||||
lambda: self.websock_thread.received_result(msg_id), timeout=timeout
|
||||
|
|
@ -798,7 +802,7 @@ class Browser:
|
|||
):
|
||||
if message["result"]["result"]["value"]:
|
||||
out = []
|
||||
for link in message["result"]["result"]["value"].split("\n"):
|
||||
for link in message["result"]["result"]["value"]:
|
||||
try:
|
||||
out.append(str(urlcanon.whatwg(link)))
|
||||
except AddressValueError:
|
||||
|
|
|
|||
|
|
@ -36,6 +36,6 @@ var __brzl_compileOutlinks = function(frame) {
|
|||
|
||||
return outlinks;
|
||||
}
|
||||
var __brzl_outlinksString = function() {
|
||||
return __brzl_compileOutlinks(window).join('\n');
|
||||
var __brzl_extractOutlinks = function() {
|
||||
return __brzl_compileOutlinks(window).map(el => el.toString());
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue