Merge branch 'master' into develop

This commit is contained in:
Andrew Morgan 2022-06-28 15:19:48 +01:00
commit 6cba6a51af
5 changed files with 84 additions and 25 deletions

View file

@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import codecs
import itertools
import logging
import re
from typing import (
@@ -21,7 +20,7 @@ from typing import (
Dict,
Generator,
Iterable,
Optional,
List, Optional,
Set,
Union,
)
@@ -354,7 +353,7 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
from lxml import etree
TAGS_TO_REMOVE = (
TAGS_TO_REMOVE = {
"header",
"nav",
"aside",
@@ -369,31 +368,42 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
"img",
"picture",
etree.Comment,
)
}
# Split all the text nodes into paragraphs (by splitting on new
# lines)
text_nodes = (
re.sub(r"\s+", "\n", el).strip()
for el in _iterate_over_text(tree.find("body"), *TAGS_TO_REMOVE)
for el in _iterate_over_text(tree.find("body"), TAGS_TO_REMOVE)
)
return summarize_paragraphs(text_nodes)
def _iterate_over_text(
tree: "etree.Element", *tags_to_ignore: Union[str, "etree.Comment"]
tree: Optional["etree.Element"],
tags_to_ignore: Set[Union[str, "etree.Comment"]],
stack_limit: int = 1024,
) -> Generator[str, None, None]:
"""Iterate over the tree returning text nodes in a depth first fashion,
skipping text nodes inside certain tags.
Args:
tree: The parent element to iterate. Can be None if there isn't one.
tags_to_ignore: Set of tags to ignore
stack_limit: Maximum stack size limit for depth-first traversal.
Nodes will be dropped if this limit is hit, which may truncate the
textual result.
Intended to limit the maximum working memory when generating a preview.
"""
# This is basically a stack that we extend using itertools.chain.
# This will either consist of an element to iterate over *or* a string
if tree is None:
return
# This is a stack whose items are elements to iterate over *or* strings
# to be returned.
elements = iter([tree])
while True:
el = next(elements, None)
if el is None:
return
elements: List[Union[str, "etree.Element"]] = [tree]
while elements:
el = elements.pop()
if isinstance(el, str):
yield el
@@ -407,17 +417,22 @@ def _iterate_over_text(
if el.text:
yield el.text
# We add to the stack all the elements children, interspersed with
# each child's tail text (if it exists). The tail text of a node
# is text that comes *after* the node, so we always include it even
# if we ignore the child node.
elements = itertools.chain(
itertools.chain.from_iterable( # Basically a flatmap
[child, child.tail] if child.tail else [child]
for child in el.iterchildren()
),
elements,
)
# We add to the stack all the element's children, interspersed with
# each child's tail text (if it exists).
#
# We iterate in reverse order so that earlier pieces of text appear
# closer to the top of the stack.
for child in el.iterchildren(reversed=True):
if len(elements) > stack_limit:
# We've hit our limit for working memory
break
if child.tail:
# The tail text of a node is text that comes *after* the node,
# so we always include it even if we ignore the child node.
elements.append(child.tail)
elements.append(child)
def summarize_paragraphs(