BookStack/app/Entities/Tools/PageIncludeParser.php
Dan Brown 4874dc1304
Includes: Updated logic regarding parent block els, added tests
Expanded tests with many more cases, and added fixes for failed
scenarios.
Updated logic to specifically handle parent <p> tags, and now assume
compatibility with parent block types elsewhere to allow use in a
variety of scenarios (td, details, blockquote etc...).
2023-11-25 17:32:00 +00:00

213 lines
6.7 KiB
PHP

<?php
namespace BookStack\Entities\Tools;
use BookStack\Util\HtmlDocument;
use Closure;
use DOMDocument;
use DOMElement;
use DOMNode;
use DOMText;
class PageIncludeParser
{
/**
 * Pattern used to find include tags within text content.
 * Matches "{{@", an optional single whitespace character, then captures
 * a reference that starts with a digit and runs (non-greedily) up to
 * the closing "}}".
 */
protected static string $includeTagRegex = "/{{@\s?([0-9].*?)}}/";
/**
 * Elements to clean up and remove if left empty after a parsing operation.
 * Filled by splitNodeAtChildNode() and processed by cleanup().
 * @var DOMElement[]
 */
protected array $toCleanup = [];
/**
 * @param string  $pageHtml         The page HTML that parse() will scan for include tags.
 * @param Closure $pageContentForId Invoked by parse() through Closure::call(), with this
 *                                  parser bound as $this and the value of the tag's
 *                                  getPageId() as its only argument. The returned value
 *                                  is handed to PageIncludeContent as the include HTML.
 */
public function __construct(
protected string $pageHtml,
protected Closure $pageContentForId,
) {
}
/**
 * Resolve all include tags found in the page HTML and return the
 * resulting inner HTML of the document body.
 *
 * For every located tag the include content is fetched via the content
 * closure. Content that is not inline is first lifted out of any ancestor
 * paragraph: a paragraph directly holding the tag is split around it,
 * otherwise the tag is moved to sit beside that paragraph. The tag node
 * is then swapped for the content nodes, and stranded empty elements are
 * cleaned up once all tags are handled.
 */
public function parse(): string
{
    $doc = new HtmlDocument($this->pageHtml);

    foreach ($this->locateAndIsolateIncludeTags($doc) as $tag) {
        $includeHtml = $this->pageContentForId->call($this, $tag->getPageId());
        $content = new PageIncludeContent($includeHtml, $tag);

        // Only non-inline content needs to escape a surrounding paragraph.
        $paragraph = $content->isInline() ? null : $this->getParentParagraph($tag->domNode);

        if ($paragraph !== null) {
            if ($paragraph === $tag->domNode->parentNode) {
                // Tag sits directly within the paragraph: split the paragraph around it.
                $this->splitNodeAtChildNode($tag->domNode->parentNode, $tag->domNode);
            } else {
                // Tag is nested deeper: relocate it next to the paragraph instead.
                $this->moveTagNodeToBesideParent($tag, $paragraph);
            }
        }

        $this->replaceNodeWithNodes($tag->domNode, $content->toDomNodes());
    }

    $this->cleanup();

    return $doc->getBodyInnerHtml();
}
/**
 * Find the include tags within the given document, placing each one into
 * a text node of its own so it can be individually manipulated later.
 * @return PageIncludeTag[]
 */
protected function locateAndIsolateIncludeTags(HtmlDocument $doc): array
{
    // Any element within the body that directly holds text containing a tag opener.
    $hosts = $doc->queryXPath("//body//*[text()[contains(., '{{@')]]");
    $found = [];

    /** @var DOMNode $host */
    foreach ($hosts as $host) {
        /** @var DOMNode $child */
        foreach ($host->childNodes as $child) {
            if ($child->nodeName !== '#text') {
                continue;
            }

            // Splitting inserts the new text nodes before $child within $host.
            foreach ($this->splitTextNodesAtTags($child) as $tag) {
                $found[] = $tag;
            }
        }
    }

    return $found;
}
/**
 * Split the content of the given text node at each include tag, so that
 * every tag ends up in its own text node within the original parent.
 * Text preceding a tag is placed in a new node before it, while the
 * original node is left holding whatever text follows the last tag.
 * Returns a PageIncludeTag for each tag found, in document order.
 * @return PageIncludeTag[]
 */
protected function splitTextNodesAtTags(DOMNode $textNode): array
{
    $text = $textNode->textContent;
    // Offsets captured here are byte offsets, matching substr()/strlen() below.
    preg_match_all(static::$includeTagRegex, $text, $matches, PREG_OFFSET_CAPTURE);

    $found = [];
    $cursor = 0;
    $container = $textNode->parentNode;

    foreach ($matches[0] as $matchIndex => [$outerTag, $tagStart]) {
        $innerTag = $matches[1][$matchIndex][0];

        // Preserve any plain text sitting between the previous tag and this one.
        if ($tagStart > $cursor) {
            $leadingText = substr($text, $cursor, $tagStart - $cursor);
            $container->insertBefore(new DOMText($leadingText), $textNode);
        }

        $tagNode = $container->insertBefore(new DOMText($outerTag), $textNode);
        $found[] = new PageIncludeTag($innerTag, $tagNode);

        $cursor = $tagStart + strlen($outerTag);
    }

    // Only rewrite the original node where at least one tag was split off.
    if ($cursor > 0) {
        $textNode->textContent = substr($text, $cursor);
    }

    return $found;
}
/**
 * Replace the given node with all those in $replacements, inserted in
 * the order provided at the position of the replaced node.
 * Replacements belonging to a different document are deep-copied into
 * the target document before insertion.
 * @param DOMNode[] $replacements
 */
protected function replaceNodeWithNodes(DOMNode $toReplace, array $replacements): void
{
    /** @var DOMDocument $targetDoc */
    $targetDoc = $toReplace->ownerDocument;
    $container = $toReplace->parentNode;

    foreach ($replacements as $replacement) {
        if ($replacement->ownerDocument !== $targetDoc) {
            // importNode() is used instead of adoptNode() since the latter is
            // only available from PHP 8.3, and would be a fatal error before that.
            $replacement = $targetDoc->importNode($replacement, true);
        }

        $container->insertBefore($replacement, $toReplace);
    }

    $container->removeChild($toReplace);
}
/**
 * Relocate the tag's node so it becomes a sibling of the given parent.
 * The side is guessed from where the tag content first appears within
 * the parent's text: within the first half places the node before the
 * parent, otherwise it is placed directly after the parent.
 */
protected function moveTagNodeToBesideParent(PageIncludeTag $tag, DOMNode $parent): void
{
    $text = $parent->textContent;
    $inFirstHalf = strpos($text, $tag->tagContent) < (strlen($text) / 2);

    // Inserting before the parent's next sibling is the same as inserting after the parent.
    $reference = $inFirstHalf ? $parent : $parent->nextSibling;
    $parent->parentNode->insertBefore($tag->domNode, $reference);
}
/**
 * Split $parentNode in two around $domNode, which is one of its children.
 * A shallow clone of $parentNode (without its id attribute) is inserted
 * before the original and receives all children preceding $domNode.
 * $domNode is then moved out to sit between the clone and the original,
 * which keeps the remaining children. Both halves are registered for
 * cleanup in case either is left empty.
 */
protected function splitNodeAtChildNode(DOMElement $parentNode, DOMNode $domNode): void
{
    // Snapshot the children since the live list changes as nodes are moved.
    $snapshot = [...$parentNode->childNodes];
    $index = array_search($domNode, $snapshot, true);
    $splitAt = ($index === false) ? count($snapshot) - 1 : $index;

    $container = $parentNode->parentNode;
    $leadingHalf = $parentNode->cloneNode();
    $container->insertBefore($leadingHalf, $parentNode);
    // Avoid a duplicated id; the original element keeps it.
    $leadingHalf->removeAttribute('id');

    /** @var DOMNode $child */
    foreach (array_slice($snapshot, 0, $splitAt) as $child) {
        $leadingHalf->appendChild($child);
    }

    $container->insertBefore($domNode, $parentNode);

    array_push($this->toCleanup, $parentNode, $leadingHalf);
}
/**
 * Get the closest paragraph for the given node, if existing.
 * The given node itself is checked first, followed by each ancestor in
 * turn up to the top of the tree. Returns null if no <p> is found.
 */
protected function getParentParagraph(DOMNode $parent): ?DOMNode
{
    // parentNode is used to walk upwards instead of parentElement, since the
    // latter only exists from PHP 8.3. On earlier versions it reads as an
    // undefined property (null), which ends the walk before any ancestor is checked.
    for ($node = $parent; $node !== null; $node = $node->parentNode) {
        if (strtolower($node->nodeName) === 'p') {
            return $node;
        }
    }

    return null;
}
/**
 * Tidy up after a parse operation.
 * Each element registered for cleanup is normalized, then removed from
 * its parent if it is still attached and has been left without children.
 */
protected function cleanup(): void
{
    foreach ($this->toCleanup as $candidate) {
        $candidate->normalize();

        $container = $candidate->parentNode;
        if (!$container || $candidate->hasChildNodes()) {
            continue;
        }

        $container->removeChild($candidate);
    }
}
}