Fixed deprecated encoding conversion on HTML doc load

This commit is contained in:
Dan Brown 2023-02-23 22:59:26 +00:00
parent 2724b2867b
commit a031edec16
No known key found for this signature in database
GPG Key ID: 46D9F943C24A2EF9
4 changed files with 16 additions and 23 deletions

View File

@ -449,8 +449,8 @@ class PageContent
{ {
libxml_use_internal_errors(true); libxml_use_internal_errors(true);
$doc = new DOMDocument(); $doc = new DOMDocument();
$html = '<body>' . $html . '</body>'; $html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>';
$doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8')); $doc->loadHTML($html);
return $doc; return $doc;
} }

View File

@ -54,10 +54,10 @@ class CrossLinkParser
{ {
$links = []; $links = [];
$html = '<body>' . $html . '</body>'; $html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>';
libxml_use_internal_errors(true); libxml_use_internal_errors(true);
$doc = new DOMDocument(); $doc = new DOMDocument();
$doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8')); $doc->loadHTML($html);
$xPath = new DOMXPath($doc); $xPath = new DOMXPath($doc);
$anchors = $xPath->query('//a[@href]'); $anchors = $xPath->query('//a[@href]');

View File

@ -15,25 +15,18 @@ class SearchIndex
{ {
/** /**
* A list of delimiter characters used to break-up parsed content into terms for indexing. * A list of delimiter characters used to break-up parsed content into terms for indexing.
*
* @var string
*/ */
public static $delimiters = " \n\t.,!?:;()[]{}<>`'\""; public static string $delimiters = " \n\t.,!?:;()[]{}<>`'\"";
/** public function __construct(
* @var EntityProvider protected EntityProvider $entityProvider
*/ ) {
protected $entityProvider;
public function __construct(EntityProvider $entityProvider)
{
$this->entityProvider = $entityProvider;
} }
/** /**
* Index the given entity. * Index the given entity.
*/ */
public function indexEntity(Entity $entity) public function indexEntity(Entity $entity): void
{ {
$this->deleteEntityTerms($entity); $this->deleteEntityTerms($entity);
$terms = $this->entityToTermDataArray($entity); $terms = $this->entityToTermDataArray($entity);
@ -45,7 +38,7 @@ class SearchIndex
* *
* @param Entity[] $entities * @param Entity[] $entities
*/ */
public function indexEntities(array $entities) public function indexEntities(array $entities): void
{ {
$terms = []; $terms = [];
foreach ($entities as $entity) { foreach ($entities as $entity) {
@ -69,7 +62,7 @@ class SearchIndex
* *
* @param callable(Entity, int, int):void|null $progressCallback * @param callable(Entity, int, int):void|null $progressCallback
*/ */
public function indexAllEntities(?callable $progressCallback = null) public function indexAllEntities(?callable $progressCallback = null): void
{ {
SearchTerm::query()->truncate(); SearchTerm::query()->truncate();
@ -101,7 +94,7 @@ class SearchIndex
/** /**
* Delete related Entity search terms. * Delete related Entity search terms.
*/ */
public function deleteEntityTerms(Entity $entity) public function deleteEntityTerms(Entity $entity): void
{ {
$entity->searchTerms()->delete(); $entity->searchTerms()->delete();
} }
@ -145,12 +138,12 @@ class SearchIndex
'h6' => 1.5, 'h6' => 1.5,
]; ];
$html = '<body>' . $html . '</body>'; $html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>';
$html = str_ireplace(['<br>', '<br />', '<br/>'], "\n", $html); $html = str_ireplace(['<br>', '<br />', '<br/>'], "\n", $html);
libxml_use_internal_errors(true); libxml_use_internal_errors(true);
$doc = new DOMDocument(); $doc = new DOMDocument();
$doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8')); $doc->loadHTML($html);
$topElems = $doc->documentElement->childNodes->item(0)->childNodes; $topElems = $doc->documentElement->childNodes->item(0)->childNodes;
/** @var DOMNode $child */ /** @var DOMNode $child */

View File

@ -19,10 +19,10 @@ class HtmlContentFilter
return $html; return $html;
} }
$html = '<body>' . $html . '</body>'; $html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>';
libxml_use_internal_errors(true); libxml_use_internal_errors(true);
$doc = new DOMDocument(); $doc = new DOMDocument();
$doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8')); $doc->loadHTML($html);
$xPath = new DOMXPath($doc); $xPath = new DOMXPath($doc);
// Remove standard script tags // Remove standard script tags