diff --git a/app/Entities/Models/Book.php b/app/Entities/Models/Book.php
index 982df5c90..359f7961c 100644
--- a/app/Entities/Models/Book.php
+++ b/app/Entities/Models/Book.php
@@ -24,7 +24,7 @@ class Book extends Entity implements HasCoverImage
 {
     use HasFactory;
 
-    public $searchFactor = 2;
+    public $searchFactor = 1.5;
 
     protected $fillable = ['name', 'description'];
     protected $hidden = ['restricted', 'pivot', 'image_id', 'deleted_at'];
diff --git a/app/Entities/Models/Bookshelf.php b/app/Entities/Models/Bookshelf.php
index 8fe9dbe41..b426858c3 100644
--- a/app/Entities/Models/Bookshelf.php
+++ b/app/Entities/Models/Bookshelf.php
@@ -13,7 +13,7 @@ class Bookshelf extends Entity implements HasCoverImage
 
     protected $table = 'bookshelves';
 
-    public $searchFactor = 3;
+    public $searchFactor = 1.5;
 
     protected $fillable = ['name', 'description', 'image_id'];
 
diff --git a/app/Entities/Models/Chapter.php b/app/Entities/Models/Chapter.php
index 0e2917af3..f4d1a281d 100644
--- a/app/Entities/Models/Chapter.php
+++ b/app/Entities/Models/Chapter.php
@@ -16,7 +16,7 @@ class Chapter extends BookChild
 {
     use HasFactory;
 
-    public $searchFactor = 1.3;
+    public $searchFactor = 1.5;
 
     protected $fillable = ['name', 'description', 'priority', 'book_id'];
     protected $hidden = ['restricted', 'pivot', 'deleted_at'];
diff --git a/app/Entities/Models/Entity.php b/app/Entities/Models/Entity.php
index f5f9d91f0..4c4e55bb8 100644
--- a/app/Entities/Models/Entity.php
+++ b/app/Entities/Models/Entity.php
@@ -238,20 +238,12 @@ abstract class Entity extends Model implements Sluggable, Favouritable, Viewable
         return mb_substr($this->name, 0, $length - 3) . '...';
     }
 
-    /**
-     * Get the body text of this entity.
-     */
-    public function getText(): string
-    {
-        return $this->{$this->textField} ?? '';
-    }
-
     /**
      * Get an excerpt of this entity's descriptive content to the specified length.
      */
     public function getExcerpt(int $length = 100): string
     {
-        $text = $this->getText();
+        $text = $this->{$this->textField} ?? '';
 
         if (mb_strlen($text) > $length) {
             $text = mb_substr($text, 0, $length - 3) . '...';
diff --git a/app/Entities/Models/Page.php b/app/Entities/Models/Page.php
index 27d5dc6a4..c28b9a305 100644
--- a/app/Entities/Models/Page.php
+++ b/app/Entities/Models/Page.php
@@ -3,13 +3,13 @@
 namespace BookStack\Entities\Models;
 
 use BookStack\Entities\Tools\PageContent;
+use BookStack\Facades\Permissions;
 use BookStack\Uploads\Attachment;
 use Illuminate\Database\Eloquent\Builder;
 use Illuminate\Database\Eloquent\Collection;
 use Illuminate\Database\Eloquent\Factories\HasFactory;
 use Illuminate\Database\Eloquent\Relations\BelongsTo;
 use Illuminate\Database\Eloquent\Relations\HasMany;
-use Permissions;
 
 /**
  * Class Page.
@@ -64,10 +64,8 @@ class Page extends BookChild
 
     /**
      * Check if this page has a chapter.
-     *
-     * @return bool
      */
-    public function hasChapter()
+    public function hasChapter(): bool
     {
         return $this->chapter()->count() > 0;
     }
diff --git a/app/Entities/Repos/PageRepo.php b/app/Entities/Repos/PageRepo.php
index ffa06d459..98fe4ef55 100644
--- a/app/Entities/Repos/PageRepo.php
+++ b/app/Entities/Repos/PageRepo.php
@@ -157,8 +157,8 @@ class PageRepo
      */
     public function publishDraft(Page $draft, array $input): Page
     {
-        $this->baseRepo->update($draft, $input);
         $this->updateTemplateStatusAndContentFromInput($draft, $input);
+        $this->baseRepo->update($draft, $input);
 
         $draft->draft = false;
         $draft->revision_count = 1;
diff --git a/app/Entities/Tools/SearchIndex.php b/app/Entities/Tools/SearchIndex.php
index 50e471bc9..bde5ef860 100644
--- a/app/Entities/Tools/SearchIndex.php
+++ b/app/Entities/Tools/SearchIndex.php
@@ -4,7 +4,10 @@ namespace BookStack\Entities\Tools;
 
 use BookStack\Entities\EntityProvider;
 use BookStack\Entities\Models\Entity;
+use BookStack\Entities\Models\Page;
 use BookStack\Entities\Models\SearchTerm;
+use DOMDocument;
+use DOMNode;
 use Illuminate\Support\Collection;
 
 class SearchIndex
@@ -64,7 +67,8 @@ class SearchIndex
         SearchTerm::query()->truncate();
 
         foreach ($this->entityProvider->all() as $entityModel) {
-            $selectFields = ['id', 'name', $entityModel->textField];
+            $indexContentField = $entityModel instanceof Page ? 'html' : 'description';
+            $selectFields = ['id', 'name', $indexContentField];
             $total = $entityModel->newQuery()->withTrashed()->count();
             $chunkSize = 250;
             $processed = 0;
@@ -93,11 +97,70 @@ class SearchIndex
     }
 
     /**
-     * Create a scored term array from the given text.
+     * Create a scored term array from the given text, where the keys are the terms
+     * and the values are their scores (term count multiplied by $scoreAdjustment, which may be fractional e.g. 1.5).
      *
-     * @returns array{term: string, score: float}
+     * @returns array<string, float>
      */
-    protected function generateTermArrayFromText(string $text, int $scoreAdjustment = 1): array
+    protected function generateTermScoreMapFromText(string $text, float $scoreAdjustment = 1): array
+    {
+        $termMap = $this->textToTermCountMap($text);
+
+        foreach ($termMap as $term => $count) {
+            $termMap[$term] = $count * $scoreAdjustment;
+        }
+
+        return $termMap;
+    }
+
+    /**
+     * Create a scored term array from the given HTML, where the keys are the terms
+     * and the values are their scores (terms in top-level h1-h6 elements are weighted higher).
+     *
+     * @returns array<string, int|float>
+     */
+    protected function generateTermScoreMapFromHtml(string $html): array
+    {
+        if (empty($html)) {
+            return [];
+        }
+
+        $scoresByTerm = [];
+        $elementScoreAdjustmentMap = [
+            'h1' => 10,
+            'h2' => 5,
+            'h3' => 4,
+            'h4' => 3,
+            'h5' => 2,
+            'h6' => 1.5,
+        ];
+
+        $html = '<body>' . $html . '</body>';
+        libxml_use_internal_errors(true);
+        $doc = new DOMDocument();
+        $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
+
+        $topElems = $doc->documentElement->childNodes->item(0)->childNodes;
+        /** @var DOMNode $child */
+        foreach ($topElems as $child) {
+            $nodeName = $child->nodeName;
+            $termCounts = $this->textToTermCountMap(trim($child->textContent));
+            foreach ($termCounts as $term => $count) {
+                $scoreChange = $count * ($elementScoreAdjustmentMap[$nodeName] ?? 1);
+                $scoresByTerm[$term] = ($scoresByTerm[$term] ?? 0) + $scoreChange;
+            }
+        }
+
+        return $scoresByTerm;
+    }
+
+    /**
+     * For the given text, return an array where the keys are the unique term words
+     * and the values are the frequency of that term.
+     *
+     * @returns array<string, int>
+     */
+    protected function textToTermCountMap(string $text): array
     {
         $tokenMap = []; // {TextToken => OccurrenceCount}
         $splitChars = " \n\t.,!?:;()[]{}<>`'\"";
@@ -111,34 +174,61 @@ class SearchIndex
             $token = strtok($splitChars);
         }
 
-        $terms = [];
-        foreach ($tokenMap as $token => $count) {
-            $terms[] = [
-                'term' => $token,
-                'score' => $count * $scoreAdjustment,
-            ];
-        }
-
-        return $terms;
+        return $tokenMap;
     }
 
     /**
      * For the given entity, Generate an array of term data details.
      * Is the raw term data, not instances of SearchTerm models.
      *
-     * @returns array{term: string, score: float}[]
+     * @returns array{term: string, score: float, entity_id: int, entity_type: string}[]
      */
     protected function entityToTermDataArray(Entity $entity): array
     {
-        $nameTerms = $this->generateTermArrayFromText($entity->name, 40 * $entity->searchFactor);
-        $bodyTerms = $this->generateTermArrayFromText($entity->getText(), 1 * $entity->searchFactor);
-        $termData = array_merge($nameTerms, $bodyTerms);
+        $nameTermsMap = $this->generateTermScoreMapFromText($entity->name, 40 * $entity->searchFactor);
 
-        foreach ($termData as $index => $term) {
-            $termData[$index]['entity_type'] = $entity->getMorphClass();
-            $termData[$index]['entity_id'] = $entity->id;
+        if ($entity instanceof Page) {
+            $bodyTermsMap = $this->generateTermScoreMapFromHtml($entity->html);
+        } else {
+            $bodyTermsMap = $this->generateTermScoreMapFromText($entity->description, $entity->searchFactor);
         }
 
-        return $termData;
+        $mergedScoreMap = $this->mergeTermScoreMaps($nameTermsMap, $bodyTermsMap);
+
+        $dataArray = [];
+        $entityId = $entity->id;
+        $entityType = $entity->getMorphClass();
+        foreach ($mergedScoreMap as $term => $score) {
+            $dataArray[] = [
+                'term' => $term,
+                'score' => $score,
+                'entity_type' => $entityType,
+                'entity_id' => $entityId,
+            ];
+        }
+
+        return $dataArray;
+    }
+
+
+    /**
+     * For the given term data arrays, Merge their contents by term
+     * while combining any scores.
+     *
+     * @param array<string, int|float>[] ...$scoreMaps
+     *
+     * @returns array<string, int|float>
+     */
+    protected function mergeTermScoreMaps(...$scoreMaps): array
+    {
+        $mergedMap = [];
+
+        foreach ($scoreMaps as $scoreMap) {
+            foreach ($scoreMap as $term => $score) {
+                $mergedMap[$term] = ($mergedMap[$term] ?? 0) + $score;
+            }
+        }
+
+        return $mergedMap;
     }
 }
diff --git a/tests/Entity/EntitySearchTest.php b/tests/Entity/EntitySearchTest.php
index c30bb1d99..bd50a13ac 100644
--- a/tests/Entity/EntitySearchTest.php
+++ b/tests/Entity/EntitySearchTest.php
@@ -7,6 +7,7 @@ use BookStack\Entities\Models\Book;
 use BookStack\Entities\Models\Bookshelf;
 use BookStack\Entities\Models\Chapter;
 use BookStack\Entities\Models\Page;
+use BookStack\Entities\Models\SearchTerm;
 use Tests\TestCase;
 
 class EntitySearchTest extends TestCase
@@ -320,4 +321,43 @@ class EntitySearchTest extends TestCase
         $search->assertElementContains('.entity-list > .page', 'Test page B', 1);
         $search->assertElementContains('.entity-list > .page', 'Test page A', 2);
     }
+
+    public function test_terms_in_headers_have_an_adjusted_index_score()
+    {
+        $page = $this->newPage(['name' => 'Test page A', 'html' => '
+            <p>TermA</p>
+            <h1>TermB <strong>TermNested</strong></h1>
+            <h2>TermC</h2>
+            <h3>TermD</h3>
+            <h4>TermE</h4>
+            <h5>TermF</h5>
+            <h6>TermG</h6>
+        ']);
+
+        $entityRelationCols = ['entity_id' => $page->id, 'entity_type' => 'BookStack\\Page'];
+        $scoreByTerm = SearchTerm::query()->where($entityRelationCols)->pluck('score', 'term');
+
+        $this->assertEquals(1, $scoreByTerm->get('TermA'));
+        $this->assertEquals(10, $scoreByTerm->get('TermB'));
+        $this->assertEquals(10, $scoreByTerm->get('TermNested'));
+        $this->assertEquals(5, $scoreByTerm->get('TermC'));
+        $this->assertEquals(4, $scoreByTerm->get('TermD'));
+        $this->assertEquals(3, $scoreByTerm->get('TermE'));
+        $this->assertEquals(2, $scoreByTerm->get('TermF'));
+        // Is 1.5 but stored as integer, rounding up
+        $this->assertEquals(2, $scoreByTerm->get('TermG'));
+    }
+
+    public function test_name_and_content_terms_are_merged_to_single_score()
+    {
+        $page = $this->newPage(['name' => 'TermA', 'html' => '
+            <p>TermA</p>
+        ']);
+
+        $entityRelationCols = ['entity_id' => $page->id, 'entity_type' => 'BookStack\\Page'];
+        $scoreByTerm = SearchTerm::query()->where($entityRelationCols)->pluck('score', 'term');
+
+        // Scores 40 for being in the name then 1 for being in the content
+        $this->assertEquals(41, $scoreByTerm->get('TermA'));
+    }
 }