mirror of
https://github.com/RetroShare/RetroShare.git
synced 2025-05-19 14:30:43 -04:00
Implement deep indexing and search for forums
RsGxsNetTunnelService::receiveSearchRequest: handle the no-results case properly. RsNxsObserver::handleDistantSearchRequest: improve the method behaviour documentation. RsTurtleClientService: improve documentation.
This commit is contained in:
parent
1b551d809f
commit
9c38eed648
13 changed files with 902 additions and 89 deletions
|
@ -168,13 +168,33 @@ std::string simpleTextHtmlExtract(const std::string& rsHtmlDoc)
|
|||
|
||||
std::string retVal(rsHtmlDoc.substr(bodyTagEnd+1));
|
||||
|
||||
// strip also CSS inside <style></style>
|
||||
oSize = retVal.size();
|
||||
auto styleTagBegin(retVal.find("<style"));
|
||||
if(styleTagBegin < oSize)
|
||||
{
|
||||
auto styleEnd(retVal.find("</style>", styleTagBegin));
|
||||
if(styleEnd < oSize)
|
||||
retVal.erase(styleTagBegin, 8+styleEnd-styleTagBegin);
|
||||
}
|
||||
|
||||
std::string::size_type oPos;
|
||||
std::string::size_type cPos;
|
||||
int itCount = 0;
|
||||
while((oPos = retVal.find("<")) < retVal.size())
|
||||
{
|
||||
if((cPos = retVal.find(">")) <= retVal.size())
|
||||
retVal.erase(oPos, 1+cPos-oPos);
|
||||
else break;
|
||||
|
||||
// Avoid infinite loop with crafty input
|
||||
if(itCount > 1000)
|
||||
{
|
||||
RS_WARN( "Breaking stripping loop due to max allowed iterations ",
|
||||
"rsHtmlDoc: ", rsHtmlDoc, " retVal: ", retVal );
|
||||
break;
|
||||
}
|
||||
++itCount;
|
||||
}
|
||||
|
||||
return retVal;
|
||||
|
|
208
libretroshare/src/deep_search/forumsindex.cpp
Normal file
208
libretroshare/src/deep_search/forumsindex.cpp
Normal file
|
@ -0,0 +1,208 @@
|
|||
/*******************************************************************************
|
||||
* RetroShare full text indexing and search implementation based on Xapian *
|
||||
* *
|
||||
* Copyright (C) 2021 Gioacchino Mazzurco <gio@eigenlab.org> *
|
||||
* Copyright (C) 2021 Asociación Civil Altermundi <info@altermundi.net> *
|
||||
* *
|
||||
* This program is free software: you can redistribute it and/or modify *
|
||||
* it under the terms of the GNU Affero General Public License version 3 as *
|
||||
* published by the Free Software Foundation. *
|
||||
* *
|
||||
* This program is distributed in the hope that it will be useful, *
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
|
||||
* GNU Affero General Public License for more details. *
|
||||
* *
|
||||
* You should have received a copy of the GNU Affero General Public License *
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>. *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
#include "deep_search/forumsindex.hpp"
|
||||
#include "deep_search/commonutils.hpp"
|
||||
#include "retroshare/rsinit.h"
|
||||
#include "retroshare/rsgxsforums.h"
|
||||
#include "util/rsdebuglevel4.h"
|
||||
|
||||
std::error_condition DeepForumsIndex::search(
|
||||
const std::string& queryStr,
|
||||
std::vector<DeepForumsSearchResult>& results, uint32_t maxResults )
|
||||
{
|
||||
results.clear();
|
||||
|
||||
std::unique_ptr<Xapian::Database> dbPtr(
|
||||
DeepSearch::openReadOnlyDatabase(mDbPath) );
|
||||
if(!dbPtr) return std::errc::bad_file_descriptor;
|
||||
|
||||
Xapian::Database& db(*dbPtr);
|
||||
|
||||
// Set up a QueryParser with a stemmer and suitable prefixes.
|
||||
Xapian::QueryParser queryparser;
|
||||
//queryparser.set_stemmer(Xapian::Stem("en"));
|
||||
queryparser.set_stemming_strategy(queryparser.STEM_SOME);
|
||||
// Start of prefix configuration.
|
||||
//queryparser.add_prefix("title", "S");
|
||||
//queryparser.add_prefix("description", "XD");
|
||||
// End of prefix configuration.
|
||||
|
||||
// And parse the query.
|
||||
Xapian::Query query = queryparser.parse_query(queryStr);
|
||||
|
||||
// Use an Enquire object on the database to run the query.
|
||||
Xapian::Enquire enquire(db);
|
||||
enquire.set_query(query);
|
||||
|
||||
Xapian::MSet mset = enquire.get_mset(
|
||||
0, maxResults ? maxResults : db.get_doccount() );
|
||||
|
||||
for( Xapian::MSetIterator m = mset.begin(); m != mset.end(); ++m )
|
||||
{
|
||||
const Xapian::Document& doc = m.get_document();
|
||||
DeepForumsSearchResult s;
|
||||
s.mUrl = doc.get_value(URL_VALUENO);
|
||||
#if XAPIAN_AT_LEAST(1,3,5)
|
||||
s.mSnippet = mset.snippet(doc.get_data());
|
||||
#endif // XAPIAN_AT_LEAST(1,3,5)
|
||||
results.push_back(s);
|
||||
}
|
||||
|
||||
return std::error_condition();
|
||||
}
|
||||
|
||||
/*static*/ std::string DeepForumsIndex::forumIndexId(const RsGxsGroupId& grpId)
|
||||
{
|
||||
RsUrl forumIndexId(RsGxsForums::DEFAULT_FORUM_BASE_URL);
|
||||
forumIndexId.setQueryKV(
|
||||
RsGxsForums::FORUM_URL_ID_FIELD, grpId.toStdString() );
|
||||
return forumIndexId.toString();
|
||||
}
|
||||
|
||||
/*static*/ std::string DeepForumsIndex::postIndexId(
|
||||
const RsGxsGroupId& grpId, const RsGxsMessageId& msgId )
|
||||
{
|
||||
RsUrl postIndexId(RsGxsForums::DEFAULT_FORUM_BASE_URL);
|
||||
postIndexId.setQueryKV(RsGxsForums::FORUM_URL_ID_FIELD, grpId.toStdString());
|
||||
postIndexId.setQueryKV(RsGxsForums::FORUM_URL_MSG_ID_FIELD, msgId.toStdString());
|
||||
return postIndexId.toString();
|
||||
}
|
||||
|
||||
std::error_condition DeepForumsIndex::indexForumGroup(
|
||||
const RsGxsForumGroup& forum )
|
||||
{
|
||||
// Set up a TermGenerator that we'll use in indexing.
|
||||
Xapian::TermGenerator termgenerator;
|
||||
//termgenerator.set_stemmer(Xapian::Stem("en"));
|
||||
|
||||
// We make a document and tell the term generator to use this.
|
||||
Xapian::Document doc;
|
||||
termgenerator.set_document(doc);
|
||||
|
||||
// Index each field with a suitable prefix.
|
||||
termgenerator.index_text(forum.mMeta.mGroupName, 1, "G");
|
||||
termgenerator.index_text(
|
||||
DeepSearch::timetToXapianDate(forum.mMeta.mPublishTs), 1, "D" );
|
||||
termgenerator.index_text(forum.mDescription, 1, "XD");
|
||||
|
||||
// Index fields without prefixes for general search.
|
||||
termgenerator.index_text(forum.mMeta.mGroupName);
|
||||
termgenerator.increase_termpos();
|
||||
termgenerator.index_text(forum.mDescription);
|
||||
|
||||
// store the RS link so we are able to retrive it on matching search
|
||||
const std::string rsLink(forumIndexId(forum.mMeta.mGroupId));
|
||||
doc.add_value(URL_VALUENO, rsLink);
|
||||
|
||||
/* Store some fields for display purposes. Retrieved later to provide the
|
||||
* matching snippet on search */
|
||||
doc.set_data(forum.mMeta.mGroupName + "\n" + forum.mDescription);
|
||||
|
||||
/* We use the identifier to ensure each object ends up in the database only
|
||||
* once no matter how many times we run the indexer.
|
||||
* "Q" prefix is a Xapian convention for unique id term. */
|
||||
const std::string idTerm("Q" + rsLink);
|
||||
doc.add_boolean_term(idTerm);
|
||||
|
||||
mWriteQueue.push([idTerm, doc](Xapian::WritableDatabase& db)
|
||||
{ db.replace_document(idTerm, doc); } );
|
||||
|
||||
return std::error_condition();
|
||||
}
|
||||
|
||||
std::error_condition DeepForumsIndex::removeForumFromIndex(
|
||||
const RsGxsGroupId& grpId )
|
||||
{
|
||||
mWriteQueue.push([grpId](Xapian::WritableDatabase& db)
|
||||
{ db.delete_document("Q" + forumIndexId(grpId)); });
|
||||
|
||||
return std::error_condition();
|
||||
}
|
||||
|
||||
std::error_condition DeepForumsIndex::indexForumPost(const RsGxsForumMsg& post)
|
||||
{
|
||||
RS_DBG4(post);
|
||||
|
||||
const auto& groupId = post.mMeta.mGroupId;
|
||||
const auto& msgId = post.mMeta.mMsgId;
|
||||
|
||||
if(groupId.isNull() || msgId.isNull())
|
||||
{
|
||||
RS_ERR("Got post with invalid id ", post);
|
||||
print_stacktrace();
|
||||
return std::errc::invalid_argument;
|
||||
}
|
||||
|
||||
// Set up a TermGenerator that we'll use in indexing.
|
||||
Xapian::TermGenerator termgenerator;
|
||||
//termgenerator.set_stemmer(Xapian::Stem("en"));
|
||||
|
||||
// We make a document and tell the term generator to use this.
|
||||
Xapian::Document doc;
|
||||
termgenerator.set_document(doc);
|
||||
|
||||
// Index each field with a suitable prefix.
|
||||
termgenerator.index_text(post.mMeta.mMsgName, 1, "S");
|
||||
termgenerator.index_text(
|
||||
DeepSearch::timetToXapianDate(post.mMeta.mPublishTs), 1, "D" );
|
||||
|
||||
// Avoid indexing RetroShare-gui HTML tags
|
||||
const std::string cleanMsg = DeepSearch::simpleTextHtmlExtract(post.mMsg);
|
||||
termgenerator.index_text(cleanMsg, 1, "XD" );
|
||||
|
||||
// Index fields without prefixes for general search.
|
||||
termgenerator.index_text(post.mMeta.mMsgName);
|
||||
|
||||
termgenerator.increase_termpos();
|
||||
termgenerator.index_text(cleanMsg);
|
||||
// store the RS link so we are able to retrive it on matching search
|
||||
const std::string rsLink(postIndexId(groupId, msgId));
|
||||
doc.add_value(URL_VALUENO, rsLink);
|
||||
|
||||
// Store some fields for display purposes.
|
||||
doc.set_data(post.mMeta.mMsgName + "\n" + cleanMsg);
|
||||
|
||||
// We use the identifier to ensure each object ends up in the
|
||||
// database only once no matter how many times we run the
|
||||
// indexer.
|
||||
const std::string idTerm("Q" + rsLink);
|
||||
doc.add_boolean_term(idTerm);
|
||||
|
||||
mWriteQueue.push( [idTerm, doc](Xapian::WritableDatabase& db)
|
||||
{ db.replace_document(idTerm, doc); } );
|
||||
|
||||
|
||||
return std::error_condition();
|
||||
}
|
||||
|
||||
std::error_condition DeepForumsIndex::removeForumPostFromIndex(
|
||||
RsGxsGroupId grpId, RsGxsMessageId msgId )
|
||||
{
|
||||
// "Q" prefix is a Xapian convention for unique id term.
|
||||
std::string idTerm("Q" + postIndexId(grpId, msgId));
|
||||
mWriteQueue.push( [idTerm](Xapian::WritableDatabase& db)
|
||||
{ db.delete_document(idTerm); } );
|
||||
|
||||
return std::error_condition();
|
||||
}
|
||||
|
||||
/*static*/ std::string DeepForumsIndex::dbDefaultPath()
|
||||
{ return RsAccounts::AccountDirectory() + "/deep_forum_index_xapian_db"; }
|
81
libretroshare/src/deep_search/forumsindex.hpp
Normal file
81
libretroshare/src/deep_search/forumsindex.hpp
Normal file
|
@ -0,0 +1,81 @@
|
|||
/*******************************************************************************
|
||||
* RetroShare full text indexing and search implementation based on Xapian *
|
||||
* *
|
||||
* Copyright (C) 2021 Gioacchino Mazzurco <gio@eigenlab.org> *
|
||||
* Copyright (C) 2021 Asociación Civil Altermundi <info@altermundi.net> *
|
||||
* *
|
||||
* This program is free software: you can redistribute it and/or modify *
|
||||
* it under the terms of the GNU Affero General Public License version 3 as *
|
||||
* published by the Free Software Foundation. *
|
||||
* *
|
||||
* This program is distributed in the hope that it will be useful, *
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
|
||||
* GNU Affero General Public License for more details. *
|
||||
* *
|
||||
* You should have received a copy of the GNU Affero General Public License *
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>. *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
#pragma once
|
||||
|
||||
#include <system_error>
|
||||
#include <vector>
|
||||
#include <xapian.h>
|
||||
|
||||
#include "util/rstime.h"
|
||||
#include "retroshare/rsgxsforums.h"
|
||||
#include "retroshare/rsevents.h"
|
||||
#include "deep_search/commonutils.hpp"
|
||||
|
||||
/// A single match returned by a forums index search
struct DeepForumsSearchResult
{
	/// RetroShare URL of the matching forum or post
	std::string mUrl;

	/** Weight of the match. Explicitly zero initialized so a default
	 * constructed result never carries an indeterminate value. */
	double mWeight = 0.0;

	/// Text excerpt of the matching document, may be empty
	std::string mSnippet;
};
|
||||
|
||||
struct DeepForumsIndex
|
||||
{
|
||||
explicit DeepForumsIndex(const std::string& dbPath) :
|
||||
mDbPath(dbPath), mWriteQueue(dbPath) {}
|
||||
|
||||
/**
|
||||
* @brief Search indexed GXS groups and messages
|
||||
* @param[in] maxResults maximum number of acceptable search results, 0 for
|
||||
* no limits
|
||||
* @return search results count
|
||||
*/
|
||||
std::error_condition search( const std::string& queryStr,
|
||||
std::vector<DeepForumsSearchResult>& results,
|
||||
uint32_t maxResults = 100 );
|
||||
|
||||
std::error_condition indexForumGroup(const RsGxsForumGroup& chan);
|
||||
|
||||
std::error_condition removeForumFromIndex(const RsGxsGroupId& grpId);
|
||||
|
||||
std::error_condition indexForumPost(const RsGxsForumMsg& post);
|
||||
|
||||
std::error_condition removeForumPostFromIndex(
|
||||
RsGxsGroupId grpId, RsGxsMessageId msgId );
|
||||
|
||||
static std::string dbDefaultPath();
|
||||
|
||||
private:
|
||||
static std::string forumIndexId(const RsGxsGroupId& grpId);
|
||||
static std::string postIndexId(
|
||||
const RsGxsGroupId& grpId, const RsGxsMessageId& msgId );
|
||||
|
||||
enum : Xapian::valueno
|
||||
{
|
||||
/// Used to store retroshare url of indexed documents
|
||||
URL_VALUENO,
|
||||
|
||||
/// @see Xapian::BAD_VALUENO
|
||||
BAD_VALUENO = Xapian::BAD_VALUENO
|
||||
};
|
||||
|
||||
const std::string mDbPath;
|
||||
|
||||
DeepSearch::StubbornWriteOpQueue mWriteQueue;
|
||||
};
|
Loading…
Add table
Add a link
Reference in a new issue