DeepSearch: index channel posts too

Improve indexing by using RsUrl, and store some relevant fields in the stored URL
This commit is contained in:
Gioacchino Mazzurco 2018-07-04 12:08:50 +02:00
parent 32014eaac1
commit d3e5b760a2
No known key found for this signature in database
GPG Key ID: A1FBCA3872E87051
2 changed files with 154 additions and 45 deletions

View File

@ -17,17 +17,18 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>. * along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
#include <ctime>
#include <vector> #include <vector>
#include <xapian.h> #include <xapian.h>
#include "retroshare/rsgxschannels.h" #include "retroshare/rsgxschannels.h"
#include "retroshare/rsinit.h" #include "retroshare/rsinit.h"
#include "util/rsurl.h"
struct DeepSearch struct DeepSearch
{ {
struct SearchResult struct SearchResult
{ {
// TODO: Use RsUrl from extra_locators branch instead of plain string
std::string mUrl; std::string mUrl;
std::string mSnippet; std::string mSnippet;
}; };
@ -90,6 +91,11 @@ struct DeepSearch
// Index each field with a suitable prefix. // Index each field with a suitable prefix.
termgenerator.index_text(chan.mMeta.mGroupName, 1, "G"); termgenerator.index_text(chan.mMeta.mGroupName, 1, "G");
char date[] = "YYYYMMDD\0";
std::strftime(date, 9, "%Y%m%d", std::gmtime(&chan.mMeta.mPublishTs));
termgenerator.index_text(date, 1, "D");
termgenerator.index_text(chan.mDescription, 1, "XD"); termgenerator.index_text(chan.mDescription, 1, "XD");
// Index fields without prefixes for general search. // Index fields without prefixes for general search.
@ -97,8 +103,14 @@ struct DeepSearch
termgenerator.increase_termpos(); termgenerator.increase_termpos();
termgenerator.index_text(chan.mDescription); termgenerator.index_text(chan.mDescription);
std::string rsLink("retroshare://channel?id="); RsUrl chanUrl; chanUrl
rsLink += chan.mMeta.mGroupId.toStdString(); .setScheme("retroshare").setPath("/channel")
.setQueryKV("id", chan.mMeta.mGroupId.toStdString());
const std::string idTerm("Q" + chanUrl.toString());
chanUrl.setQueryKV("publishDate", date);
chanUrl.setQueryKV("name", chan.mMeta.mGroupName);
std::string rsLink(chanUrl.toString());
// store the RS link so we are able to retrieve it on matching search // store the RS link so we are able to retrieve it on matching search
doc.add_value(URL_VALUENO, rsLink); doc.add_value(URL_VALUENO, rsLink);
@ -109,7 +121,6 @@ struct DeepSearch
// We use the identifier to ensure each object ends up in the // We use the identifier to ensure each object ends up in the
// database only once no matter how many times we run the // database only once no matter how many times we run the
// indexer. "Q" prefix is a Xapian convention for unique id term. // indexer. "Q" prefix is a Xapian convention for unique id term.
const std::string idTerm("Q" + rsLink);
doc.add_boolean_term(idTerm); doc.add_boolean_term(idTerm);
db.replace_document(idTerm, doc); db.replace_document(idTerm, doc);
} }
@ -117,8 +128,10 @@ struct DeepSearch
static void removeChannelFromIndex(RsGxsGroupId grpId) static void removeChannelFromIndex(RsGxsGroupId grpId)
{ {
// "Q" prefix is a Xapian convention for unique id term. // "Q" prefix is a Xapian convention for unique id term.
std::string idTerm("Qretroshare://channel?id="); RsUrl chanUrl; chanUrl
idTerm += grpId.toStdString(); .setScheme("retroshare").setPath("/channel")
.setQueryKV("id", grpId.toStdString());
std::string idTerm("Q" + chanUrl.toString());
Xapian::WritableDatabase db(dbPath(), Xapian::DB_CREATE_OR_OPEN); Xapian::WritableDatabase db(dbPath(), Xapian::DB_CREATE_OR_OPEN);
db.delete_document(idTerm); db.delete_document(idTerm);
@ -138,24 +151,72 @@ struct DeepSearch
// Index each field with a suitable prefix. // Index each field with a suitable prefix.
termgenerator.index_text(post.mMeta.mMsgName, 1, "S"); termgenerator.index_text(post.mMeta.mMsgName, 1, "S");
char date[] = "YYYYMMDD\0";
std::strftime(date, 9, "%Y%m%d", std::gmtime(&post.mMeta.mPublishTs));
termgenerator.index_text(date, 1, "D");
// Avoid indexing HTML
bool isPlainMsg = post.mMsg[0] != '<' || post.mMsg[post.mMsg.size() - 1] != '>';
if(isPlainMsg)
termgenerator.index_text(post.mMsg, 1, "XD"); termgenerator.index_text(post.mMsg, 1, "XD");
// Index fields without prefixes for general search. // Index fields without prefixes for general search.
termgenerator.index_text(post.mMeta.mMsgName); termgenerator.index_text(post.mMeta.mMsgName);
if(isPlainMsg)
{
termgenerator.increase_termpos(); termgenerator.increase_termpos();
termgenerator.index_text(post.mMsg); termgenerator.index_text(post.mMsg);
}
for(const RsGxsFile& attachment : post.mFiles)
{
termgenerator.index_text(attachment.mName, 1, "F");
termgenerator.increase_termpos();
termgenerator.index_text(attachment.mName);
}
// We use the identifier to ensure each object ends up in the // We use the identifier to ensure each object ends up in the
// database only once no matter how many times we run the // database only once no matter how many times we run the
// indexer. // indexer.
std::string idTerm("Qretroshare://channel?id="); RsUrl postUrl; postUrl
idTerm += post.mMeta.mGroupId.toStdString(); .setScheme("retroshare").setPath("/channel")
idTerm += "&msgid="; .setQueryKV("id", post.mMeta.mGroupId.toStdString())
idTerm += post.mMeta.mMsgId.toStdString(); .setQueryKV("msgid", post.mMeta.mMsgId.toStdString());
std::string idTerm("Q" + postUrl.toString());
postUrl.setQueryKV("publishDate", date);
postUrl.setQueryKV("name", post.mMeta.mMsgName);
std::string rsLink(postUrl.toString());
// store the RS link so we are able to retrieve it on matching search
doc.add_value(URL_VALUENO, rsLink);
// Store some fields for display purposes.
if(isPlainMsg)
doc.set_data(post.mMeta.mMsgName + "\n" + post.mMsg);
else doc.set_data(post.mMeta.mMsgName);
doc.add_boolean_term(idTerm); doc.add_boolean_term(idTerm);
db.replace_document(idTerm, doc); db.replace_document(idTerm, doc);
} }
/**
 * Drop a single channel post from the deep search index.
 *
 * The document is located through its unique id term: the letter "Q"
 * followed by the string form of an RsUrl built with scheme "retroshare",
 * path "/channel" and the query keys "id" (group id) and "msgid"
 * (message id). This mirrors how the id term is built when a post is
 * indexed, before the display-only query fields are added to the URL.
 *
 * @param grpId id of the channel (group) the post belongs to
 * @param msgId id of the post (message) to remove from the index
 */
static void removeChannelPostFromIndex(
        RsGxsGroupId grpId, RsGxsMessageId msgId )
{
    // Rebuild the identifying URL with only the two id fields
    RsUrl postLocator;
    postLocator.setScheme("retroshare")
               .setPath("/channel")
               .setQueryKV("id", grpId.toStdString())
               .setQueryKV("msgid", msgId.toStdString());

    // "Q" prefix is a Xapian convention for unique id term.
    const std::string uniqueTerm("Q" + postLocator.toString());

    // Open the index (DB_CREATE_OR_OPEN flag) and delete by unique term
    Xapian::WritableDatabase index(dbPath(), Xapian::DB_CREATE_OR_OPEN);
    index.delete_document(uniqueTerm);
}
private: private:
enum : Xapian::valueno enum : Xapian::valueno

View File

@ -166,6 +166,7 @@ bool RsGxsIntegrityCheck::check()
{ {
#ifdef RS_DEEP_SEARCH #ifdef RS_DEEP_SEARCH
bool isGxsChannels = dynamic_cast<p3GxsChannels*>(mGenExchangeClient); bool isGxsChannels = dynamic_cast<p3GxsChannels*>(mGenExchangeClient);
std::set<RsGxsGroupId> indexedGroups;
#endif #endif
// first take out all the groups // first take out all the groups
@ -232,6 +233,7 @@ bool RsGxsIntegrityCheck::check()
cgIt->toChannelGroup(cg, false); cgIt->toChannelGroup(cg, false);
cg.mMeta = meta; cg.mMeta = meta;
indexedGroups.insert(grp->grpId);
DeepSearch::indexChannelGroup(cg); DeepSearch::indexChannelGroup(cg);
} }
else else
@ -311,12 +313,15 @@ bool RsGxsIntegrityCheck::check()
if (nxsMsgIt == nxsMsgV.end()) if (nxsMsgIt == nxsMsgV.end())
{ {
msgsToDel[grpId].insert(msgId); msgsToDel[grpId].insert(msgId);
#ifdef RS_DEEP_SEARCH
if(isGxsChannels)
DeepSearch::removeChannelPostFromIndex(grpId, msgId);
#endif
} }
} }
} }
GxsMsgResult::iterator mit = msgs.begin(); GxsMsgResult::iterator mit = msgs.begin();
for(; mit != msgs.end(); ++mit) for(; mit != msgs.end(); ++mit)
{ {
std::vector<RsNxsMsg*>& msgV = mit->second; std::vector<RsNxsMsg*>& msgV = mit->second;
@ -332,10 +337,56 @@ bool RsGxsIntegrityCheck::check()
if(msg->metaData == NULL || currHash != msg->metaData->mHash) if(msg->metaData == NULL || currHash != msg->metaData->mHash)
{ {
std::cerr << "(EE) deleting message data with wrong hash or null meta data. meta=" << (void*)msg->metaData << std::endl; std::cerr << __PRETTY_FUNCTION__ <<" (EE) deleting message data"
<< " with wrong hash or null meta data. meta="
<< (void*)msg->metaData << std::endl;
msgsToDel[msg->grpId].insert(msg->msgId); msgsToDel[msg->grpId].insert(msg->msgId);
#ifdef RS_DEEP_SEARCH
if(isGxsChannels)
DeepSearch::removeChannelPostFromIndex(msg->grpId, msg->msgId);
#endif
} }
else if(!msg->metaData->mAuthorId.isNull() && subscribed_groups.find(msg->metaData->mGroupId)!=subscribed_groups.end()) else if (subscribed_groups.count(msg->metaData->mGroupId))
{
#ifdef RS_DEEP_SEARCH
if( isGxsChannels
&& indexedGroups.count(msg->metaData->mGroupId) )
{
RsGxsMsgMetaData meta;
meta.deserialise(msg->meta.bin_data, &msg->meta.bin_len);
uint32_t blz = msg->msg.bin_len;
RsItem* rIt = mSerializer.deserialise(msg->msg.bin_data,
&blz);
if( RsGxsChannelPostItem* cgIt =
dynamic_cast<RsGxsChannelPostItem*>(rIt) )
{
RsGxsChannelPost cg;
cgIt->toChannelPost(cg, false);
cg.mMeta = meta;
DeepSearch::indexChannelPost(cg);
}
else if(dynamic_cast<RsGxsCommentItem*>(rIt)) {}
else if(dynamic_cast<RsGxsVoteItem*>(rIt)) {}
else
{
std::cerr << __PRETTY_FUNCTION__ << " Message: "
<< meta.mMsgId.toStdString()
<< " in group: "
<< meta.mGroupId.toStdString() << " "
<< " doesn't seems a channel post, please "
<< "report to developers"
<< std::endl;
print_stacktrace();
}
delete rIt;
}
#endif
if(!msg->metaData->mAuthorId.isNull())
{ {
#ifdef DEBUG_GXSUTIL #ifdef DEBUG_GXSUTIL
GXSUTIL_DEBUG() << "TimeStamping message authors' key ID " << msg->metaData->mAuthorId << " in message " << msg->msgId << ", group ID " << msg->grpId<< std::endl; GXSUTIL_DEBUG() << "TimeStamping message authors' key ID " << msg->metaData->mAuthorId << " in message " << msg->msgId << ", group ID " << msg->grpId<< std::endl;
@ -343,19 +394,16 @@ bool RsGxsIntegrityCheck::check()
if(rsReputations!=NULL && rsReputations->overallReputationLevel(msg->metaData->mAuthorId) > RsReputations::REPUTATION_LOCALLY_NEGATIVE) if(rsReputations!=NULL && rsReputations->overallReputationLevel(msg->metaData->mAuthorId) > RsReputations::REPUTATION_LOCALLY_NEGATIVE)
used_gxs_ids.insert(std::make_pair(msg->metaData->mAuthorId,RsIdentityUsage(mGenExchangeClient->serviceType(),RsIdentityUsage::MESSAGE_AUTHOR_KEEP_ALIVE,msg->metaData->mGroupId,msg->metaData->mMsgId))) ; used_gxs_ids.insert(std::make_pair(msg->metaData->mAuthorId,RsIdentityUsage(mGenExchangeClient->serviceType(),RsIdentityUsage::MESSAGE_AUTHOR_KEEP_ALIVE,msg->metaData->mGroupId,msg->metaData->mMsgId))) ;
} }
}
delete msg; delete msg;
} }
} }
#ifdef RS_DEEP_SEARCH
// TODO:remove msgsToDel from deep search index too
#endif
mDs->removeMsgs(msgsToDel); mDs->removeMsgs(msgsToDel);
{ {
RsStackMutex stack(mIntegrityMutex); RS_STACK_MUTEX(mIntegrityMutex);
std::vector<RsGxsGroupId>::iterator grpIt; std::vector<RsGxsGroupId>::iterator grpIt;
for(grpIt = grpsToDel.begin(); grpIt != grpsToDel.end(); ++grpIt) for(grpIt = grpsToDel.begin(); grpIt != grpsToDel.end(); ++grpIt)