From d3e5b760a2e8bf663bd002d85e5e52fed2fd46ec Mon Sep 17 00:00:00 2001 From: Gioacchino Mazzurco Date: Wed, 4 Jul 2018 12:08:50 +0200 Subject: [PATCH] DeepSearch index channels posts too Improve indexing using RsUrl, store some relevant fields in stored url --- libretroshare/src/deep_search/deep_search.h | 87 ++++++++++++--- libretroshare/src/gxs/rsgxsutil.cc | 112 ++++++++++++++------ 2 files changed, 154 insertions(+), 45 deletions(-) diff --git a/libretroshare/src/deep_search/deep_search.h b/libretroshare/src/deep_search/deep_search.h index 3d916a62a..7152e34ce 100644 --- a/libretroshare/src/deep_search/deep_search.h +++ b/libretroshare/src/deep_search/deep_search.h @@ -17,17 +17,18 @@ * along with this program. If not, see . */ +#include #include #include #include "retroshare/rsgxschannels.h" #include "retroshare/rsinit.h" +#include "util/rsurl.h" struct DeepSearch { struct SearchResult { - // TODO: Use RsUrl from extra_locators branch instead of plain string std::string mUrl; std::string mSnippet; }; @@ -90,6 +91,11 @@ struct DeepSearch // Index each field with a suitable prefix. termgenerator.index_text(chan.mMeta.mGroupName, 1, "G"); + + char date[] = "YYYYMMDD\0"; + std::strftime(date, 9, "%Y%m%d", std::gmtime(&chan.mMeta.mPublishTs)); + termgenerator.index_text(date, 1, "D"); + termgenerator.index_text(chan.mDescription, 1, "XD"); // Index fields without prefixes for general search. 
@@ -97,8 +103,14 @@ struct DeepSearch termgenerator.increase_termpos(); termgenerator.index_text(chan.mDescription); - std::string rsLink("retroshare://channel?id="); - rsLink += chan.mMeta.mGroupId.toStdString(); + RsUrl chanUrl; chanUrl + .setScheme("retroshare").setPath("/channel") + .setQueryKV("id", chan.mMeta.mGroupId.toStdString()); + const std::string idTerm("Q" + chanUrl.toString()); + + chanUrl.setQueryKV("publishDate", date); + chanUrl.setQueryKV("name", chan.mMeta.mGroupName); + std::string rsLink(chanUrl.toString()); // store the RS link so we are able to retrive it on matching search doc.add_value(URL_VALUENO, rsLink); @@ -109,7 +121,6 @@ struct DeepSearch // We use the identifier to ensure each object ends up in the // database only once no matter how many times we run the // indexer. "Q" prefix is a Xapian convention for unique id term. - const std::string idTerm("Q" + rsLink); doc.add_boolean_term(idTerm); db.replace_document(idTerm, doc); } @@ -117,8 +128,10 @@ struct DeepSearch static void removeChannelFromIndex(RsGxsGroupId grpId) { // "Q" prefix is a Xapian convention for unique id term. - std::string idTerm("Qretroshare://channel?id="); - idTerm += grpId.toStdString(); + RsUrl chanUrl; chanUrl + .setScheme("retroshare").setPath("/channel") + .setQueryKV("id", grpId.toStdString()); + std::string idTerm("Q" + chanUrl.toString()); Xapian::WritableDatabase db(dbPath(), Xapian::DB_CREATE_OR_OPEN); db.delete_document(idTerm); @@ -138,24 +151,72 @@ struct DeepSearch // Index each field with a suitable prefix. 
termgenerator.index_text(post.mMeta.mMsgName, 1, "S"); - termgenerator.index_text(post.mMsg, 1, "XD"); + + char date[] = "YYYYMMDD\0"; + std::strftime(date, 9, "%Y%m%d", std::gmtime(&post.mMeta.mPublishTs)); + termgenerator.index_text(date, 1, "D"); + + // Avoid indexing HTML + bool isPlainMsg = post.mMsg[0] != '<' || post.mMsg[post.mMsg.size() - 1] != '>'; + + if(isPlainMsg) + termgenerator.index_text(post.mMsg, 1, "XD"); // Index fields without prefixes for general search. termgenerator.index_text(post.mMeta.mMsgName); - termgenerator.increase_termpos(); - termgenerator.index_text(post.mMsg); + if(isPlainMsg) + { + termgenerator.increase_termpos(); + termgenerator.index_text(post.mMsg); + } + + for(const RsGxsFile& attachment : post.mFiles) + { + termgenerator.index_text(attachment.mName, 1, "F"); + + termgenerator.increase_termpos(); + termgenerator.index_text(attachment.mName); + } // We use the identifier to ensure each object ends up in the // database only once no matter how many times we run the // indexer. - std::string idTerm("Qretroshare://channel?id="); - idTerm += post.mMeta.mGroupId.toStdString(); - idTerm += "&msgid="; - idTerm += post.mMeta.mMsgId.toStdString(); + RsUrl postUrl; postUrl + .setScheme("retroshare").setPath("/channel") + .setQueryKV("id", post.mMeta.mGroupId.toStdString()) + .setQueryKV("msgid", post.mMeta.mMsgId.toStdString()); + std::string idTerm("Q" + postUrl.toString()); + + postUrl.setQueryKV("publishDate", date); + postUrl.setQueryKV("name", post.mMeta.mMsgName); + std::string rsLink(postUrl.toString()); + + // store the RS link so we are able to retrieve it on matching search + doc.add_value(URL_VALUENO, rsLink); + + // Store some fields for display purposes. 
+ if(isPlainMsg) + doc.set_data(post.mMeta.mMsgName + "\n" + post.mMsg); + else doc.set_data(post.mMeta.mMsgName); + doc.add_boolean_term(idTerm); db.replace_document(idTerm, doc); } + static void removeChannelPostFromIndex( + RsGxsGroupId grpId, RsGxsMessageId msgId ) + { + RsUrl postUrl; postUrl + .setScheme("retroshare").setPath("/channel") + .setQueryKV("id", grpId.toStdString()) + .setQueryKV("msgid", msgId.toStdString()); + // "Q" prefix is a Xapian convention for unique id term. + std::string idTerm("Q" + postUrl.toString()); + + Xapian::WritableDatabase db(dbPath(), Xapian::DB_CREATE_OR_OPEN); + db.delete_document(idTerm); + } + private: enum : Xapian::valueno diff --git a/libretroshare/src/gxs/rsgxsutil.cc b/libretroshare/src/gxs/rsgxsutil.cc index 60106b411..75b43da83 100644 --- a/libretroshare/src/gxs/rsgxsutil.cc +++ b/libretroshare/src/gxs/rsgxsutil.cc @@ -166,6 +166,7 @@ bool RsGxsIntegrityCheck::check() { #ifdef RS_DEEP_SEARCH bool isGxsChannels = dynamic_cast(mGenExchangeClient); + std::set indexedGroups; #endif // first take out all the groups @@ -232,6 +233,7 @@ bool RsGxsIntegrityCheck::check() cgIt->toChannelGroup(cg, false); cg.mMeta = meta; + indexedGroups.insert(grp->grpId); DeepSearch::indexChannelGroup(cg); } else @@ -309,53 +311,99 @@ bool RsGxsIntegrityCheck::check() } if (nxsMsgIt == nxsMsgV.end()) - { - msgsToDel[grpId].insert(msgId); + { + msgsToDel[grpId].insert(msgId); +#ifdef RS_DEEP_SEARCH + if(isGxsChannels) + DeepSearch::removeChannelPostFromIndex(grpId, msgId); +#endif } } } - GxsMsgResult::iterator mit = msgs.begin(); + GxsMsgResult::iterator mit = msgs.begin(); + for(; mit != msgs.end(); ++mit) + { + std::vector& msgV = mit->second; + std::vector::iterator vit = msgV.begin(); - for(; mit != msgs.end(); ++mit) - { - std::vector& msgV = mit->second; - std::vector::iterator vit = msgV.begin(); + for(; vit != msgV.end(); ++vit) + { + RsNxsMsg* msg = *vit; + RsFileHash currHash; + pqihash pHash; + pHash.addData(msg->msg.bin_data, 
msg->msg.bin_len); + pHash.Complete(currHash); - for(; vit != msgV.end(); ++vit) - { - RsNxsMsg* msg = *vit; - RsFileHash currHash; - pqihash pHash; - pHash.addData(msg->msg.bin_data, msg->msg.bin_len); - pHash.Complete(currHash); - - if(msg->metaData == NULL || currHash != msg->metaData->mHash) - { - std::cerr << "(EE) deleting message data with wrong hash or null meta data. meta=" << (void*)msg->metaData << std::endl; - msgsToDel[msg->grpId].insert(msg->msgId); - } - else if(!msg->metaData->mAuthorId.isNull() && subscribed_groups.find(msg->metaData->mGroupId)!=subscribed_groups.end()) - { -#ifdef DEBUG_GXSUTIL - GXSUTIL_DEBUG() << "TimeStamping message authors' key ID " << msg->metaData->mAuthorId << " in message " << msg->msgId << ", group ID " << msg->grpId<< std::endl; + if(msg->metaData == NULL || currHash != msg->metaData->mHash) + { + std::cerr << __PRETTY_FUNCTION__ <<" (EE) deleting message data" + << " with wrong hash or null meta data. meta=" + << (void*)msg->metaData << std::endl; + msgsToDel[msg->grpId].insert(msg->msgId); +#ifdef RS_DEEP_SEARCH + if(isGxsChannels) + DeepSearch::removeChannelPostFromIndex(msg->grpId, msg->msgId); #endif - if(rsReputations!=NULL && rsReputations->overallReputationLevel(msg->metaData->mAuthorId) > RsReputations::REPUTATION_LOCALLY_NEGATIVE) - used_gxs_ids.insert(std::make_pair(msg->metaData->mAuthorId,RsIdentityUsage(mGenExchangeClient->serviceType(),RsIdentityUsage::MESSAGE_AUTHOR_KEEP_ALIVE,msg->metaData->mGroupId,msg->metaData->mMsgId))) ; - } + } + else if (subscribed_groups.count(msg->metaData->mGroupId)) + { +#ifdef RS_DEEP_SEARCH + if( isGxsChannels + && indexedGroups.count(msg->metaData->mGroupId) ) + { + RsGxsMsgMetaData meta; + meta.deserialise(msg->meta.bin_data, &msg->meta.bin_len); + + uint32_t blz = msg->msg.bin_len; + RsItem* rIt = mSerializer.deserialise(msg->msg.bin_data, + &blz); + + if( RsGxsChannelPostItem* cgIt = + dynamic_cast(rIt) ) + { + RsGxsChannelPost cg; + cgIt->toChannelPost(cg, false); + 
cg.mMeta = meta; + + DeepSearch::indexChannelPost(cg); + } + else if(dynamic_cast(rIt)) {} + else if(dynamic_cast(rIt)) {} + else + { + std::cerr << __PRETTY_FUNCTION__ << " Message: " + << meta.mMsgId.toStdString() + << " in group: " + << meta.mGroupId.toStdString() << " " + << " doesn't seems a channel post, please " + << "report to developers" + << std::endl; + print_stacktrace(); + } + + delete rIt; + } +#endif + + if(!msg->metaData->mAuthorId.isNull()) + { +#ifdef DEBUG_GXSUTIL + GXSUTIL_DEBUG() << "TimeStamping message authors' key ID " << msg->metaData->mAuthorId << " in message " << msg->msgId << ", group ID " << msg->grpId<< std::endl; +#endif + if(rsReputations!=NULL && rsReputations->overallReputationLevel(msg->metaData->mAuthorId) > RsReputations::REPUTATION_LOCALLY_NEGATIVE) + used_gxs_ids.insert(std::make_pair(msg->metaData->mAuthorId,RsIdentityUsage(mGenExchangeClient->serviceType(),RsIdentityUsage::MESSAGE_AUTHOR_KEEP_ALIVE,msg->metaData->mGroupId,msg->metaData->mMsgId))) ; + } + } delete msg; } } -#ifdef RS_DEEP_SEARCH - // TODO:remove msgsToDel from deep search index too -#endif - mDs->removeMsgs(msgsToDel); { - RsStackMutex stack(mIntegrityMutex); + RS_STACK_MUTEX(mIntegrityMutex); std::vector::iterator grpIt; for(grpIt = grpsToDel.begin(); grpIt != grpsToDel.end(); ++grpIt)