mirror of
https://github.com/RetroShare/RetroShare.git
synced 2024-12-28 00:49:28 -05:00
DeepSearch index channels posts too
Improve indexing using RsUrl, store some relevant fields in stored url
This commit is contained in:
parent
32014eaac1
commit
d3e5b760a2
@ -17,17 +17,18 @@
|
|||||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include <ctime>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <xapian.h>
|
#include <xapian.h>
|
||||||
|
|
||||||
#include "retroshare/rsgxschannels.h"
|
#include "retroshare/rsgxschannels.h"
|
||||||
#include "retroshare/rsinit.h"
|
#include "retroshare/rsinit.h"
|
||||||
|
#include "util/rsurl.h"
|
||||||
|
|
||||||
struct DeepSearch
|
struct DeepSearch
|
||||||
{
|
{
|
||||||
struct SearchResult
|
struct SearchResult
|
||||||
{
|
{
|
||||||
// TODO: Use RsUrl from extra_locators branch instead of plain string
|
|
||||||
std::string mUrl;
|
std::string mUrl;
|
||||||
std::string mSnippet;
|
std::string mSnippet;
|
||||||
};
|
};
|
||||||
@ -90,6 +91,11 @@ struct DeepSearch
|
|||||||
|
|
||||||
// Index each field with a suitable prefix.
|
// Index each field with a suitable prefix.
|
||||||
termgenerator.index_text(chan.mMeta.mGroupName, 1, "G");
|
termgenerator.index_text(chan.mMeta.mGroupName, 1, "G");
|
||||||
|
|
||||||
|
char date[] = "YYYYMMDD\0";
|
||||||
|
std::strftime(date, 9, "%Y%m%d", std::gmtime(&chan.mMeta.mPublishTs));
|
||||||
|
termgenerator.index_text(date, 1, "D");
|
||||||
|
|
||||||
termgenerator.index_text(chan.mDescription, 1, "XD");
|
termgenerator.index_text(chan.mDescription, 1, "XD");
|
||||||
|
|
||||||
// Index fields without prefixes for general search.
|
// Index fields without prefixes for general search.
|
||||||
@ -97,8 +103,14 @@ struct DeepSearch
|
|||||||
termgenerator.increase_termpos();
|
termgenerator.increase_termpos();
|
||||||
termgenerator.index_text(chan.mDescription);
|
termgenerator.index_text(chan.mDescription);
|
||||||
|
|
||||||
std::string rsLink("retroshare://channel?id=");
|
RsUrl chanUrl; chanUrl
|
||||||
rsLink += chan.mMeta.mGroupId.toStdString();
|
.setScheme("retroshare").setPath("/channel")
|
||||||
|
.setQueryKV("id", chan.mMeta.mGroupId.toStdString());
|
||||||
|
const std::string idTerm("Q" + chanUrl.toString());
|
||||||
|
|
||||||
|
chanUrl.setQueryKV("publishDate", date);
|
||||||
|
chanUrl.setQueryKV("name", chan.mMeta.mGroupName);
|
||||||
|
std::string rsLink(chanUrl.toString());
|
||||||
|
|
||||||
// store the RS link so we are able to retrieve it on matching search
|
// store the RS link so we are able to retrieve it on matching search
|
||||||
doc.add_value(URL_VALUENO, rsLink);
|
doc.add_value(URL_VALUENO, rsLink);
|
||||||
@ -109,7 +121,6 @@ struct DeepSearch
|
|||||||
// We use the identifier to ensure each object ends up in the
|
// We use the identifier to ensure each object ends up in the
|
||||||
// database only once no matter how many times we run the
|
// database only once no matter how many times we run the
|
||||||
// indexer. "Q" prefix is a Xapian convention for unique id term.
|
// indexer. "Q" prefix is a Xapian convention for unique id term.
|
||||||
const std::string idTerm("Q" + rsLink);
|
|
||||||
doc.add_boolean_term(idTerm);
|
doc.add_boolean_term(idTerm);
|
||||||
db.replace_document(idTerm, doc);
|
db.replace_document(idTerm, doc);
|
||||||
}
|
}
|
||||||
@ -117,8 +128,10 @@ struct DeepSearch
|
|||||||
static void removeChannelFromIndex(RsGxsGroupId grpId)
|
static void removeChannelFromIndex(RsGxsGroupId grpId)
|
||||||
{
|
{
|
||||||
// "Q" prefix is a Xapian convention for unique id term.
|
// "Q" prefix is a Xapian convention for unique id term.
|
||||||
std::string idTerm("Qretroshare://channel?id=");
|
RsUrl chanUrl; chanUrl
|
||||||
idTerm += grpId.toStdString();
|
.setScheme("retroshare").setPath("/channel")
|
||||||
|
.setQueryKV("id", grpId.toStdString());
|
||||||
|
std::string idTerm("Q" + chanUrl.toString());
|
||||||
|
|
||||||
Xapian::WritableDatabase db(dbPath(), Xapian::DB_CREATE_OR_OPEN);
|
Xapian::WritableDatabase db(dbPath(), Xapian::DB_CREATE_OR_OPEN);
|
||||||
db.delete_document(idTerm);
|
db.delete_document(idTerm);
|
||||||
@ -138,24 +151,72 @@ struct DeepSearch
|
|||||||
|
|
||||||
// Index each field with a suitable prefix.
|
// Index each field with a suitable prefix.
|
||||||
termgenerator.index_text(post.mMeta.mMsgName, 1, "S");
|
termgenerator.index_text(post.mMeta.mMsgName, 1, "S");
|
||||||
termgenerator.index_text(post.mMsg, 1, "XD");
|
|
||||||
|
char date[] = "YYYYMMDD\0";
|
||||||
|
std::strftime(date, 9, "%Y%m%d", std::gmtime(&post.mMeta.mPublishTs));
|
||||||
|
termgenerator.index_text(date, 1, "D");
|
||||||
|
|
||||||
|
// Avoid indexing HTML
|
||||||
|
bool isPlainMsg = post.mMsg[0] != '<' || post.mMsg[post.mMsg.size() - 1] != '>';
|
||||||
|
|
||||||
|
if(isPlainMsg)
|
||||||
|
termgenerator.index_text(post.mMsg, 1, "XD");
|
||||||
|
|
||||||
// Index fields without prefixes for general search.
|
// Index fields without prefixes for general search.
|
||||||
termgenerator.index_text(post.mMeta.mMsgName);
|
termgenerator.index_text(post.mMeta.mMsgName);
|
||||||
termgenerator.increase_termpos();
|
if(isPlainMsg)
|
||||||
termgenerator.index_text(post.mMsg);
|
{
|
||||||
|
termgenerator.increase_termpos();
|
||||||
|
termgenerator.index_text(post.mMsg);
|
||||||
|
}
|
||||||
|
|
||||||
|
for(const RsGxsFile& attachment : post.mFiles)
|
||||||
|
{
|
||||||
|
termgenerator.index_text(attachment.mName, 1, "F");
|
||||||
|
|
||||||
|
termgenerator.increase_termpos();
|
||||||
|
termgenerator.index_text(attachment.mName);
|
||||||
|
}
|
||||||
|
|
||||||
// We use the identifier to ensure each object ends up in the
|
// We use the identifier to ensure each object ends up in the
|
||||||
// database only once no matter how many times we run the
|
// database only once no matter how many times we run the
|
||||||
// indexer.
|
// indexer.
|
||||||
std::string idTerm("Qretroshare://channel?id=");
|
RsUrl postUrl; postUrl
|
||||||
idTerm += post.mMeta.mGroupId.toStdString();
|
.setScheme("retroshare").setPath("/channel")
|
||||||
idTerm += "&msgid=";
|
.setQueryKV("id", post.mMeta.mGroupId.toStdString())
|
||||||
idTerm += post.mMeta.mMsgId.toStdString();
|
.setQueryKV("msgid", post.mMeta.mMsgId.toStdString());
|
||||||
|
std::string idTerm("Q" + postUrl.toString());
|
||||||
|
|
||||||
|
postUrl.setQueryKV("publishDate", date);
|
||||||
|
postUrl.setQueryKV("name", post.mMeta.mMsgName);
|
||||||
|
std::string rsLink(postUrl.toString());
|
||||||
|
|
||||||
|
// store the RS link so we are able to retrieve it on matching search
|
||||||
|
doc.add_value(URL_VALUENO, rsLink);
|
||||||
|
|
||||||
|
// Store some fields for display purposes.
|
||||||
|
if(isPlainMsg)
|
||||||
|
doc.set_data(post.mMeta.mMsgName + "\n" + post.mMsg);
|
||||||
|
else doc.set_data(post.mMeta.mMsgName);
|
||||||
|
|
||||||
doc.add_boolean_term(idTerm);
|
doc.add_boolean_term(idTerm);
|
||||||
db.replace_document(idTerm, doc);
|
db.replace_document(idTerm, doc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Remove a single channel post from the deep search index.
// grpId is the id of the channel (group) and msgId the id of the post; both
// are turned into strings and used as the "id" and "msgid" query fields of a
// retroshare://channel URL that identifies the document to delete.
static void removeChannelPostFromIndex(
|
||||||
|
RsGxsGroupId grpId, RsGxsMessageId msgId )
|
||||||
|
{
|
||||||
|
// Build the URL with only "id" and "msgid": indexChannelPost derives its
// unique id term from the URL in this same state, before it appends the
// "publishDate" and "name" fields, so both sides must agree on this form.
RsUrl postUrl; postUrl
|
||||||
|
.setScheme("retroshare").setPath("/channel")
|
||||||
|
.setQueryKV("id", grpId.toStdString())
|
||||||
|
.setQueryKV("msgid", msgId.toStdString());
|
||||||
|
// "Q" prefix is a Xapian convention for unique id term.
|
||||||
|
std::string idTerm("Q" + postUrl.toString());
|
||||||
|
|
||||||
|
// The database at dbPath() is opened for writing with DB_CREATE_OR_OPEN.
Xapian::WritableDatabase db(dbPath(), Xapian::DB_CREATE_OR_OPEN);
|
||||||
|
// Delete by unique term rather than by document id.
db.delete_document(idTerm);
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
enum : Xapian::valueno
|
enum : Xapian::valueno
|
||||||
|
@ -166,6 +166,7 @@ bool RsGxsIntegrityCheck::check()
|
|||||||
{
|
{
|
||||||
#ifdef RS_DEEP_SEARCH
|
#ifdef RS_DEEP_SEARCH
|
||||||
bool isGxsChannels = dynamic_cast<p3GxsChannels*>(mGenExchangeClient);
|
bool isGxsChannels = dynamic_cast<p3GxsChannels*>(mGenExchangeClient);
|
||||||
|
std::set<RsGxsGroupId> indexedGroups;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// first take out all the groups
|
// first take out all the groups
|
||||||
@ -232,6 +233,7 @@ bool RsGxsIntegrityCheck::check()
|
|||||||
cgIt->toChannelGroup(cg, false);
|
cgIt->toChannelGroup(cg, false);
|
||||||
cg.mMeta = meta;
|
cg.mMeta = meta;
|
||||||
|
|
||||||
|
indexedGroups.insert(grp->grpId);
|
||||||
DeepSearch::indexChannelGroup(cg);
|
DeepSearch::indexChannelGroup(cg);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -309,53 +311,99 @@ bool RsGxsIntegrityCheck::check()
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (nxsMsgIt == nxsMsgV.end())
|
if (nxsMsgIt == nxsMsgV.end())
|
||||||
{
|
{
|
||||||
msgsToDel[grpId].insert(msgId);
|
msgsToDel[grpId].insert(msgId);
|
||||||
|
#ifdef RS_DEEP_SEARCH
|
||||||
|
if(isGxsChannels)
|
||||||
|
DeepSearch::removeChannelPostFromIndex(grpId, msgId);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
GxsMsgResult::iterator mit = msgs.begin();
|
GxsMsgResult::iterator mit = msgs.begin();
|
||||||
|
for(; mit != msgs.end(); ++mit)
|
||||||
|
{
|
||||||
|
std::vector<RsNxsMsg*>& msgV = mit->second;
|
||||||
|
std::vector<RsNxsMsg*>::iterator vit = msgV.begin();
|
||||||
|
|
||||||
for(; mit != msgs.end(); ++mit)
|
for(; vit != msgV.end(); ++vit)
|
||||||
{
|
{
|
||||||
std::vector<RsNxsMsg*>& msgV = mit->second;
|
RsNxsMsg* msg = *vit;
|
||||||
std::vector<RsNxsMsg*>::iterator vit = msgV.begin();
|
RsFileHash currHash;
|
||||||
|
pqihash pHash;
|
||||||
|
pHash.addData(msg->msg.bin_data, msg->msg.bin_len);
|
||||||
|
pHash.Complete(currHash);
|
||||||
|
|
||||||
for(; vit != msgV.end(); ++vit)
|
if(msg->metaData == NULL || currHash != msg->metaData->mHash)
|
||||||
{
|
{
|
||||||
RsNxsMsg* msg = *vit;
|
std::cerr << __PRETTY_FUNCTION__ <<" (EE) deleting message data"
|
||||||
RsFileHash currHash;
|
<< " with wrong hash or null meta data. meta="
|
||||||
pqihash pHash;
|
<< (void*)msg->metaData << std::endl;
|
||||||
pHash.addData(msg->msg.bin_data, msg->msg.bin_len);
|
msgsToDel[msg->grpId].insert(msg->msgId);
|
||||||
pHash.Complete(currHash);
|
#ifdef RS_DEEP_SEARCH
|
||||||
|
if(isGxsChannels)
|
||||||
if(msg->metaData == NULL || currHash != msg->metaData->mHash)
|
DeepSearch::removeChannelPostFromIndex(msg->grpId, msg->msgId);
|
||||||
{
|
|
||||||
std::cerr << "(EE) deleting message data with wrong hash or null meta data. meta=" << (void*)msg->metaData << std::endl;
|
|
||||||
msgsToDel[msg->grpId].insert(msg->msgId);
|
|
||||||
}
|
|
||||||
else if(!msg->metaData->mAuthorId.isNull() && subscribed_groups.find(msg->metaData->mGroupId)!=subscribed_groups.end())
|
|
||||||
{
|
|
||||||
#ifdef DEBUG_GXSUTIL
|
|
||||||
GXSUTIL_DEBUG() << "TimeStamping message authors' key ID " << msg->metaData->mAuthorId << " in message " << msg->msgId << ", group ID " << msg->grpId<< std::endl;
|
|
||||||
#endif
|
#endif
|
||||||
if(rsReputations!=NULL && rsReputations->overallReputationLevel(msg->metaData->mAuthorId) > RsReputations::REPUTATION_LOCALLY_NEGATIVE)
|
}
|
||||||
used_gxs_ids.insert(std::make_pair(msg->metaData->mAuthorId,RsIdentityUsage(mGenExchangeClient->serviceType(),RsIdentityUsage::MESSAGE_AUTHOR_KEEP_ALIVE,msg->metaData->mGroupId,msg->metaData->mMsgId))) ;
|
else if (subscribed_groups.count(msg->metaData->mGroupId))
|
||||||
}
|
{
|
||||||
|
#ifdef RS_DEEP_SEARCH
|
||||||
|
if( isGxsChannels
|
||||||
|
&& indexedGroups.count(msg->metaData->mGroupId) )
|
||||||
|
{
|
||||||
|
RsGxsMsgMetaData meta;
|
||||||
|
meta.deserialise(msg->meta.bin_data, &msg->meta.bin_len);
|
||||||
|
|
||||||
|
uint32_t blz = msg->msg.bin_len;
|
||||||
|
RsItem* rIt = mSerializer.deserialise(msg->msg.bin_data,
|
||||||
|
&blz);
|
||||||
|
|
||||||
|
if( RsGxsChannelPostItem* cgIt =
|
||||||
|
dynamic_cast<RsGxsChannelPostItem*>(rIt) )
|
||||||
|
{
|
||||||
|
RsGxsChannelPost cg;
|
||||||
|
cgIt->toChannelPost(cg, false);
|
||||||
|
cg.mMeta = meta;
|
||||||
|
|
||||||
|
DeepSearch::indexChannelPost(cg);
|
||||||
|
}
|
||||||
|
else if(dynamic_cast<RsGxsCommentItem*>(rIt)) {}
|
||||||
|
else if(dynamic_cast<RsGxsVoteItem*>(rIt)) {}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
std::cerr << __PRETTY_FUNCTION__ << " Message: "
|
||||||
|
<< meta.mMsgId.toStdString()
|
||||||
|
<< " in group: "
|
||||||
|
<< meta.mGroupId.toStdString() << " "
|
||||||
|
<< " doesn't seems a channel post, please "
|
||||||
|
<< "report to developers"
|
||||||
|
<< std::endl;
|
||||||
|
print_stacktrace();
|
||||||
|
}
|
||||||
|
|
||||||
|
delete rIt;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if(!msg->metaData->mAuthorId.isNull())
|
||||||
|
{
|
||||||
|
#ifdef DEBUG_GXSUTIL
|
||||||
|
GXSUTIL_DEBUG() << "TimeStamping message authors' key ID " << msg->metaData->mAuthorId << " in message " << msg->msgId << ", group ID " << msg->grpId<< std::endl;
|
||||||
|
#endif
|
||||||
|
if(rsReputations!=NULL && rsReputations->overallReputationLevel(msg->metaData->mAuthorId) > RsReputations::REPUTATION_LOCALLY_NEGATIVE)
|
||||||
|
used_gxs_ids.insert(std::make_pair(msg->metaData->mAuthorId,RsIdentityUsage(mGenExchangeClient->serviceType(),RsIdentityUsage::MESSAGE_AUTHOR_KEEP_ALIVE,msg->metaData->mGroupId,msg->metaData->mMsgId))) ;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
delete msg;
|
delete msg;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef RS_DEEP_SEARCH
|
|
||||||
// TODO:remove msgsToDel from deep search index too
|
|
||||||
#endif
|
|
||||||
|
|
||||||
mDs->removeMsgs(msgsToDel);
|
mDs->removeMsgs(msgsToDel);
|
||||||
|
|
||||||
{
|
{
|
||||||
RsStackMutex stack(mIntegrityMutex);
|
RS_STACK_MUTEX(mIntegrityMutex);
|
||||||
|
|
||||||
std::vector<RsGxsGroupId>::iterator grpIt;
|
std::vector<RsGxsGroupId>::iterator grpIt;
|
||||||
for(grpIt = grpsToDel.begin(); grpIt != grpsToDel.end(); ++grpIt)
|
for(grpIt = grpsToDel.begin(); grpIt != grpsToDel.end(); ++grpIt)
|
||||||
|
Loading…
Reference in New Issue
Block a user