Implement deep indexing for files through Xapian

At the moment it supports extracting metadata only from OGG files.
The system has been designed to be easily extensible to more file formats
  by registering more indexer functions, which just need to extract metadata
  from a certain type of file and feed it to Xapian.
The system has been integrated into the existing file search system
  through generic search requests and results; it keeps a good level of
  backward compatibility thanks to some tricks.
The indexing system is released under AGPLv3, so when libretroshare is compiled
  with deep search enabled AGPLv3 must be honored instead of LGPLv3-or-later.
Cleaned up the debian copyright file using non-deprecated license
  code-names.
This commit is contained in:
Gioacchino Mazzurco 2019-06-20 17:24:18 +02:00
parent d46e3eb2b7
commit 3a26ccf6a5
No known key found for this signature in database
GPG key ID: A1FBCA3872E87051
25 changed files with 1364 additions and 438 deletions

View file

@ -257,8 +257,8 @@
#include "util/rsmemory.h"
#include "util/stacktrace.h"
#ifdef RS_DEEP_SEARCH
# include "deep_search/deep_search.h"
#ifdef RS_DEEP_CHANNEL_INDEX
# include "deep_search/channelsindex.hpp"
#endif
/***
@ -5148,13 +5148,13 @@ TurtleRequestId RsGxsNetService::turtleSearchRequest(const std::string& match_st
return mGxsNetTunnel->turtleSearchRequest(match_string,this) ;
}
#ifndef RS_DEEP_SEARCH
#ifndef RS_DEEP_CHANNEL_INDEX
/**
 * Tell whether @p substring occurs somewhere inside @p src, comparing
 * characters with RsRegularExpression::CompareCharIC so that letter case is
 * ignored.
 * @param src string to look into
 * @param substring sequence of characters to look for
 * @return true if std::search finds a match before the end of @p src,
 *         false otherwise
 */
static bool termSearch(const std::string& src, const std::string& substring)
{
	const auto matchPos = std::search(
	            src.begin(), src.end(),
	            substring.begin(), substring.end(),
	            RsRegularExpression::CompareCharIC() );

	return matchPos != src.end();
}
#endif // ndef RS_DEEP_SEARCH
#endif // ndef RS_DEEP_CHANNEL_INDEX
bool RsGxsNetService::retrieveDistantSearchResults(TurtleRequestId req,std::map<RsGxsGroupId,RsGxsGroupSummary>& group_infos)
{
@ -5209,11 +5209,11 @@ void RsGxsNetService::receiveTurtleSearchResults(
for (const RsGxsGroupSummary& gps : group_infos)
{
#ifndef RS_DEEP_SEARCH
#ifndef RS_DEEP_CHANNEL_INDEX
/* Only keep groups that are not locally known, and groups that are
* not already in the mDistantSearchResults structure. */
if(grpMeta[gps.mGroupId]) continue;
#else // ndef RS_DEEP_SEARCH
#else // ndef RS_DEEP_CHANNEL_INDEX
/* When deep search is enabled, search results may bring more info
* than we already have, also about posts that are indexed by Xapian,
* so we don't apply this filter in this case. */
@ -5302,9 +5302,9 @@ bool RsGxsNetService::search( const std::string& substring,
{
group_infos.clear();
#ifdef RS_DEEP_SEARCH
std::vector<DeepSearch::SearchResult> results;
DeepSearch::search(substring, results);
#ifdef RS_DEEP_CHANNEL_INDEX
std::vector<DeepChannelsSearchResult> results;
DeepChannelsIndex::search(substring, results);
for(auto dsr : results)
{
@ -5324,7 +5324,7 @@ bool RsGxsNetService::search( const std::string& substring,
if((rit = uQ.find("name")) != uQ.end())
s.mGroupName = rit->second;
if((rit = uQ.find("signFlags")) != uQ.end())
s.mSignFlags = std::stoul(rit->second);
s.mSignFlags = static_cast<uint32_t>(std::stoul(rit->second));
if((rit = uQ.find("publishTs")) != uQ.end())
s.mPublishTs = static_cast<rstime_t>(std::stoll(rit->second));
if((rit = uQ.find("authorId")) != uQ.end())
@ -5340,7 +5340,7 @@ bool RsGxsNetService::search( const std::string& substring,
}
}
}
#else // RS_DEEP_SEARCH
#else // RS_DEEP_CHANNEL_INDEX
RsGxsGrpMetaTemporaryMap grpMetaMap;
{
RS_STACK_MUTEX(mNxsMutex) ;
@ -5366,7 +5366,7 @@ bool RsGxsNetService::search( const std::string& substring,
group_infos.push_back(s);
}
#endif // RS_DEEP_SEARCH
#endif // RS_DEEP_CHANNEL_INDEX
#ifdef NXS_NET_DEBUG_8
GXSNETDEBUG___ << " performing local substring search in response to distant request. Found " << group_infos.size() << " responses." << std::endl;

View file

@ -29,8 +29,8 @@
#include "pqi/pqihash.h"
#include "gxs/rsgixs.h"
#ifdef RS_DEEP_SEARCH
# include "deep_search/deep_search.h"
#ifdef RS_DEEP_CHANNEL_INDEX
# include "deep_search/channelsindex.hpp"
# include "services/p3gxschannels.h"
# include "rsitems/rsgxschannelitems.h"
#endif
@ -148,12 +148,12 @@ bool RsGxsMessageCleanUp::clean()
/**
 * Build the integrity checker, storing the data service, the GXS exchange
 * client and the RsGixs pointer into the corresponding members; mDone starts
 * as false and the internal mutex is named "integrity".
 *
 * The RsSerialType reference is given a name, and copied into mSerializer,
 * only inside the conditional-compilation sections below; outside of them the
 * parameter is left unnamed and is not stored.
 * NOTE(review): leaving it unnamed presumably avoids an unused-parameter
 * warning when the channel index is compiled out - confirm.
 */
RsGxsIntegrityCheck::RsGxsIntegrityCheck(
RsGeneralDataService* const dataService, RsGenExchange* genex,
RsSerialType&
#ifdef RS_DEEP_SEARCH
#ifdef RS_DEEP_CHANNEL_INDEX
serializer
#endif
, RsGixs* gixs )
: mDs(dataService), mGenExchangeClient(genex),
#ifdef RS_DEEP_SEARCH
#ifdef RS_DEEP_CHANNEL_INDEX
mSerializer(serializer),
#endif
mDone(false), mIntegrityMutex("integrity"), mGixs(gixs) {}
@ -168,7 +168,7 @@ void RsGxsIntegrityCheck::run()
bool RsGxsIntegrityCheck::check()
{
#ifdef RS_DEEP_SEARCH
#ifdef RS_DEEP_CHANNEL_INDEX
bool isGxsChannels = mGenExchangeClient->serviceType() == RS_SERVICE_GXS_TYPE_CHANNELS;
std::set<RsGxsGroupId> indexedGroups;
#endif
@ -221,7 +221,7 @@ bool RsGxsIntegrityCheck::check()
}
else msgIds.erase(msgIds.find(grp->grpId));
#ifdef RS_DEEP_SEARCH
#ifdef RS_DEEP_CHANNEL_INDEX
if( isGxsChannels
&& grp->metaData->mCircleType == GXS_CIRCLE_TYPE_PUBLIC
&& grp->metaData->mSubscribeFlags & GXS_SERV::GROUP_SUBSCRIBE_SUBSCRIBED )
@ -241,7 +241,7 @@ bool RsGxsIntegrityCheck::check()
cg.mMeta = meta;
indexedGroups.insert(grp->grpId);
DeepSearch::indexChannelGroup(cg);
DeepChannelsIndex::indexChannelGroup(cg);
}
else
{
@ -256,14 +256,15 @@ bool RsGxsIntegrityCheck::check()
delete rIt;
}
#endif
#endif // def RS_DEEP_CHANNEL_INDEX
}
else
{
grpsToDel.push_back(grp->grpId);
#ifdef RS_DEEP_SEARCH
if(isGxsChannels) DeepSearch::removeChannelFromIndex(grp->grpId);
#endif
#ifdef RS_DEEP_CHANNEL_INDEX
if(isGxsChannels)
DeepChannelsIndex::removeChannelFromIndex(grp->grpId);
#endif // def RS_DEEP_CHANNEL_INDEX
}
if( !(grp->metaData->mSubscribeFlags & GXS_SERV::GROUP_SUBSCRIBE_SUBSCRIBED) &&
@ -320,10 +321,10 @@ bool RsGxsIntegrityCheck::check()
if (nxsMsgIt == nxsMsgV.end())
{
msgsToDel[grpId].insert(msgId);
#ifdef RS_DEEP_SEARCH
#ifdef RS_DEEP_CHANNEL_INDEX
if(isGxsChannels)
DeepSearch::removeChannelPostFromIndex(grpId, msgId);
#endif
DeepChannelsIndex::removeChannelPostFromIndex(grpId, msgId);
#endif // def RS_DEEP_CHANNEL_INDEX
}
}
}
@ -348,14 +349,15 @@ bool RsGxsIntegrityCheck::check()
<< " with wrong hash or null meta data. meta="
<< (void*)msg->metaData << std::endl;
msgsToDel[msg->grpId].insert(msg->msgId);
#ifdef RS_DEEP_SEARCH
#ifdef RS_DEEP_CHANNEL_INDEX
if(isGxsChannels)
DeepSearch::removeChannelPostFromIndex(msg->grpId, msg->msgId);
#endif
DeepChannelsIndex::removeChannelPostFromIndex(
msg->grpId, msg->msgId );
#endif // def RS_DEEP_CHANNEL_INDEX
}
else if (subscribed_groups.count(msg->metaData->mGroupId))
{
#ifdef RS_DEEP_SEARCH
#ifdef RS_DEEP_CHANNEL_INDEX
if( isGxsChannels
&& indexedGroups.count(msg->metaData->mGroupId) )
{
@ -373,7 +375,7 @@ bool RsGxsIntegrityCheck::check()
cgIt->toChannelPost(cg, false);
cg.mMeta = meta;
DeepSearch::indexChannelPost(cg);
DeepChannelsIndex::indexChannelPost(cg);
}
else if(dynamic_cast<RsGxsCommentItem*>(rIt)) {}
else if(dynamic_cast<RsGxsVoteItem*>(rIt)) {}
@ -391,7 +393,7 @@ bool RsGxsIntegrityCheck::check()
delete rIt;
}
#endif
#endif // def RS_DEEP_CHANNEL_INDEX
if(!msg->metaData->mAuthorId.isNull())
{

View file

@ -213,7 +213,7 @@ private:
RsGeneralDataService* const mDs;
RsGenExchange *mGenExchangeClient;
#ifdef RS_DEEP_SEARCH
#ifdef RS_DEEP_CHANNEL_INDEX
RsSerialType& mSerializer;
#endif
bool mDone;