Implement deep indexing for files through Xapian

At the moment it supports extracting metadata only from OGG files.
The system has been designed to be easily extensible to more file formats
  by registering more indexer functions, which just need to extract metadata
  from a certain type of file and feed it to Xapian.
The system has been integrated into the existing file search system
  through generic search requests and results; it keeps a good level of
  retro-compatibility thanks to some tricks.
The indexing system is released under AGPLv3  so when libretroshare is compiled
  with deep search enabled AGPLv3 must be honored instead of LGPLv3-or-later.
Cleaned up the debian copyright file using non-deprecated license
  code-names.
This commit is contained in:
Gioacchino Mazzurco 2019-06-20 17:24:18 +02:00
parent d46e3eb2b7
commit 3a26ccf6a5
No known key found for this signature in database
GPG key ID: A1FBCA3872E87051
25 changed files with 1364 additions and 438 deletions

View file

@ -202,6 +202,52 @@ struct BannedFileEntry : RsSerializable
}
};
struct DeepFilesSearchResult;

/**
 * @brief Serializable description of a single file search result.
 * Carries the same size/hash/name triple that is copied from a TurtleFileInfo
 * by the converting constructor, plus two optional Xapian-related fields
 * (weight and snippet).
 */
struct TurtleFileInfoV2 : RsSerializable
{
	/// File size
	uint64_t fSize = 0;

	/// File hash
	RsFileHash fHash;

	/// File name
	std::string fName;

	/**
	 * @brief Xapian weight of the file which matched the search criteria.
	 * Optional: the value is 0 when not specified.
	 * The Xapian weight of the same file usually differs from node to node,
	 * so it should not be used as an absolute reference, only as a hint of
	 * how well the given file matches the search criteria.
	 */
	float fWeight = 0;

	/**
	 * @brief Xapian snippet of the file which matched the search criteria.
	 * Optional: the value is an empty string when not specified.
	 */
	std::string fSnippet;

	/// Build an empty result: zero size, zero weight, empty strings.
	TurtleFileInfoV2() = default;

	/// Convert an old-style result; weight and snippet keep their defaults.
	TurtleFileInfoV2(const TurtleFileInfo& oldInfo) :
	    fSize(oldInfo.size), fHash(oldInfo.hash), fName(oldInfo.name) {}

#ifdef RS_DEEP_FILES_INDEX
	/// Build from a deep files search result (only declared here).
	TurtleFileInfoV2(const DeepFilesSearchResult& dRes);
#endif // def RS_DEEP_FILES_INDEX

	/// @see RsSerializable::serial_process
	void serial_process(
	        RsGenericSerializer::SerializeJob j,
	        RsGenericSerializer::SerializeContext& ctx ) override
	{
		// NOTE(review): RS_SERIAL_PROCESS presumably refers to the parameters
		// `j` and `ctx` by name, so they must not be renamed - confirm against
		// the macro definition. The order of the calls is kept stable as it
		// presumably determines the serialized layout.
		RS_SERIAL_PROCESS(fSize);
		RS_SERIAL_PROCESS(fHash);
		RS_SERIAL_PROCESS(fName);
		RS_SERIAL_PROCESS(fWeight);
		RS_SERIAL_PROCESS(fSnippet);
	}

	/// Only declared here.
	~TurtleFileInfoV2() override;
};
class RsFiles
{
public:
@ -209,7 +255,7 @@ public:
virtual ~RsFiles() {}
/**
* @brief Provides file data for the gui, media streaming or rpc clients.
* @brief Provides file data for the GUI, media streaming or API clients.
* It may return unverified chunks. This allows streaming without having to
* wait for hashes or completion of the file.
* This function returns an unspecified amount of bytes. Either as much data
@ -217,8 +263,8 @@ public:
* To get more data, call this function repeatedly with different offsets.
*
* @jsonapi{development,manualwrapper}
* note the missing @ the wrapper for this is written manually not
* autogenerated @see JsonApiServer.
* note the wrapper for this is written manually not autogenerated
* @see JsonApiServer.
*
* @param[in] hash hash of the file. The file has to be available on this node
* or it has to be in downloading state.
@ -356,7 +402,9 @@ public:
/**
* @brief Request remote files search
* @jsonapi{development}
* @param[in] matchString string to look for in the search
* @param[in] matchString string to look for in the search. If deep files
* indexing is enabled at compile time, the advanced query syntax described
* at https://xapian.org/docs/queryparser.html is supported
* @param multiCallback function that will be called each time a search
* result is received
* @param[in] maxWait maximum wait time in seconds for search results
@ -364,7 +412,7 @@ public:
*/
virtual bool turtleSearchRequest(
const std::string& matchString,
const std::function<void (const std::list<TurtleFileInfo>& results)>& multiCallback,
const std::function<void (const std::vector<TurtleFileInfoV2>& results)>& multiCallback,
rstime_t maxWait = 300 ) = 0;
virtual TurtleRequestId turtleSearch(const std::string& string_to_match) = 0;
@ -627,8 +675,19 @@ public:
*/
virtual bool removeSharedDirectory(std::string dir) = 0;
virtual bool getIgnoreLists(std::list<std::string>& ignored_prefixes, std::list<std::string>& ignored_suffixes,uint32_t& flags) =0;
virtual void setIgnoreLists(const std::list<std::string>& ignored_prefixes, const std::list<std::string>& ignored_suffixes,uint32_t flags) =0;
/**
* @brief Get list of ignored file name prefixes and suffixes
* @param[out] ignoredPrefixes storage for ignored prefixes
* @param[out] ignoredSuffixes storage for ignored suffixes
* @param flags RS_FILE_SHARE_FLAGS_IGNORE_*
* @return false if something failed, true otherwise
*/
virtual bool getIgnoreLists(
std::list<std::string>& ignoredPrefixes,
std::list<std::string>& ignoredSuffixes,
uint32_t& flags ) = 0;
virtual void setIgnoreLists(const std::list<std::string>& ignored_prefixes, const std::list<std::string>& ignored_suffixes,uint32_t flags) =0;
virtual void setWatchPeriod(int minutes) =0;
virtual void setWatchEnabled(bool b) =0;