Add support for indexing files supported by taglib

Add FLAC standalone indexer
Add indexing common music tags for all formats supported by taglib
File indexer now detects files that were indexed by an older version of
  the indexer and reindexes them
Miscellaneous improvements to deep indexing
This commit is contained in:
Gioacchino Mazzurco 2019-06-21 13:33:10 +02:00
parent 3a26ccf6a5
commit 63b71e383a
No known key found for this signature in database
GPG Key ID: A1FBCA3872E87051
9 changed files with 341 additions and 18 deletions

View File

@ -71,7 +71,7 @@ private:
/** Path of the directory holding this index's Xapian database, located inside
 * the directory returned by RsAccounts::AccountDirectory(). The string is
 * built on the first call and kept in a function-local static, so later calls
 * return a reference to the same object. */
static const std::string& dbPath()
{
static const std::string dbDir =
RsAccounts::AccountDirectory() + "/deep_search_xapian_db";
RsAccounts::AccountDirectory() + "/deep_channels_xapian_db";
return dbDir;
}
};

View File

@ -0,0 +1,156 @@
/*******************************************************************************
* RetroShare full text indexing and search implementation based on Xapian *
* *
* Copyright (C) 2018-2019 Gioacchino Mazzurco <gio@eigenlab.org> *
* Copyright (C) 2019 Asociación Civil Altermundi <info@altermundi.net> *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Affero General Public License version 3 as *
* published by the Free Software Foundation. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Affero General Public License for more details. *
* *
* You should have received a copy of the GNU Affero General Public License *
* along with this program. If not, see <https://www.gnu.org/licenses/>. *
* *
*******************************************************************************/
#include "deep_search/filesindex.hpp"
#include "util/rsdebug.h"
#include <xapian.h>
#include <string>
#include <FLAC++/metadata.h>
#include <cctype>
#include <memory>
/**
 * Deep-search indexer extracting Vorbis comments from FLAC and OggFLAC files.
 *
 * Constructing an instance registers indexFlacFile() with DeepFilesIndex, so a
 * single static instance is enough to make the indexer available.
 */
struct RsDeepFlacFileIndexer
{
    /// Register the FLAC indexer function with order key 31.
    RsDeepFlacFileIndexer()
    {
        DeepFilesIndex::registerIndexer(31, indexFlacFile);
    }

    /**
     * Index the Vorbis comments found in a FLAC (or OggFLAC) file.
     *
     * ARTIST, DESCRIPTION and TITLE values are indexed with the "A", "XD" and
     * "S" term prefixes respectively; every indexed comment is additionally
     * indexed without prefix as "NAME=value" and appended to the document
     * data. Embedded pictures (COVERART / METADATA_BLOCK_PICTURE) are skipped.
     *
     * @param path  path of the file to index
     * @param xTG   term generator used to add terms
     * @param xDoc  document whose data gets the indexed comments appended
     * @return 0 if the file could not be read as FLAC nor as OggFLAC,
     *         1 on internal failure (metadata chain or iterator not valid),
     *         30 if the file was readable but no comment was indexed,
     *         99 if at least one comment was indexed.
     */
    static uint32_t indexFlacFile(
            const std::string& path, const std::string& /*name*/,
            Xapian::TermGenerator& xTG, Xapian::Document& xDoc )
    {
        Dbg3() << __PRETTY_FUNCTION__ << " " << path << std::endl;

        using FlacChain = FLAC::Metadata::Chain;
        std::unique_ptr<FlacChain> flacChain(new FlacChain);

        if(!flacChain->is_valid())
        {
            RsErr() << __PRETTY_FUNCTION__ << " Failed creating FLAC Chain 1"
                    << std::endl;
            return 1;
        }

        // First attempt: native FLAC container
        if(!flacChain->read(path.c_str(), false))
        {
            Dbg3() << __PRETTY_FUNCTION__ << " Failed to open the file as FLAC"
                   << std::endl;

            // Second attempt with a fresh chain: FLAC inside an Ogg container
            flacChain.reset(new FlacChain);
            if(!flacChain->is_valid())
            {
                RsErr() << __PRETTY_FUNCTION__
                        << " Failed creating FLAC Chain 2" << std::endl;
                return 1;
            }
            if(!flacChain->read(path.c_str(), true))
            {
                Dbg3() << __PRETTY_FUNCTION__
                       << " Failed to open the file as OggFLAC"
                       << std::endl;
                return 0;
            }
        }

        unsigned validCommentsCnt = 0;
        std::string docData = xDoc.get_data();

        FLAC::Metadata::Iterator mdit;
        mdit.init(*flacChain);
        if(!mdit.is_valid()) return 1;

        do
        {
            ::FLAC__MetadataType mdt = mdit.get_block_type();
            if (mdt != FLAC__METADATA_TYPE_VORBIS_COMMENT) continue;

            Dbg2() << __PRETTY_FUNCTION__ << " Found Vorbis Comment Block"
                   << std::endl;

            // The returned block is held by unique_ptr so it is deleted here
            std::unique_ptr<FLAC::Metadata::Prototype> proto(mdit.get_block());
            if(!proto) continue;

            const FLAC::Metadata::VorbisComment* vc =
                    dynamic_cast<FLAC::Metadata::VorbisComment*>(proto.get());
            if(!vc || !vc->is_valid()) continue;

            unsigned numComments = vc->get_num_comments();
            for(unsigned i = 0; i < numComments; ++i)
            {
                FLAC::Metadata::VorbisComment::Entry entry =
                        vc->get_comment(i);
                if(!entry.is_valid()) continue;

                std::string tagName( entry.get_field_name(),
                                     entry.get_field_name_length() );

                /* Vorbis tags should be uppercase but not all software
                 * enforces it. The cast to unsigned char is required:
                 * passing a negative char value to toupper is undefined
                 * behaviour. */
                for (auto& c: tagName)
                    c = static_cast<char>(
                                std::toupper(static_cast<unsigned char>(c)) );

                std::string tagValue( entry.get_field_value(),
                                      entry.get_field_value_length() );
                if(tagValue.empty()) continue;

                // Avoid polluting the index with binary data
                if( tagName.find("COVERART") != tagName.npos ||
                        tagName.find("METADATA_BLOCK_PICTURE") != tagName.npos )
                    continue;

                if(tagName == "ARTIST")
                    xTG.index_text(tagValue, 1, "A");
                else if (tagName == "DESCRIPTION")
                    xTG.index_text(tagValue, 1, "XD");
                else if (tagName == "TITLE")
                    xTG.index_text(tagValue, 1, "S");

                // Index fields without prefixes for general search.
                xTG.increase_termpos();
                std::string fullComment(tagName + "=" + tagValue);
                xTG.index_text(fullComment);
                docData += fullComment + "\n";

                Dbg2() << __PRETTY_FUNCTION__ << " Indexed " << fullComment
                       << std::endl;

                ++validCommentsCnt;
            }
        }
        while(mdit.next());

        if(validCommentsCnt > 0)
        {
            Dbg1() << __PRETTY_FUNCTION__ << " Successfully indexed: " << path
                   << std::endl;

            xDoc.set_data(docData);
            return 99;
        }

        /* Although the file appears to be a valid FLAC, no Vorbis comment has
         * been found, so return less than 50: maybe it has been tagged only
         * with ID3 tags? */
        return 30;
    }

    RS_SET_CONTEXT_DEBUG_LEVEL(3)
};

View File

@ -22,6 +22,7 @@
#include "deep_search/commonutils.hpp"
#include "util/rsdebug.h"
#include "retroshare/rsinit.h"
#include "retroshare/rsversion.h"
#include <utility>
@ -37,31 +38,48 @@ bool DeepFilesIndex::indexFile(
if(!dbPtr) return false;
Xapian::WritableDatabase& db(*dbPtr);
if(db.term_exists("Q" + hash.toStdString()))
const std::string hashString = hash.toStdString();
const std::string idTerm("Q" + hashString);
Xapian::Document oldDoc;
Xapian::PostingIterator pIt = db.postlist_begin(idTerm);
if( pIt != db.postlist_end(idTerm) )
{
Dbg3() << __PRETTY_FUNCTION__ << " skipping laready indexed file: "
<< hash << " " << name << std::endl;
return true;
oldDoc = db.get_document(*pIt);
if( oldDoc.get_value(INDEXER_VERSION_VALUENO) ==
RS_HUMAN_READABLE_VERSION &&
std::stoull(oldDoc.get_value(INDEXERS_COUNT_VALUENO)) ==
indexersRegister.size() )
{
/* Looks like this file has already been indexed by this RetroShare
* exact version, so we can skip it. If the version was different it
* made sense to reindex it as better indexers might be available
* since last time it was indexed */
Dbg3() << __PRETTY_FUNCTION__ << " skipping laready indexed file: "
<< hash << " " << name << std::endl;
return true;
}
}
Xapian::Document doc;
// Set up a TermGenerator that we'll use in indexing.
Xapian::TermGenerator termgenerator;
//termgenerator.set_stemmer(Xapian::Stem("en"));
// We make a document and tell the term generator to use this.
Xapian::Document doc;
termgenerator.set_document(doc);
for(auto& indexerPair : indexersRegister)
if(indexerPair.second(path, name, termgenerator, doc) > 50)
break;
const std::string hashString = hash.toStdString();
const std::string idTerm("Q" + hashString);
doc.add_boolean_term(idTerm);
termgenerator.index_text(name, 1, "N");
termgenerator.index_text(name);
doc.add_value(FILE_HASH_VALUENO, hashString);
doc.add_value(INDEXER_VERSION_VALUENO, RS_HUMAN_READABLE_VERSION);
doc.add_value(
INDEXERS_COUNT_VALUENO,
std::to_string(indexersRegister.size()) );
db.replace_document(idTerm, doc);
return true;
@ -141,3 +159,13 @@ uint32_t DeepFilesIndex::search(
# include "deep_search/filesoggindexer.hpp"
static RsDeepOggFileIndexer oggFileIndexer;
#endif // def RS_DEEP_FILES_INDEX_OGG
/* Optional indexers are compiled in only when the matching feature macro is
 * defined. Each file-scope static instance exists only for its constructor,
 * which registers the indexer function through
 * DeepFilesIndex::registerIndexer. */
#ifdef RS_DEEP_FILES_INDEX_FLAC
# include "deep_search/filesflacindexer.hpp"
static RsDeepFlacFileIndexer flacFileIndexer;
#endif // def RS_DEEP_FILES_INDEX_FLAC
// Same registration pattern for the TagLib based indexer
#ifdef RS_DEEP_FILES_INDEX_TAGLIB
# include "deep_search/filestaglibindexer.hpp"
static RsDeepTaglibFileIndexer taglibFileIndexer;
#endif // def RS_DEEP_FILES_INDEX_TAGLIB

View File

@ -82,6 +82,14 @@ private:
/// Used to store RsFileHash of indexed documents
FILE_HASH_VALUENO,
/** Used to check if some file need reindex because was indexed with an
* older version of the indexer */
INDEXER_VERSION_VALUENO,
/** Used to check if some file need reindex because the number of
* registered indexers changed since it was last indexed */
INDEXERS_COUNT_VALUENO,
/// @see Xapian::BAD_VALUENO
BAD_VALUENO = Xapian::BAD_VALUENO
};
@ -91,5 +99,5 @@ private:
/** Storage for indexers function by order */
static std::multimap<int, IndexerFunType> indexersRegister;
RS_SET_CONTEXT_DEBUG_LEVEL(4)
RS_SET_CONTEXT_DEBUG_LEVEL(1)
};

View File

@ -74,7 +74,7 @@ struct RsDeepOggFileIndexer
xTG.index_text(tagValue, 1, "XD");
else if (tagName == "TITLE")
xTG.index_text(tagValue, 1, "S");
if(tagName.find("COVERART") != tagName.npos)
else if(tagName.find("COVERART") != tagName.npos)
continue; // Avoid polluting the index with binary data
else if (tagName.find("METADATA_BLOCK_PICTURE") != tagName.npos)
continue; // Avoid polluting the index with binary data
@ -93,5 +93,5 @@ struct RsDeepOggFileIndexer
return 0;
}
RS_SET_CONTEXT_DEBUG_LEVEL(2)
RS_SET_CONTEXT_DEBUG_LEVEL(1)
};

View File

@ -0,0 +1,103 @@
/*******************************************************************************
* RetroShare full text indexing and search implementation based on Xapian *
* *
* Copyright (C) 2018-2019 Gioacchino Mazzurco <gio@eigenlab.org> *
* Copyright (C) 2019 Asociación Civil Altermundi <info@altermundi.net> *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Affero General Public License version 3 as *
* published by the Free Software Foundation. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Affero General Public License for more details. *
* *
* You should have received a copy of the GNU Affero General Public License *
* along with this program. If not, see <https://www.gnu.org/licenses/>. *
* *
*******************************************************************************/
#include "deep_search/filesindex.hpp"
#include "util/rsdebug.h"
#include <xapian.h>
#include <string>
#include <memory>
#include <taglib/tag.h>
#include <taglib/fileref.h>
#include <taglib/tpropertymap.h>
/**
 * Deep-search indexer extracting tags from any file TagLib is able to open.
 *
 * Constructing an instance registers indexFile() with DeepFilesIndex, so a
 * single static instance is enough to make the indexer available.
 */
struct RsDeepTaglibFileIndexer
{
    /// Register the TagLib indexer function with order key 40.
    RsDeepTaglibFileIndexer()
    {
        DeepFilesIndex::registerIndexer(40, indexFile);
    }

    /**
     * Index the properties TagLib exposes for the given file.
     *
     * ARTIST, DESCRIPTION and TITLE values are indexed with the "A", "XD" and
     * "S" term prefixes respectively; every indexed property is additionally
     * indexed without prefix as "NAME=value" and appended to the document
     * data. Multiple values of the same property are joined with ", ".
     *
     * @param path  path of the file to index
     * @param xTG   term generator used to add terms
     * @param xDoc  document whose data gets the indexed properties appended
     * @return 0 if TagLib could not open the file or it exposes no tag,
     *         30 if the file was opened but no property was indexed,
     *         99 if at least one property was indexed.
     */
    static uint32_t indexFile(
            const std::string& path, const std::string& /*name*/,
            Xapian::TermGenerator& xTG, Xapian::Document& xDoc )
    {
        Dbg4() << __PRETTY_FUNCTION__ << " " << path << std::endl;

        TagLib::FileRef tFile(path.c_str());
        if(tFile.isNull()) return 0;

        const TagLib::Tag* tag = tFile.tag();
        if(!tag) return 0;

        TagLib::PropertyMap tMap = tag->properties();

        unsigned validCommentsCnt = 0;
        std::string docData = xDoc.get_data();

        for( TagLib::PropertyMap::ConstIterator mIt = tMap.begin();
             mIt != tMap.end(); ++mIt )
        {
            /* An empty check is enough here: a null TagLib::String is empty
             * too, and String::isNull() is deprecated. */
            if(mIt->first.isEmpty()) continue;

            /* Ask explicitly for UTF-8, same as done for the value below, so
             * name and value always share the same encoding. */
            std::string tagName(mIt->first.upper().to8Bit(true));

            if(mIt->second.isEmpty()) continue;
            std::string tagValue(mIt->second.toString(", ").to8Bit(true));
            if(tagValue.empty()) continue;

            // Avoid polluting the index with binary data
            if( tagName.find("COVERART") != tagName.npos ||
                    tagName.find("METADATA_BLOCK_PICTURE") != tagName.npos )
                continue;

            if(tagName == "ARTIST")
                xTG.index_text(tagValue, 1, "A");
            else if (tagName == "DESCRIPTION")
                xTG.index_text(tagValue, 1, "XD");
            else if (tagName == "TITLE")
                xTG.index_text(tagValue, 1, "S");

            // Index fields without prefixes for general search.
            xTG.increase_termpos();
            std::string fullComment(tagName + "=" + tagValue);
            xTG.index_text(fullComment);
            docData += fullComment + "\n";

            Dbg2() << __PRETTY_FUNCTION__ << " Indexed " << tagName << "=\""
                   << tagValue << '"' << std::endl;

            ++validCommentsCnt;
        }

        if(validCommentsCnt > 0)
        {
            Dbg1() << __PRETTY_FUNCTION__ << " Successfully indexed: " << path
                   << std::endl;

            xDoc.set_data(docData);
            return 99;
        }

        /* Although the file appears to be supported by taglib, no comment has
         * been found, so return less than 50: maybe another indexer is capable
         * of extracting information */
        return 30;
    }

    RS_SET_CONTEXT_DEBUG_LEVEL(3)
};

View File

@ -919,6 +919,14 @@ rs_deep_files_index_ogg {
HEADERS += deep_search/filesoggindexer.hpp
}
rs_deep_files_index_flac {
HEADERS += deep_search/filesflacindexer.hpp
}
rs_deep_files_index_taglib {
HEADERS += deep_search/filestaglibindexer.hpp
}
rs_broadcast_discovery {
HEADERS += retroshare/rsbroadcastdiscovery.h \
services/broadcastdiscoveryservice.h

View File

@ -77,6 +77,14 @@ rs_deep_files_index_ogg {
mLibs += vorbisfile
}
rs_deep_files_index_flac {
mLibs += FLAC++
}
rs_deep_files_index_taglib {
mLibs += tag
}
rs_broadcast_discovery {
no_rs_cross_compiling {
UDP_DISCOVERY_SRC_PATH=$$clean_path($${RS_SRC_PATH}/supportlibs/udp-discovery-cpp/)

View File

@ -170,15 +170,25 @@ rs_jsonapi:CONFIG -= no_rs_jsonapi
CONFIG *= no_rs_deep_channel_index
rs_deep_channel_index:CONFIG -= no_rs_deep_channel_index
# To enable file indexing append the following assignation to qmake command
# line "CONFIG+=rs_deep_files_index"
# To enable deep files indexing append the following assignation to qmake
# command line "CONFIG+=rs_deep_files_index"
CONFIG *= no_rs_deep_files_index
rs_deep_files_index:CONFIG -= no_rs_deep_files_index
# To enable Ogg file indexing append the following assignation to qmake command
# line "CONFIG+=rs_deep_files_index_ogg"
# To enable Ogg files deep indexing append the following assignation to qmake
# command line "CONFIG+=rs_deep_files_index_ogg"
CONFIG *= no_rs_deep_files_index_ogg
rs_deep_files_index_ogg::CONFIG -= no_rs_deep_files_index_ogg
rs_deep_files_index_ogg:CONFIG -= no_rs_deep_files_index_ogg
# To enable FLAC files deep indexing append the following assignation to qmake
# command line "CONFIG+=rs_deep_files_index_flac"
CONFIG *= no_rs_deep_files_index_flac
rs_deep_files_index_flac:CONFIG -= no_rs_deep_files_index_flac
# To enable taglib files deep indexing append the following assignation to qmake
# command line "CONFIG+=rs_deep_files_index_taglib"
CONFIG *= no_rs_deep_files_index_taglib
rs_deep_files_index_taglib:CONFIG -= no_rs_deep_files_index_taglib
# To enable native dialogs append the following assignation to qmake command
# line "CONFIG+=rs_use_native_dialogs"
@ -578,6 +588,8 @@ rs_deep_channels_index:DEFINES *= RS_DEEP_CHANNEL_INDEX
rs_deep_files_index:DEFINES *= RS_DEEP_FILES_INDEX
rs_deep_files_index_ogg:DEFINES *= RS_DEEP_FILES_INDEX_OGG
rs_deep_files_index_flac:DEFINES *= RS_DEEP_FILES_INDEX_FLAC
rs_deep_files_index_taglib:DEFINES *= RS_DEEP_FILES_INDEX_TAGLIB
rs_use_native_dialogs:DEFINES *= RS_NATIVEDIALOGS