Merge pull request #1586 from G10h4ck/files_deep_search

Implement deep indexing for files through Xapian
This commit is contained in:
G10h4ck 2019-10-26 20:58:05 +02:00 committed by GitHub
commit a9510da61b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
27 changed files with 1687 additions and 438 deletions

View File

@ -1,19 +1,23 @@
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: retroshare
Upstream-Contact: retroshare.team@gmail.com
Upstream-Contact: contact@retroshare.cc
Source: https://github.com/retroshare/retroshare
Files: openpgpsdk/*
Copyright: 2005-2008 Ben Laurie, Rachel Willmer, Retroshare Team <retroshare.team@gmail.com>
Copyright: 2005-2008 Ben Laurie, Rachel Willmer, Retroshare Team <contact@retroshare.cc>
License: Apache-2.0
Files: jsonapi-generator/* libretroshare/src/jsonapi/*
Copyright: 2018-2019 Gioacchino Mazzurco <gio@eigenlab.org>
License: AGPL-3+
License: AGPL-3.0-or-later
Files: libretroshare/src/deep_search/*
Copyright: 2018-2019 Gioacchino Mazzurco <gio@eigenlab.org>
License: AGPL-3.0-only
Files: libretroshare/*
Copyright: 2007-2018, Retroshare Team <retroshare.team@gmail.com>
License: LGPL-3+
Copyright: 2007-2019, Retroshare Team <contact@retroshare.cc>
License: LGPL-3.0-or-later
Files: src/retroshare-gui/src/TorControl/
Copyright: 2014, John Brooks <john.brooks@dereferenced.net>
@ -28,8 +32,8 @@ Copyright: 2013 Jeff Weinstein <jeff.weinstein@gmail.com>
License: MIT
Files: *
Copyright: 2007-2018, Retroshare Team <retroshare.team@gmail.com>
License: AGPL-3+
Copyright: 2007-2019, Retroshare Team <contact@retroshare.cc>
License: AGPL-3.0-only
#######
# TODO
@ -56,7 +60,7 @@ License: Apache-2.0
See the License for the specific language governing permissions and
limitations under the License.
License: LGPL-3+
License: LGPL-3.0-or-later
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation, either version 3 of the
@ -75,7 +79,7 @@ License: LGPL-3+
OpenSSL that use the same license as OpenSSL), and distribute linked
combinations including the two.
License: AGPL-3+
License: AGPL-3.0-or-later
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
@ -86,11 +90,36 @@ License: AGPL-3+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
.
You should have received a copy of the GNU Lesser General Public License
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
.
As a special exception, the copyright holders give permission to link the
code of portions of this program with the OpenSSL library under certain
code or portions of this program with the OpenSSL library under certain
conditions as described in each individual source file and distribute
linked combinations including the program with the OpenSSL library. You
must comply with the GNU Affero General Public License in all respects for
all of the code used other than as permitted herein. If you modify file(s)
with this exception, you may extend this exception to your version of the
file(s), but you are not obligated to do so. If you do not wish to do so,
delete this exception statement from your version. If you delete this
exception statement from all source files in the program, then also delete
it in the license file.
License: AGPL-3.0-only
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License version 3 as
published by the Free Software Foundation.
.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
.
As a special exception, the copyright holders give permission to link the
code or portions of this program with the OpenSSL library under certain
conditions as described in each individual source file and distribute
linked combinations including the program with the OpenSSL library. You
must comply with the GNU Affero General Public License in all respects for

View File

@ -0,0 +1,230 @@
/*******************************************************************************
* RetroShare full text indexing and search implementation based on Xapian *
* *
* Copyright (C) 2018-2019 Gioacchino Mazzurco <gio@eigenlab.org> *
* Copyright (C) 2019 Asociación Civil Altermundi <info@altermundi.net> *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Affero General Public License version 3 as *
* published by the Free Software Foundation. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Affero General Public License for more details. *
* *
* You should have received a copy of the GNU Affero General Public License *
* along with this program. If not, see <https://www.gnu.org/licenses/>. *
* *
*******************************************************************************/
#include "deep_search/channelsindex.hpp"
#include "deep_search/commonutils.hpp"
uint32_t DeepChannelsIndex::search(
const std::string& queryStr,
std::vector<DeepChannelsSearchResult>& results, uint32_t maxResults )
{
results.clear();
std::unique_ptr<Xapian::Database> dbPtr(
DeepSearch::openReadOnlyDatabase(dbPath()) );
if(!dbPtr) return 0;
Xapian::Database& db(*dbPtr);
// Set up a QueryParser with a stemmer and suitable prefixes.
Xapian::QueryParser queryparser;
//queryparser.set_stemmer(Xapian::Stem("en"));
queryparser.set_stemming_strategy(queryparser.STEM_SOME);
// Start of prefix configuration.
//queryparser.add_prefix("title", "S");
//queryparser.add_prefix("description", "XD");
// End of prefix configuration.
// And parse the query.
Xapian::Query query = queryparser.parse_query(queryStr);
// Use an Enquire object on the database to run the query.
Xapian::Enquire enquire(db);
enquire.set_query(query);
Xapian::MSet mset = enquire.get_mset(
0, maxResults ? maxResults : db.get_doccount() );
for ( Xapian::MSetIterator m = mset.begin(); m != mset.end(); ++m )
{
const Xapian::Document& doc = m.get_document();
DeepChannelsSearchResult s;
s.mUrl = doc.get_value(URL_VALUENO);
#if XAPIAN_AT_LEAST(1,3,5)
s.mSnippet = mset.snippet(doc.get_data());
#endif // XAPIAN_AT_LEAST(1,3,5)
results.push_back(s);
}
return static_cast<uint32_t>(results.size());
}
void DeepChannelsIndex::indexChannelGroup(const RsGxsChannelGroup& chan)
{
std::unique_ptr<Xapian::WritableDatabase> dbPtr(
DeepSearch::openWritableDatabase(
dbPath(), Xapian::DB_CREATE_OR_OPEN ) );
if(!dbPtr) return;
Xapian::WritableDatabase& db(*dbPtr);
// Set up a TermGenerator that we'll use in indexing.
Xapian::TermGenerator termgenerator;
//termgenerator.set_stemmer(Xapian::Stem("en"));
// We make a document and tell the term generator to use this.
Xapian::Document doc;
termgenerator.set_document(doc);
// Index each field with a suitable prefix.
termgenerator.index_text(chan.mMeta.mGroupName, 1, "G");
termgenerator.index_text(
DeepSearch::timetToXapianDate(chan.mMeta.mPublishTs), 1, "D" );
termgenerator.index_text(chan.mDescription, 1, "XD");
// Index fields without prefixes for general search.
termgenerator.index_text(chan.mMeta.mGroupName);
termgenerator.increase_termpos();
termgenerator.index_text(chan.mDescription);
RsUrl chanUrl; chanUrl
.setScheme("retroshare").setPath("/channel")
.setQueryKV("id", chan.mMeta.mGroupId.toStdString());
const std::string idTerm("Q" + chanUrl.toString());
chanUrl.setQueryKV("publishTs", std::to_string(chan.mMeta.mPublishTs));
chanUrl.setQueryKV("name", chan.mMeta.mGroupName);
if(!chan.mMeta.mAuthorId.isNull())
chanUrl.setQueryKV("authorId", chan.mMeta.mAuthorId.toStdString());
if(chan.mMeta.mSignFlags)
chanUrl.setQueryKV( "signFlags",
std::to_string(chan.mMeta.mSignFlags) );
std::string rsLink(chanUrl.toString());
// store the RS link so we are able to retrive it on matching search
doc.add_value(URL_VALUENO, rsLink);
// Store some fields for display purposes.
doc.set_data(chan.mMeta.mGroupName + "\n" + chan.mDescription);
// We use the identifier to ensure each object ends up in the
// database only once no matter how many times we run the
// indexer. "Q" prefix is a Xapian convention for unique id term.
doc.add_boolean_term(idTerm);
db.replace_document(idTerm, doc);
}
/**
 * Delete from the index the document stored for the given channel.
 * Nothing is done when the writable database cannot be opened.
 */
void DeepChannelsIndex::removeChannelFromIndex(RsGxsGroupId grpId)
{
	RsUrl groupUrl;
	groupUrl.setScheme("retroshare").setPath("/channel")
	        .setQueryKV("id", grpId.toStdString());

	// "Q" prefix is a Xapian convention for unique id term.
	const std::string uniqueTerm = "Q" + groupUrl.toString();

	auto writableDb = DeepSearch::openWritableDatabase(
	            dbPath(), Xapian::DB_CREATE_OR_OPEN );
	if(writableDb) writableDb->delete_document(uniqueTerm);
}
void DeepChannelsIndex::indexChannelPost(const RsGxsChannelPost& post)
{
std::unique_ptr<Xapian::WritableDatabase> dbPtr(
DeepSearch::openWritableDatabase(
dbPath(), Xapian::DB_CREATE_OR_OPEN ) );
if(!dbPtr) return;
Xapian::WritableDatabase& db(*dbPtr);
// Set up a TermGenerator that we'll use in indexing.
Xapian::TermGenerator termgenerator;
//termgenerator.set_stemmer(Xapian::Stem("en"));
// We make a document and tell the term generator to use this.
Xapian::Document doc;
termgenerator.set_document(doc);
// Index each field with a suitable prefix.
termgenerator.index_text(post.mMeta.mMsgName, 1, "S");
termgenerator.index_text(
DeepSearch::timetToXapianDate(post.mMeta.mPublishTs), 1, "D" );
// TODO: we should strip out HTML tags instead of skipping indexing
// Avoid indexing HTML
bool isPlainMsg =
post.mMsg[0] != '<' || post.mMsg[post.mMsg.size() - 1] != '>';
if(isPlainMsg)
termgenerator.index_text(post.mMsg, 1, "XD");
// Index fields without prefixes for general search.
termgenerator.index_text(post.mMeta.mMsgName);
if(isPlainMsg)
{
termgenerator.increase_termpos();
termgenerator.index_text(post.mMsg);
}
for(const RsGxsFile& attachment : post.mFiles)
{
termgenerator.index_text(attachment.mName, 1, "F");
termgenerator.increase_termpos();
termgenerator.index_text(attachment.mName);
}
// We use the identifier to ensure each object ends up in the
// database only once no matter how many times we run the
// indexer.
RsUrl postUrl; postUrl
.setScheme("retroshare").setPath("/channel")
.setQueryKV("id", post.mMeta.mGroupId.toStdString())
.setQueryKV("msgid", post.mMeta.mMsgId.toStdString());
std::string idTerm("Q" + postUrl.toString());
postUrl.setQueryKV("publishTs", std::to_string(post.mMeta.mPublishTs));
postUrl.setQueryKV("name", post.mMeta.mMsgName);
postUrl.setQueryKV("authorId", post.mMeta.mAuthorId.toStdString());
std::string rsLink(postUrl.toString());
// store the RS link so we are able to retrive it on matching search
doc.add_value(URL_VALUENO, rsLink);
// Store some fields for display purposes.
if(isPlainMsg)
doc.set_data(post.mMeta.mMsgName + "\n" + post.mMsg);
else doc.set_data(post.mMeta.mMsgName);
doc.add_boolean_term(idTerm);
db.replace_document(idTerm, doc);
}
/**
 * Delete from the index the document stored for the given channel post.
 * Nothing is done when the writable database cannot be opened.
 */
void DeepChannelsIndex::removeChannelPostFromIndex(
        RsGxsGroupId grpId, RsGxsMessageId msgId )
{
	RsUrl postUrl;
	postUrl.setScheme("retroshare").setPath("/channel")
	        .setQueryKV("id", grpId.toStdString())
	        .setQueryKV("msgid", msgId.toStdString());

	// "Q" prefix is a Xapian convention for unique id term.
	const std::string uniqueTerm = "Q" + postUrl.toString();

	auto writableDb = DeepSearch::openWritableDatabase(
	            dbPath(), Xapian::DB_CREATE_OR_OPEN );
	if(writableDb) writableDb->delete_document(uniqueTerm);
}

View File

@ -0,0 +1,77 @@
/*******************************************************************************
* RetroShare full text indexing and search implementation based on Xapian *
* *
* Copyright (C) 2018-2019 Gioacchino Mazzurco <gio@eigenlab.org> *
* Copyright (C) 2019 Asociación Civil Altermundi <info@altermundi.net> *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Affero General Public License version 3 as *
* published by the Free Software Foundation. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Affero General Public License for more details. *
* *
* You should have received a copy of the GNU Affero General Public License *
* along with this program. If not, see <https://www.gnu.org/licenses/>. *
* *
*******************************************************************************/
#pragma once
#include <vector>
#include <xapian.h>
#include "util/rstime.h"
#include "retroshare/rsgxschannels.h"
#include "retroshare/rsinit.h"
#include "util/rsurl.h"
/// One match returned by DeepChannelsIndex::search
struct DeepChannelsSearchResult
{
	/// RetroShare URL stored together with the matching document
	std::string mUrl;

	/** Weight of the match. Defaults to zero so that a default constructed
	 * result never carries an indeterminate value. */
	double mWeight = 0.0;

	/// Snippet of the matching document, filled only with Xapian >= 1.3.5
	std::string mSnippet;
};
/**
 * Static-only interface to the Xapian index of GXS channels and channel
 * posts. Every operation opens the database located at dbPath() on its own.
 */
struct DeepChannelsIndex
{
/**
* @brief Search indexed GXS groups and messages
* @param[in] queryStr query string handed to Xapian::QueryParser
* @param[out] results cleared first, then filled with the matches
* @param[in] maxResults maximum number of acceptable search results, 0 for
* no limits
* @return search results count
*/
static uint32_t search( const std::string& queryStr,
std::vector<DeepChannelsSearchResult>& results,
uint32_t maxResults = 100 );
/// Index the given channel, replacing the document previously stored for it
static void indexChannelGroup(const RsGxsChannelGroup& chan);
/// Delete the document stored for the channel with the given id
static void removeChannelFromIndex(RsGxsGroupId grpId);
/// Index the given post, replacing the document previously stored for it
static void indexChannelPost(const RsGxsChannelPost& post);
/// Delete the document stored for the given post of the given channel
static void removeChannelPostFromIndex(
RsGxsGroupId grpId, RsGxsMessageId msgId );
/// NOTE(review): only declared here, no definition is visible next to the
/// other members in channelsindex.cpp, confirm it is implemented or needed
static uint32_t indexFile(const std::string& path);
private:
/// Slots used for the values attached to indexed documents
enum : Xapian::valueno
{
/// Used to store retroshare url of indexed documents
URL_VALUENO,
/// @see Xapian::BAD_VALUENO
BAD_VALUENO = Xapian::BAD_VALUENO
};
/**
* @return path of the channels index database, inside the account directory.
* The value is computed on the first call only and then reused.
*/
static const std::string& dbPath()
{
static const std::string dbDir =
RsAccounts::AccountDirectory() + "/deep_channels_xapian_db";
return dbDir;
}
};

View File

@ -0,0 +1,93 @@
/*******************************************************************************
* RetroShare full text indexing and search implementation based on Xapian *
* *
* Copyright (C) 2018-2019 Gioacchino Mazzurco <gio@eigenlab.org> *
* Copyright (C) 2019 Asociación Civil Altermundi <info@altermundi.net> *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Affero General Public License version 3 as *
* published by the Free Software Foundation. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Affero General Public License for more details. *
* *
* You should have received a copy of the GNU Affero General Public License *
* along with this program. If not, see <https://www.gnu.org/licenses/>. *
* *
*******************************************************************************/
#include "deep_search/commonutils.hpp"
#include "util/stacktrace.h"
#include "util/rsdebug.h"
namespace DeepSearch
{
std::unique_ptr<Xapian::WritableDatabase> openWritableDatabase(
const std::string& path, int flags, int blockSize )
{
try
{
std::unique_ptr<Xapian::WritableDatabase> dbPtr(
new Xapian::WritableDatabase(path, flags, blockSize) );
return dbPtr;
}
catch(Xapian::DatabaseLockError)
{
RsErr() << __PRETTY_FUNCTION__ << " Failed aquiring Xapian DB lock "
<< path << std::endl;
print_stacktrace();
}
catch(...)
{
RsErr() << __PRETTY_FUNCTION__ << " Xapian DB is apparently corrupted "
<< "deleting it might help without causing any harm: "
<< path << std::endl;
print_stacktrace();
}
return nullptr;
}
std::unique_ptr<Xapian::Database> openReadOnlyDatabase(
const std::string& path, int flags )
{
try
{
std::unique_ptr<Xapian::Database> dbPtr(
new Xapian::Database(path, flags) );
return dbPtr;
}
catch(Xapian::DatabaseOpeningError e)
{
RsWarn() << __PRETTY_FUNCTION__ << " " << e.get_msg()
<< ", probably nothing has been indexed yet." << std::endl;
}
catch(Xapian::DatabaseLockError)
{
RsErr() << __PRETTY_FUNCTION__ << " Failed aquiring Xapian DB lock "
<< path << std::endl;
print_stacktrace();
}
catch(...)
{
RsErr() << __PRETTY_FUNCTION__ << " Xapian DB is apparently corrupted "
<< "deleting it might help without causing any harm: "
<< path << std::endl;
print_stacktrace();
}
return nullptr;
}
/**
 * Format a timestamp as an UTC date in the form YYYYMMDD.
 *
 * @param[in] time timestamp to convert
 * @return the formatted date, or an empty string when the timestamp cannot
 *	be converted to a calendar date or does not fit in eight characters
 */
std::string timetToXapianDate(const rstime_t& time)
{
	const time_t tTime = static_cast<time_t>(time);

	// std::gmtime returns a null pointer when the conversion fails
	const std::tm* utcTime = std::gmtime(&tTime);
	if(!utcTime) return std::string();

	char date[] = "YYYYMMDD";

	/* std::strftime returns 0 and leaves the buffer in an unspecified state
	 * when the result does not fit, in that case nothing is returned */
	const std::size_t written =
	        std::strftime(date, sizeof(date), "%Y%m%d", utcTime);

	return std::string(date, written);
}
}

View File

@ -0,0 +1,45 @@
/*******************************************************************************
* RetroShare full text indexing and search implementation based on Xapian *
* *
* Copyright (C) 2018-2019 Gioacchino Mazzurco <gio@eigenlab.org> *
* Copyright (C) 2019 Asociación Civil Altermundi <info@altermundi.net> *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Affero General Public License version 3 as *
* published by the Free Software Foundation. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Affero General Public License for more details. *
* *
* You should have received a copy of the GNU Affero General Public License *
* along with this program. If not, see <https://www.gnu.org/licenses/>. *
* *
*******************************************************************************/
#pragma once
#include <xapian.h>
#include <memory>
#include "util/rstime.h"
/* Fallback definition, used only when XAPIAN_AT_LEAST is not already defined.
 * XAPIAN_AT_LEAST(A,B,C) is true when the Xapian version being compiled
 * against, as given by XAPIAN_MAJOR_VERSION, XAPIAN_MINOR_VERSION and
 * XAPIAN_REVISION, is A.B.C or newer. */
#ifndef XAPIAN_AT_LEAST
#define XAPIAN_AT_LEAST(A,B,C) (XAPIAN_MAJOR_VERSION > (A) || \
(XAPIAN_MAJOR_VERSION == (A) && \
(XAPIAN_MINOR_VERSION > (B) || \
(XAPIAN_MINOR_VERSION == (B) && XAPIAN_REVISION >= (C)))))
#endif // ndef XAPIAN_AT_LEAST
/// Helpers shared by the Xapian based indexes
namespace DeepSearch
{
/**
* Open a Xapian database for writing.
* @param[in] path location of the database
* @param[in] flags forwarded to the Xapian::WritableDatabase constructor
* @param[in] blockSize forwarded to the Xapian::WritableDatabase constructor
* @return the opened database, nullptr on failure
*/
std::unique_ptr<Xapian::WritableDatabase> openWritableDatabase(
const std::string& path, int flags = 0, int blockSize = 0 );
/**
* Open a Xapian database for reading.
* @param[in] path location of the database
* @param[in] flags forwarded to the Xapian::Database constructor
* @return the opened database, nullptr on failure
*/
std::unique_ptr<Xapian::Database> openReadOnlyDatabase(
const std::string& path, int flags = 0 );
/// Format the given timestamp as an UTC date in the form YYYYMMDD
std::string timetToXapianDate(const rstime_t& time);
}

View File

@ -1,276 +0,0 @@
/*******************************************************************************
* libretroshare/src/crypto: crypto.h *
* *
* libretroshare: retroshare core library *
* *
* Copyright (C) 2018 Gioacchino Mazzurco <gio@eigenlab.org> *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as *
* published by the Free Software Foundation, either version 3 of the *
* License, or (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <https://www.gnu.org/licenses/>. *
* *
*******************************************************************************/
#pragma once
#include "util/rstime.h"
#include <vector>
#include <xapian.h>
#include "retroshare/rsgxschannels.h"
#include "retroshare/rsinit.h"
#include "util/rsurl.h"
/* Fallback definition, used only when XAPIAN_AT_LEAST is not already defined.
 * XAPIAN_AT_LEAST(A,B,C) is true when the Xapian version being compiled
 * against, as given by XAPIAN_MAJOR_VERSION, XAPIAN_MINOR_VERSION and
 * XAPIAN_REVISION, is A.B.C or newer. */
#ifndef XAPIAN_AT_LEAST
#define XAPIAN_AT_LEAST(A,B,C) (XAPIAN_MAJOR_VERSION > (A) || \
(XAPIAN_MAJOR_VERSION == (A) && \
(XAPIAN_MINOR_VERSION > (B) || \
(XAPIAN_MINOR_VERSION == (B) && XAPIAN_REVISION >= (C)))))
#endif // ndef XAPIAN_AT_LEAST
struct DeepSearch
{
struct SearchResult
{
std::string mUrl;
std::string mSnippet;
};
/**
* @param[in] maxResults maximum number of acceptable search results, 0 for
* no limits
* @return search results count
*/
static uint32_t search( const std::string& queryStr,
std::vector<SearchResult>& results,
uint32_t maxResults = 100 )
{
results.clear();
Xapian::Database db;
// Open the database we're going to search.
try { db = Xapian::Database(dbPath()); }
catch(Xapian::DatabaseOpeningError e)
{
std::cerr << __PRETTY_FUNCTION__ << " " << e.get_msg()
<< ", probably nothing has been indexed yet."<< std::endl;
return 0;
}
catch(Xapian::DatabaseError e)
{
std::cerr << __PRETTY_FUNCTION__ << " " << e.get_msg()
<< " this is fishy, maybe " << dbPath()
<< " has been corrupted (deleting it may help in that "
<< "case without loosing data)" << std::endl;
return 0;
}
// Set up a QueryParser with a stemmer and suitable prefixes.
Xapian::QueryParser queryparser;
//queryparser.set_stemmer(Xapian::Stem("en"));
queryparser.set_stemming_strategy(queryparser.STEM_SOME);
// Start of prefix configuration.
//queryparser.add_prefix("title", "S");
//queryparser.add_prefix("description", "XD");
// End of prefix configuration.
// And parse the query.
Xapian::Query query = queryparser.parse_query(queryStr);
// Use an Enquire object on the database to run the query.
Xapian::Enquire enquire(db);
enquire.set_query(query);
Xapian::MSet mset = enquire.get_mset(
0, maxResults ? maxResults : db.get_doccount() );
for ( Xapian::MSetIterator m = mset.begin(); m != mset.end(); ++m )
{
const Xapian::Document& doc = m.get_document();
SearchResult s;
s.mUrl = doc.get_value(URL_VALUENO);
#if XAPIAN_AT_LEAST(1,3,5)
s.mSnippet = mset.snippet(doc.get_data());
#endif // XAPIAN_AT_LEAST(1,3,5)
results.push_back(s);
}
return results.size();
}
static void indexChannelGroup(const RsGxsChannelGroup& chan)
{
Xapian::WritableDatabase db(dbPath(), Xapian::DB_CREATE_OR_OPEN);
// Set up a TermGenerator that we'll use in indexing.
Xapian::TermGenerator termgenerator;
//termgenerator.set_stemmer(Xapian::Stem("en"));
// We make a document and tell the term generator to use this.
Xapian::Document doc;
termgenerator.set_document(doc);
// Index each field with a suitable prefix.
termgenerator.index_text(chan.mMeta.mGroupName, 1, "G");
termgenerator.index_text(timetToXapianDate(chan.mMeta.mPublishTs), 1, "D");
termgenerator.index_text(chan.mDescription, 1, "XD");
// Index fields without prefixes for general search.
termgenerator.index_text(chan.mMeta.mGroupName);
termgenerator.increase_termpos();
termgenerator.index_text(chan.mDescription);
RsUrl chanUrl; chanUrl
.setScheme("retroshare").setPath("/channel")
.setQueryKV("id", chan.mMeta.mGroupId.toStdString());
const std::string idTerm("Q" + chanUrl.toString());
chanUrl.setQueryKV("publishTs", std::to_string(chan.mMeta.mPublishTs));
chanUrl.setQueryKV("name", chan.mMeta.mGroupName);
if(!chan.mMeta.mAuthorId.isNull())
chanUrl.setQueryKV("authorId", chan.mMeta.mAuthorId.toStdString());
if(chan.mMeta.mSignFlags)
chanUrl.setQueryKV( "signFlags",
std::to_string(chan.mMeta.mSignFlags) );
std::string rsLink(chanUrl.toString());
// store the RS link so we are able to retrive it on matching search
doc.add_value(URL_VALUENO, rsLink);
// Store some fields for display purposes.
doc.set_data(chan.mMeta.mGroupName + "\n" + chan.mDescription);
// We use the identifier to ensure each object ends up in the
// database only once no matter how many times we run the
// indexer. "Q" prefix is a Xapian convention for unique id term.
doc.add_boolean_term(idTerm);
db.replace_document(idTerm, doc);
}
static void removeChannelFromIndex(RsGxsGroupId grpId)
{
// "Q" prefix is a Xapian convention for unique id term.
RsUrl chanUrl; chanUrl
.setScheme("retroshare").setPath("/channel")
.setQueryKV("id", grpId.toStdString());
std::string idTerm("Q" + chanUrl.toString());
Xapian::WritableDatabase db(dbPath(), Xapian::DB_CREATE_OR_OPEN);
db.delete_document(idTerm);
}
static void indexChannelPost(const RsGxsChannelPost& post)
{
Xapian::WritableDatabase db(dbPath(), Xapian::DB_CREATE_OR_OPEN);
// Set up a TermGenerator that we'll use in indexing.
Xapian::TermGenerator termgenerator;
//termgenerator.set_stemmer(Xapian::Stem("en"));
// We make a document and tell the term generator to use this.
Xapian::Document doc;
termgenerator.set_document(doc);
// Index each field with a suitable prefix.
termgenerator.index_text(post.mMeta.mMsgName, 1, "S");
termgenerator.index_text(timetToXapianDate(post.mMeta.mPublishTs), 1, "D");
// Avoid indexing HTML
bool isPlainMsg = post.mMsg[0] != '<' || post.mMsg[post.mMsg.size() - 1] != '>';
if(isPlainMsg)
termgenerator.index_text(post.mMsg, 1, "XD");
// Index fields without prefixes for general search.
termgenerator.index_text(post.mMeta.mMsgName);
if(isPlainMsg)
{
termgenerator.increase_termpos();
termgenerator.index_text(post.mMsg);
}
for(const RsGxsFile& attachment : post.mFiles)
{
termgenerator.index_text(attachment.mName, 1, "F");
termgenerator.increase_termpos();
termgenerator.index_text(attachment.mName);
}
// We use the identifier to ensure each object ends up in the
// database only once no matter how many times we run the
// indexer.
RsUrl postUrl; postUrl
.setScheme("retroshare").setPath("/channel")
.setQueryKV("id", post.mMeta.mGroupId.toStdString())
.setQueryKV("msgid", post.mMeta.mMsgId.toStdString());
std::string idTerm("Q" + postUrl.toString());
postUrl.setQueryKV("publishTs", std::to_string(post.mMeta.mPublishTs));
postUrl.setQueryKV("name", post.mMeta.mMsgName);
postUrl.setQueryKV("authorId", post.mMeta.mAuthorId.toStdString());
std::string rsLink(postUrl.toString());
// store the RS link so we are able to retrive it on matching search
doc.add_value(URL_VALUENO, rsLink);
// Store some fields for display purposes.
if(isPlainMsg)
doc.set_data(post.mMeta.mMsgName + "\n" + post.mMsg);
else doc.set_data(post.mMeta.mMsgName);
doc.add_boolean_term(idTerm);
db.replace_document(idTerm, doc);
}
static void removeChannelPostFromIndex(
RsGxsGroupId grpId, RsGxsMessageId msgId )
{
RsUrl postUrl; postUrl
.setScheme("retroshare").setPath("/channel")
.setQueryKV("id", grpId.toStdString())
.setQueryKV("msgid", msgId.toStdString());
// "Q" prefix is a Xapian convention for unique id term.
std::string idTerm("Q" + postUrl.toString());
Xapian::WritableDatabase db(dbPath(), Xapian::DB_CREATE_OR_OPEN);
db.delete_document(idTerm);
}
private:
enum : Xapian::valueno
{
/// Used to store retroshare url of indexed documents
URL_VALUENO,
/// @see Xapian::BAD_VALUENO
BAD_VALUENO = Xapian::BAD_VALUENO
};
static const std::string& dbPath()
{
static const std::string dbDir =
RsAccounts::AccountDirectory() + "/deep_search_xapian_db";
return dbDir;
}
static std::string timetToXapianDate(const rstime_t& time)
{
char date[] = "YYYYMMDD\0";
time_t tTime = static_cast<time_t>(time);
std::strftime(date, 9, "%Y%m%d", std::gmtime(&tTime));
return date;
}
};

View File

@ -0,0 +1,156 @@
/*******************************************************************************
* RetroShare full text indexing and search implementation based on Xapian *
* *
* Copyright (C) 2018-2019 Gioacchino Mazzurco <gio@eigenlab.org> *
* Copyright (C) 2019 Asociación Civil Altermundi <info@altermundi.net> *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Affero General Public License version 3 as *
* published by the Free Software Foundation. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Affero General Public License for more details. *
* *
* You should have received a copy of the GNU Affero General Public License *
* along with this program. If not, see <https://www.gnu.org/licenses/>. *
* *
*******************************************************************************/
#include "deep_search/filesindex.hpp"
#include "util/rsdebug.h"
#include <xapian.h>
#include <string>
#include <FLAC++/metadata.h>
#include <cctype>
#include <memory>
/**
 * Files indexer extracting Vorbis comments from FLAC and OggFLAC files.
 * Constructing an instance registers indexFlacFile into DeepFilesIndex.
 */
struct RsDeepFlacFileIndexer
{
	RsDeepFlacFileIndexer()
	{
		/* NOTE(review): 31 is the key under which the indexer is stored in the
		 * register, presumably an ordering among indexers, confirm against
		 * DeepFilesIndex::registerIndexer */
		DeepFilesIndex::registerIndexer(31, indexFlacFile);
	}

	/**
	 * Index the Vorbis comments found in the given file.
	 *
	 * @param[in] path path of the file to index
	 * @param[inout] xTG term generator receiving the extracted text
	 * @param[inout] xDoc document whose data gets the indexed comments
	 *	appended, one per line
	 * @return 99 when at least one comment has been indexed, 30 when the file
	 *	is a valid FLAC without usable comments, 0 when the file is neither
	 *	FLAC nor OggFLAC, 1 when the FLAC library objects cannot be set up
	 */
	static uint32_t indexFlacFile(
	        const std::string& path, const std::string& /*name*/,
	        Xapian::TermGenerator& xTG, Xapian::Document& xDoc )
	{
		Dbg3() << __PRETTY_FUNCTION__ << " " << path << std::endl;

		using FlacChain = FLAC::Metadata::Chain;
		std::unique_ptr<FlacChain> flacChain(new FlacChain);

		if(!flacChain->is_valid())
		{
			RsErr() << __PRETTY_FUNCTION__ << " Failed creating FLAC Chain 1"
			        << std::endl;
			return 1;
		}

		// First attempt to read the file as native FLAC
		if(!flacChain->read(path.c_str(), false))
		{
			Dbg3() << __PRETTY_FUNCTION__ << " Failed to open the file as FLAC"
			       << std::endl;

			// Second attempt with a fresh chain, reading as OggFLAC
			flacChain.reset(new FlacChain);
			if(!flacChain->is_valid())
			{
				RsErr() << __PRETTY_FUNCTION__
				        << " Failed creating FLAC Chain 2" << std::endl;
				return 1;
			}
			if(!flacChain->read(path.c_str(), true))
			{
				Dbg3() << __PRETTY_FUNCTION__
				       << " Failed to open the file as OggFLAC"
				       << std::endl;
				return 0;
			}
		}

		unsigned validCommentsCnt = 0;
		std::string docData = xDoc.get_data();

		FLAC::Metadata::Iterator mdit;
		mdit.init(*flacChain);
		if(!mdit.is_valid()) return 1;

		do
		{
			::FLAC__MetadataType mdt = mdit.get_block_type();
			if (mdt != FLAC__METADATA_TYPE_VORBIS_COMMENT) continue;

			Dbg2() << __PRETTY_FUNCTION__ << " Found Vorbis Comment Block"
			       << std::endl;

			std::unique_ptr<FLAC::Metadata::Prototype> proto(mdit.get_block());
			if(!proto) continue;

			const FLAC::Metadata::VorbisComment* vc =
			        dynamic_cast<FLAC::Metadata::VorbisComment*>(proto.get());
			if(!vc || !vc->is_valid()) continue;

			unsigned numComments = vc->get_num_comments();
			for(unsigned i = 0; i < numComments; ++i)
			{
				FLAC::Metadata::VorbisComment::Entry entry =
				        vc->get_comment(i);
				if(!entry.is_valid()) continue;

				std::string tagName( entry.get_field_name(),
				                     entry.get_field_name_length() );

				/* Vorbis tags should be uppercase but not all the software
				 * enforces it. The character is converted to unsigned char
				 * first because passing a negative value to toupper is
				 * undefined behaviour, and plain char may be signed. */
				for (auto& c: tagName)
					c = static_cast<char>(
					            std::toupper(static_cast<unsigned char>(c)) );

				std::string tagValue( entry.get_field_value(),
				                      entry.get_field_value_length() );

				if(tagValue.empty()) continue;

				if(tagName == "ARTIST")
					xTG.index_text(tagValue, 1, "A");
				else if (tagName == "DESCRIPTION")
					xTG.index_text(tagValue, 1, "XD");
				else if (tagName == "TITLE")
					xTG.index_text(tagValue, 1, "S");
				else if(tagName.find("COVERART") != tagName.npos)
					continue; // Avoid polluting the index with binary data
				else if (tagName.find("METADATA_BLOCK_PICTURE") != tagName.npos)
					continue; // Avoid polluting the index with binary data

				// Index fields without prefixes for general search.
				xTG.increase_termpos();
				std::string fullComment(tagName + "=" + tagValue);
				xTG.index_text(fullComment);
				docData += fullComment + "\n";

				Dbg2() << __PRETTY_FUNCTION__ << " Indexed " << fullComment
				       << std::endl;

				++validCommentsCnt;
			}
		}
		while(mdit.next());

		if(validCommentsCnt > 0)
		{
			Dbg1() << __PRETTY_FUNCTION__ << " Successfully indexed: " << path
			       << std::endl;

			xDoc.set_data(docData);
			return 99;
		}

		/* Although the file appears to be a valid FLAC, no Vorbis comment has
		 * been found, so return less than 50: maybe it has been tagged only
		 * with ID3 tags? */
		return 30;
	}

	RS_SET_CONTEXT_DEBUG_LEVEL(3)
};

View File

@ -0,0 +1,171 @@
/*******************************************************************************
* RetroShare full text indexing and search implementation based on Xapian *
* *
* Copyright (C) 2018-2019 Gioacchino Mazzurco <gio@eigenlab.org> *
* Copyright (C) 2019 Asociación Civil Altermundi <info@altermundi.net> *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Affero General Public License version 3 as *
* published by the Free Software Foundation. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Affero General Public License for more details. *
* *
* You should have received a copy of the GNU Affero General Public License *
* along with this program. If not, see <https://www.gnu.org/licenses/>. *
* *
*******************************************************************************/
#include "deep_search/filesindex.hpp"
#include "deep_search/commonutils.hpp"
#include "util/rsdebug.h"
#include "retroshare/rsinit.h"
#include "retroshare/rsversion.h"
#include <utility>
/* Registry of the available file indexers, keyed by their execution order.
* It is defined here, before the static indexer objects declared at the end
* of this file (e.g. the Ogg and TagLib ones), whose constructors insert into
* it through registerIndexer(). */
/*static*/ std::multimap<int, DeepFilesIndex::IndexerFunType>
DeepFilesIndex::indexersRegister = {};
bool DeepFilesIndex::indexFile(
const std::string& path, const std::string& name,
const RsFileHash& hash )
{
auto dbPtr = DeepSearch::openWritableDatabase(
mDbPath, Xapian::DB_CREATE_OR_OPEN );
if(!dbPtr) return false;
Xapian::WritableDatabase& db(*dbPtr);
const std::string hashString = hash.toStdString();
const std::string idTerm("Q" + hashString);
Xapian::Document oldDoc;
Xapian::PostingIterator pIt = db.postlist_begin(idTerm);
if( pIt != db.postlist_end(idTerm) )
{
oldDoc = db.get_document(*pIt);
if( oldDoc.get_value(INDEXER_VERSION_VALUENO) ==
RS_HUMAN_READABLE_VERSION &&
std::stoull(oldDoc.get_value(INDEXERS_COUNT_VALUENO)) ==
indexersRegister.size() )
{
/* Looks like this file has already been indexed by this RetroShare
* exact version, so we can skip it. If the version was different it
* made sense to reindex it as better indexers might be available
* since last time it was indexed */
Dbg3() << __PRETTY_FUNCTION__ << " skipping laready indexed file: "
<< hash << " " << name << std::endl;
return true;
}
}
Xapian::Document doc;
// Set up a TermGenerator that we'll use in indexing.
Xapian::TermGenerator termgenerator;
//termgenerator.set_stemmer(Xapian::Stem("en"));
termgenerator.set_document(doc);
for(auto& indexerPair : indexersRegister)
if(indexerPair.second(path, name, termgenerator, doc) > 50)
break;
doc.add_boolean_term(idTerm);
termgenerator.index_text(name, 1, "N");
termgenerator.index_text(name);
doc.add_value(FILE_HASH_VALUENO, hashString);
doc.add_value(INDEXER_VERSION_VALUENO, RS_HUMAN_READABLE_VERSION);
doc.add_value(
INDEXERS_COUNT_VALUENO,
std::to_string(indexersRegister.size()) );
db.replace_document(idTerm, doc);
return true;
}
/**
 * @brief Delete from the index the document identified by the given file hash
 * @return false if the database could not be opened, true otherwise
 */
bool DeepFilesIndex::removeFileFromIndex(const RsFileHash& hash)
{
	Dbg3() << __PRETTY_FUNCTION__ << " removing file from index: "
	       << hash << std::endl;

	std::unique_ptr<Xapian::WritableDatabase> dbPtr =
	        DeepSearch::openWritableDatabase(
	            mDbPath, Xapian::DB_CREATE_OR_OPEN );
	if(!dbPtr) return false;

	/* Documents are keyed by the "Q" prefixed hash term */
	const std::string idTerm("Q" + hash.toStdString());
	dbPtr->delete_document(idTerm);

	return true;
}
/// @return the files index database path inside the account directory
/*static*/ std::string DeepFilesIndex::dbDefaultPath()
{
	const std::string accountDir(RsAccounts::AccountDirectory());
	return accountDir + "/deep_files_index_xapian_db";
}
/**
 * @brief Add an indexer function to the registry
 * @param[in] order key under which the indexer is stored, indexFile() walks
 *	the registry in ascending key order
 * @param[in] indexerFun the indexer function
 * @return always true
 */
/*static*/ bool DeepFilesIndex::registerIndexer(
        int order, const DeepFilesIndex::IndexerFunType& indexerFun )
{
	Dbg1() << __PRETTY_FUNCTION__ << " " << order << std::endl;

	indexersRegister.emplace(order, indexerFun);
	return true;
}
uint32_t DeepFilesIndex::search(
const std::string& queryStr,
std::vector<DeepFilesSearchResult>& results, uint32_t maxResults )
{
results.clear();
auto dbPtr = DeepSearch::openReadOnlyDatabase(mDbPath);
if(!dbPtr) return 0;
Xapian::Database& db(*dbPtr);
// Set up a QueryParser with a stemmer and suitable prefixes.
Xapian::QueryParser queryparser;
//queryparser.set_stemmer(Xapian::Stem("en"));
queryparser.set_stemming_strategy(queryparser.STEM_SOME);
// Start of prefix configuration.
//queryparser.add_prefix("title", "S");
//queryparser.add_prefix("description", "XD");
// End of prefix configuration.
// And parse the query.
Xapian::Query query = queryparser.parse_query(queryStr);
// Use an Enquire object on the database to run the query.
Xapian::Enquire enquire(db);
enquire.set_query(query);
Xapian::MSet mset = enquire.get_mset(
0, maxResults ? maxResults : db.get_doccount() );
for ( Xapian::MSetIterator m = mset.begin(); m != mset.end(); ++m )
{
const Xapian::Document& doc = m.get_document();
DeepFilesSearchResult s;
s.mFileHash = RsFileHash(doc.get_value(FILE_HASH_VALUENO));
s.mWeight = m.get_weight();
#if XAPIAN_AT_LEAST(1,3,5)
s.mSnippet = mset.snippet(doc.get_data());
#endif // XAPIAN_AT_LEAST(1,3,5)
results.push_back(s);
}
return static_cast<uint32_t>(results.size());
}
/* One static instance per indexer enabled at build time. The Ogg and TagLib
* indexer constructors call DeepFilesIndex::registerIndexer(); presumably the
* FLAC one does the same. These objects are defined after
* DeepFilesIndex::indexersRegister in this same translation unit, so the
* registry is already constructed when they run. */
#ifdef RS_DEEP_FILES_INDEX_OGG
# include "deep_search/filesoggindexer.hpp"
static RsDeepOggFileIndexer oggFileIndexer;
#endif // def RS_DEEP_FILES_INDEX_OGG
#ifdef RS_DEEP_FILES_INDEX_FLAC
# include "deep_search/filesflacindexer.hpp"
static RsDeepFlacFileIndexer flacFileIndexer;
#endif // def RS_DEEP_FILES_INDEX_FLAC
#ifdef RS_DEEP_FILES_INDEX_TAGLIB
# include "deep_search/filestaglibindexer.hpp"
static RsDeepTaglibFileIndexer taglibFileIndexer;
#endif // def RS_DEEP_FILES_INDEX_TAGLIB

View File

@ -0,0 +1,103 @@
/*******************************************************************************
* RetroShare full text indexing and search implementation based on Xapian *
* *
* Copyright (C) 2018-2019 Gioacchino Mazzurco <gio@eigenlab.org> *
* Copyright (C) 2019 Asociación Civil Altermundi <info@altermundi.net> *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Affero General Public License version 3 as *
* published by the Free Software Foundation. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Affero General Public License for more details. *
* *
* You should have received a copy of the GNU Affero General Public License *
* along with this program. If not, see <https://www.gnu.org/licenses/>. *
* *
*******************************************************************************/
#pragma once
#include "retroshare/rstypes.h"
#include "util/rsdebug.h"
#include <string>
#include <cstdint>
#include <vector>
#include <xapian.h>
#include <map>
#include <functional>
/// A single match produced by DeepFilesIndex::search
struct DeepFilesSearchResult
{
/// Start with a zero weight
DeepFilesSearchResult() : mWeight(0) {}
/// Hash of the matching file, read back from the FILE_HASH_VALUENO slot
RsFileHash mFileHash;
/// Weight Xapian assigned to the match
double mWeight;
/** Snippet generated from the document data, filled only when built against
* Xapian 1.3.5 or later */
std::string mSnippet;
};
/**
* Full text index of files backed by a Xapian database. Metadata extraction is
* delegated to indexer functions registered through registerIndexer().
*/
class DeepFilesIndex
{
public:
/// @param[in] dbPath path of the Xapian database to operate on
DeepFilesIndex(const std::string& dbPath) : mDbPath(dbPath) {}
/**
* @brief Search indexed files
* @param[in] queryStr query string, parsed by Xapian::QueryParser
* @param[out] results storage for the matches, cleared before searching
* @param[in] maxResults maximum number of acceptable search results, 0 for
* no limits
* @return search results count
*/
uint32_t search( const std::string& queryStr,
std::vector<DeepFilesSearchResult>& results,
uint32_t maxResults = 100 );
/**
* @brief Index a file, or refresh its entry if already indexed
* @param[in] path path of the file, handed to the registered indexers
* @param[in] name name of the file, always indexed
* @param[in] hash hash of the file, used to identify the document
* @return false if the database could not be opened, true otherwise.
* NOTE(review): the current implementation returns true also when no
* registered indexer recognized the file type.
*/
bool indexFile(
const std::string& path, const std::string& name,
const RsFileHash& hash );
/**
* @brief Remove file entry from database
* @return false on error, true otherwise.
*/
bool removeFileFromIndex(const RsFileHash& hash);
/// @return default database path, located inside the account directory
static std::string dbDefaultPath();
/**
* Signature of an indexer function. It receives the file path and name plus
* the term generator and document to fill, and returns a score; indexFile()
* stops trying further indexers as soon as one returns more than 50.
*/
using IndexerFunType = std::function<
uint32_t( const std::string& path, const std::string& name,
Xapian::TermGenerator& xTG, Xapian::Document& xDoc ) >;
/**
* @brief Register an indexer function
* @param[in] order indexers are tried in ascending order
* @param[in] indexerFun the indexer function
* @return always true in the current implementation
*/
static bool registerIndexer(
int order, const IndexerFunType& indexerFun );
private:
/* Value slots of the stored documents. The enumerator order defines the slot
* numbers written into the database, so do not reorder them. */
enum : Xapian::valueno
{
/// Used to store RsFileHash of indexed documents
FILE_HASH_VALUENO,
/** Used to check if some file need reindex because was indexed with an
* older version of the indexer */
INDEXER_VERSION_VALUENO,
/** Used to check if some file need reindex because was indexed with a
* different number of registered indexers */
INDEXERS_COUNT_VALUENO,
/// @see Xapian::BAD_VALUENO
BAD_VALUENO = Xapian::BAD_VALUENO
};
/// Path of the Xapian database
const std::string mDbPath;
/** Storage for indexers function by order */
static std::multimap<int, IndexerFunType> indexersRegister;
RS_SET_CONTEXT_DEBUG_LEVEL(1)
};

View File

@ -0,0 +1,97 @@
/*******************************************************************************
* RetroShare full text indexing and search implementation based on Xapian *
* *
* Copyright (C) 2018-2019 Gioacchino Mazzurco <gio@eigenlab.org> *
* Copyright (C) 2019 Asociación Civil Altermundi <info@altermundi.net> *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Affero General Public License version 3 as *
* published by the Free Software Foundation. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Affero General Public License for more details. *
* *
* You should have received a copy of the GNU Affero General Public License *
* along with this program. If not, see <https://www.gnu.org/licenses/>. *
* *
*******************************************************************************/
#include "deep_search/filesindex.hpp"
#include "util/rsdebug.h"
#include <xapian.h>
#include <string>
#include <vorbis/vorbisfile.h>
#include <cctype>
struct RsDeepOggFileIndexer
{
RsDeepOggFileIndexer()
{
DeepFilesIndex::registerIndexer(30, indexOggFile);
}
static uint32_t indexOggFile(
const std::string& path, const std::string& /*name*/,
Xapian::TermGenerator& xTG, Xapian::Document& xDoc )
{
Dbg3() << __PRETTY_FUNCTION__ << " " << path << std::endl;
OggVorbis_File vf;
int ret = ov_fopen(path.c_str(), &vf);
if(ret == 0 && vf.vc)
{
vorbis_comment& vc = *vf.vc;
std::string docData = xDoc.get_data();
for (int i = 0; i < vc.comments; ++i)
{
using szt = std::string::size_type;
std::string userComment(
vc.user_comments[i],
static_cast<szt>(vc.comment_lengths[i]) );
if(userComment.empty()) continue;
szt equalPos = userComment.find('=');
if(equalPos == std::string::npos) continue;
std::string tagName = userComment.substr(0, equalPos);
if(tagName.empty()) continue;
std::string tagValue = userComment.substr(equalPos + 1);
if(tagValue.empty()) continue;
/* Ogg tags should be uppercases but not all the softwares
* enforce it */
for (auto& c: tagName) c = static_cast<char>(toupper(c));
if(tagName == "ARTIST")
xTG.index_text(tagValue, 1, "A");
else if (tagName == "DESCRIPTION")
xTG.index_text(tagValue, 1, "XD");
else if (tagName == "TITLE")
xTG.index_text(tagValue, 1, "S");
else if(tagName.find("COVERART") != tagName.npos)
continue; // Avoid polluting the index with binary data
else if (tagName.find("METADATA_BLOCK_PICTURE") != tagName.npos)
continue; // Avoid polluting the index with binary data
// Index fields without prefixes for general search.
xTG.increase_termpos();
xTG.index_text(userComment);
docData += userComment + "\n";
}
xDoc.set_data(docData);
ov_clear(&vf);
return 99;
}
return 0;
}
RS_SET_CONTEXT_DEBUG_LEVEL(1)
};

View File

@ -0,0 +1,103 @@
/*******************************************************************************
* RetroShare full text indexing and search implementation based on Xapian *
* *
* Copyright (C) 2018-2019 Gioacchino Mazzurco <gio@eigenlab.org> *
* Copyright (C) 2019 Asociación Civil Altermundi <info@altermundi.net> *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Affero General Public License version 3 as *
* published by the Free Software Foundation. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Affero General Public License for more details. *
* *
* You should have received a copy of the GNU Affero General Public License *
* along with this program. If not, see <https://www.gnu.org/licenses/>. *
* *
*******************************************************************************/
#include "deep_search/filesindex.hpp"
#include "util/rsdebug.h"
#include <xapian.h>
#include <string>
#include <memory>
#include <taglib/tag.h>
#include <taglib/fileref.h>
#include <taglib/tpropertymap.h>
/**
 * Deep files indexer based on TagLib. Constructing an instance registers
 * indexFile() with order 40 into DeepFilesIndex.
 */
struct RsDeepTaglibFileIndexer
{
	RsDeepTaglibFileIndexer()
	{ DeepFilesIndex::registerIndexer(40, indexFile); }

	/**
	 * @brief Index the tags TagLib is able to read from a file
	 * @param[in] path path of the file to open
	 * @param[inout] xTG term generator used to index the tags
	 * @param[inout] xDoc document whose data gets the tags appended
	 * @return 0 if TagLib cannot handle the file or it has no tag object, 30
	 *	if no usable tag has been found, 99 if at least one tag has been
	 *	indexed
	 */
	static uint32_t indexFile(
	        const std::string& path, const std::string& /*name*/,
	        Xapian::TermGenerator& xTG, Xapian::Document& xDoc )
	{
		Dbg4() << __PRETTY_FUNCTION__ << " " << path << std::endl;

		TagLib::FileRef tFile(path.c_str());
		if(tFile.isNull()) return 0;

		const TagLib::Tag* tag = tFile.tag();
		if(!tag) return 0;

		TagLib::PropertyMap tMap = tag->properties();

		std::string docData = xDoc.get_data();
		unsigned validCommentsCnt = 0;

		TagLib::PropertyMap::ConstIterator mIt = tMap.begin();
		for(; mIt != tMap.end(); ++mIt)
		{
			if(mIt->first.isNull() || mIt->first.isEmpty()) continue;
			const std::string tagName(mIt->first.upper().to8Bit());

			if(mIt->second.isEmpty()) continue;
			const std::string tagValue(
			            mIt->second.toString(", ").to8Bit(true) );
			if(tagValue.empty()) continue;

			/* Avoid polluting the index with binary data */
			const bool isPicture =
			        tagName.find("COVERART") != std::string::npos ||
			        tagName.find("METADATA_BLOCK_PICTURE") != std::string::npos;
			if(isPicture) continue;

			/* Well known tags are indexed with their own prefix too */
			const char* termPrefix = nullptr;
			if(tagName == "ARTIST") termPrefix = "A";
			else if(tagName == "DESCRIPTION") termPrefix = "XD";
			else if(tagName == "TITLE") termPrefix = "S";
			if(termPrefix) xTG.index_text(tagValue, 1, termPrefix);

			/* Every tag is also indexed without prefix for general search */
			xTG.increase_termpos();
			const std::string fullComment(tagName + "=" + tagValue);
			xTG.index_text(fullComment);
			docData += fullComment + "\n";

			Dbg2() << __PRETTY_FUNCTION__ << " Indexed " << tagName << "=\""
			       << tagValue << '"' << std::endl;

			++validCommentsCnt;
		}

		/* Although the file appears to be supported by taglib, no comments
		 * have been found, so return less than 50: maybe another indexer is
		 * capable of extracting information */
		if(validCommentsCnt == 0) return 30;

		Dbg1() << __PRETTY_FUNCTION__ << " Successfully indexed: " << path
		       << std::endl;
		xDoc.set_data(docData);
		return 99;
	}

	RS_SET_CONTEXT_DEBUG_LEVEL(3)
};

View File

@ -21,15 +21,19 @@
******************************************************************************/
#include <sstream>
#include <algorithm>
#include "util/rstime.h"
#include "util/rsdir.h"
#include "util/rsprint.h"
#include "retroshare/rsexpr.h"
#include "dir_hierarchy.h"
#include "filelist_io.h"
#include "file_sharing_defaults.h"
#ifdef RS_DEEP_FILES_INDEX
# include "deep_search/filesindex.hpp"
#endif // def RS_DEEP_FILES_INDEX
//#define DEBUG_DIRECTORY_STORAGE 1
typedef FileListIO::read_error read_error;
@ -391,6 +395,11 @@ void InternalFileHierarchyStorage::deleteFileNode(uint32_t index)
{
FileEntry& fe(*static_cast<FileEntry*>(mNodes[index])) ;
#ifdef RS_DEEP_FILES_INDEX
DeepFilesIndex tfi(DeepFilesIndex::dbDefaultPath());
tfi.removeFileFromIndex(fe.file_hash);
#endif
if(mTotalSize >= fe.file_size)
mTotalSize -= fe.file_size ;
@ -753,7 +762,9 @@ int InternalFileHierarchyStorage::searchBoolExp(RsRegularExpression::Expression
return 0;
}
int InternalFileHierarchyStorage::searchTerms(const std::list<std::string>& terms, std::list<DirectoryStorage::EntryIndex> &results) const
int InternalFileHierarchyStorage::searchTerms(
const std::list<std::string>& terms,
std::list<DirectoryStorage::EntryIndex>& results ) const
{
// most entries are likely to be files, so we could do a linear search over the entries tab.
// instead we go through the table of hashes.

View File

@ -20,6 +20,7 @@
* *
******************************************************************************/
#include <set>
#include "util/rstime.h"
#include "serialiser/rstlvbinary.h"
#include "retroshare/rspeers.h"
@ -30,6 +31,10 @@
#include "dir_hierarchy.h"
#include "filelist_io.h"
#ifdef RS_DEEP_FILES_INDEX
# include "deep_search/filesindex.hpp"
#endif // def RS_DEEP_FILES_INDEX
//#define DEBUG_REMOTE_DIRECTORY_STORAGE 1
/******************************************************************************************************************/
@ -180,7 +185,9 @@ void DirectoryStorage::print()
mFileHierarchy->print();
}
int DirectoryStorage::searchTerms(const std::list<std::string>& terms, std::list<EntryIndex> &results) const
int DirectoryStorage::searchTerms(
const std::list<std::string>& terms,
std::list<EntryIndex>& results ) const
{
RS_STACK_MUTEX(mDirStorageMtx) ;
return mFileHierarchy->searchTerms(terms,results);
@ -501,18 +508,39 @@ void LocalDirectoryStorage::updateTimeStamps()
#endif
}
}
bool LocalDirectoryStorage::updateHash(const EntryIndex& index, const RsFileHash& hash, bool update_internal_hierarchy)
bool LocalDirectoryStorage::updateHash(
const EntryIndex& index, const RsFileHash& hash,
bool update_internal_hierarchy )
{
RS_STACK_MUTEX(mDirStorageMtx) ;
bool ret = false;
{
RS_STACK_MUTEX(mDirStorageMtx);
mEncryptedHashes[makeEncryptedHash(hash)] = hash ;
mChanged = true ;
#ifdef DEBUG_LOCAL_DIRECTORY_STORAGE
std::cerr << "Updating index of hash " << hash << " update_internal=" << update_internal_hierarchy << std::endl;
std::cerr << "Updating index of hash " << hash << " update_internal="
<< update_internal_hierarchy << std::endl;
#endif
return (!update_internal_hierarchy)|| mFileHierarchy->updateHash(index,hash);
ret = (!update_internal_hierarchy) ||
mFileHierarchy->updateHash(index,hash);
} // RS_STACK_MUTEX(mDirStorageMtx);
#ifdef RS_DEEP_FILES_INDEX
FileInfo fInfo;
if( ret && getFileInfo(index, fInfo) &&
fInfo.storage_permission_flags & DIR_FLAGS_ANONYMOUS_SEARCH )
{
DeepFilesIndex dfi(DeepFilesIndex::dbDefaultPath());
ret &= dfi.indexFile(fInfo.path, fInfo.fname, hash);
}
#endif // def RS_DEEP_FILES_INDEX
return ret;
}
std::string LocalDirectoryStorage::locked_findRealRootFromVirtualFilename(const std::string& virtual_rootdir) const
{

View File

@ -54,6 +54,10 @@
#include <iostream>
#include "util/rstime.h"
#ifdef RS_DEEP_FILES_INDEX
# include "deep_search/filesindex.hpp"
#endif // def RS_DEEP_FILES_INDEX
/***
* #define SERVER_DEBUG 1
* #define SERVER_DEBUG_CACHE 1
@ -65,9 +69,26 @@
static const rstime_t FILE_TRANSFER_LOW_PRIORITY_TASKS_PERIOD = 5 ; // low priority tasks handling every 5 seconds
static const rstime_t FILE_TRANSFER_MAX_DELAY_BEFORE_DROP_USAGE_RECORD = 10 ; // keep usage records for 10 secs at most.
#ifdef RS_DEEP_FILES_INDEX
/**
 * Build a search result entry from a deep index match. Hash, weight and
 * snippet come from the match itself; size and name are looked up among local
 * files through rsFiles. The outcome of that lookup is not checked here.
 */
TurtleFileInfoV2::TurtleFileInfoV2(const DeepFilesSearchResult& dRes) :
    fHash(dRes.mFileHash), fWeight(static_cast<float>(dRes.mWeight)),
    fSnippet(dRes.mSnippet)
{
	FileInfo localDetails;
	rsFiles->FileDetails(fHash, RS_FILE_HINTS_LOCAL, localDetails);

	fSize = localDetails.size;
	fName = localDetails.fname;
}
#endif // def RS_DEEP_FILES_INDEX
/// Out-of-line defaulted destructor
TurtleFileInfoV2::~TurtleFileInfoV2() = default;
/* Setup */
ftServer::ftServer(p3PeerMgr *pm, p3ServiceControl *sc)
: p3Service(),RsServiceSerializer(RS_SERVICE_TYPE_TURTLE), // should be FT, but this is for backward compatibility
ftServer::ftServer(p3PeerMgr *pm, p3ServiceControl *sc):
p3Service(),
// should be FT, but this is for backward compatibility
RsServiceSerializer(RS_SERVICE_TYPE_TURTLE),
mPeerMgr(pm), mServiceCtrl(sc),
mFileDatabase(NULL),
mFtController(NULL), mFtExtra(NULL),
@ -500,15 +521,24 @@ bool ftServer::FileDetails(const RsFileHash &hash, FileSearchFlags hintflags, Fi
return false;
}
RsItem *ftServer::create_item(uint16_t service,uint8_t item_type) const
RsItem *ftServer::create_item(uint16_t service, uint8_t item_type) const
{
#ifdef SERVER_DEBUG
FTSERVER_DEBUG() << "p3turtle: deserialising packet: " << std::endl ;
#endif
if (RS_SERVICE_TYPE_TURTLE != service)
RsServiceType serviceType = static_cast<RsServiceType>(service);
switch (serviceType)
{
FTSERVER_ERROR() << " Wrong type !!" << std::endl ;
return NULL; /* wrong type */
/* This one is here for retro-compatibility, as turtle routing and file
* transfer services were just one service before the turtle service was
* generalized */
case RsServiceType::TURTLE: break;
case RsServiceType::FILE_TRANSFER: break;
default:
RsErr() << __PRETTY_FUNCTION__ << " Wrong service type: " << service
<< std::endl;
return nullptr;
}
try
@ -521,16 +551,19 @@ RsItem *ftServer::create_item(uint16_t service,uint8_t item_type) const
case RS_TURTLE_SUBTYPE_FILE_MAP : return new RsTurtleFileMapItem();
case RS_TURTLE_SUBTYPE_CHUNK_CRC_REQUEST : return new RsTurtleChunkCrcRequestItem();
case RS_TURTLE_SUBTYPE_CHUNK_CRC : return new RsTurtleChunkCrcItem();
case static_cast<uint8_t>(RsFileItemType::FILE_SEARCH_REQUEST):
return new RsFileSearchRequestItem();
case static_cast<uint8_t>(RsFileItemType::FILE_SEARCH_RESULT):
return new RsFileSearchResultItem();
default:
return NULL ;
return nullptr;
}
}
catch(std::exception& e)
{
FTSERVER_ERROR() << "(EE) deserialisation error in " << __PRETTY_FUNCTION__ << ": " << e.what() << std::endl;
return NULL ;
return nullptr;
}
}
@ -1837,7 +1870,12 @@ void ftServer::ftReceiveSearchResult(RsTurtleFTSearchResultItem *item)
if(cbpt != mSearchCallbacksMap.end())
{
hasCallback = true;
cbpt->second.first(item->result);
std::vector<TurtleFileInfoV2> cRes;
for( const auto& tfiold : item->result)
cRes.push_back(tfiold);
cbpt->second.first(cRes);
}
} // end RS_STACK_MUTEX(mSearchCallbacksMapMutex);
@ -1845,6 +1883,99 @@ void ftServer::ftReceiveSearchResult(RsTurtleFTSearchResultItem *item)
RsServer::notify()->notifyTurtleSearchResult(item->PeerId(),item->request_id, item->result );
}
/**
 * @brief Handle a generic search request received through turtle
 *
 * When built with RS_DEEP_FILES_INDEX the request is deserialized, the query
 * is run against the deep files index and the matches are serialized into a
 * malloc allocated buffer handed back through searchResultData.
 *
 * @param[in] searchRequestData serialized search request
 * @param[in] searchRequestDataLen size of the serialized search request
 * @param[out] searchResultData serialized results, nullptr whenever false is
 *	returned
 * @param[out] searchResultDataLen size of the serialized results, 0 whenever
 *	false is returned
 * @param[in] maxAllowsHits passed to the index as maximum number of results
 * @return true if some result has been found and successfully serialized,
 *	false otherwise
 */
bool ftServer::receiveSearchRequest(
        unsigned char* searchRequestData, uint32_t searchRequestDataLen,
        unsigned char*& searchResultData, uint32_t& searchResultDataLen,
        uint32_t& maxAllowsHits )
{
	/* Put the output parameters in a defined state on every exit path, so the
	 * caller never sees a stale or dangling buffer when false is returned */
	searchResultData = nullptr;
	searchResultDataLen = 0;

#ifdef RS_DEEP_FILES_INDEX
	/* Sole owner of the deserialized item for the whole function */
	std::unique_ptr<RsItem> recvItem(
	            RsServiceSerializer::deserialise(
	                searchRequestData, &searchRequestDataLen ) );

	if(!recvItem)
	{
		RsWarn() << __PRETTY_FUNCTION__ << " Search request deserialization "
		         << "failed" << std::endl;
		return false;
	}

	const RsFileSearchRequestItem* searchReq =
	        dynamic_cast<const RsFileSearchRequestItem*>(recvItem.get());

	if(!searchReq)
	{
		RsWarn() << __PRETTY_FUNCTION__ << " Received an invalid search request"
		         << " " << *recvItem << std::endl;
		return false;
	}

	std::vector<DeepFilesSearchResult> dRes;
	DeepFilesIndex dfi(DeepFilesIndex::dbDefaultPath());
	if(dfi.search(searchReq->queryString, dRes, maxAllowsHits) > 0)
	{
		RsFileSearchResultItem resIt;
		resIt.mResults.reserve(dRes.size());

		for(const auto& dMatch : dRes)
			resIt.mResults.push_back(TurtleFileInfoV2(dMatch));

		uint32_t resSize = RsServiceSerializer::size(&resIt);
		unsigned char* resBuf = static_cast<unsigned char*>(malloc(resSize));
		if(!resBuf)
		{
			RsWarn() << __PRETTY_FUNCTION__ << " Failed allocating " << resSize
			         << " bytes for search results" << std::endl;
			return false;
		}

		if(!RsServiceSerializer::serialise(&resIt, resBuf, &resSize))
		{
			/* Do not leak the buffer nor hand it out half written */
			free(resBuf);
			return false;
		}

		searchResultData = resBuf;
		searchResultDataLen = resSize;
		return true;
	}
#endif // def RS_DEEP_FILES_INDEX

	return false;
}
/**
 * @brief Handle generic search results received through turtle
 *
 * If a callback is registered for the given request id the results are
 * deserialized and passed to it, while mSearchCallbacksMapMutex is held.
 *
 * @param[in] requestId id of the search request the results belong to
 * @param[in] searchResultData serialized search results
 * @param[in] searchResultDataLen size of the serialized search results
 */
void ftServer::receiveSearchResult(
        TurtleSearchRequestId requestId, unsigned char* searchResultData,
        uint32_t searchResultDataLen )
{
	if(!searchResultData || !searchResultDataLen)
	{
		RsWarn() << __PRETTY_FUNCTION__ << " got null paramethers "
		         << "searchResultData: " << static_cast<void*>(searchResultData)
		         << " searchResultDataLen: " << searchResultDataLen
		         << " seems someone else in the network have a buggy RetroShare"
		         << " implementation" << std::endl;
		return;
	}

	RS_STACK_MUTEX(mSearchCallbacksMapMutex);

	auto cbpt = mSearchCallbacksMap.find(requestId);
	if(cbpt == mSearchCallbacksMap.end()) return; // Nobody is waiting for it

	/* Owns the deserialized item whatever its actual type turns out to be */
	std::unique_ptr<RsItem> recvItem(
	            RsServiceSerializer::deserialise(
	                searchResultData, &searchResultDataLen ) );
	if(!recvItem)
	{
		RsWarn() << __PRETTY_FUNCTION__ << " Search result deserialization "
		         << "failed" << std::endl;
		return;
	}

	RsFileSearchResultItem* resItem =
	        dynamic_cast<RsFileSearchResultItem*>(recvItem.get());
	if(!resItem)
	{
		RsWarn() << __PRETTY_FUNCTION__ << " Received invalid search result"
		         << std::endl;
		return;
	}

	cbpt->second.first(resItem->mResults);
}
/***************************** CONFIG ****************************/
bool ftServer::addConfiguration(p3ConfigMgr *cfgmgr)
@ -1857,20 +1988,66 @@ bool ftServer::addConfiguration(p3ConfigMgr *cfgmgr)
return true;
}
#ifdef RS_DEEP_FILES_INDEX
/* Xapian query syntax elements which get replaced with a blank to build the
 * plain query sent for retro-compatibility. It is a read-only lookup table, so
 * it is const. */
static const std::vector<std::string> xapianQueryKeywords =
{
	" AND ", " OR ", " NOT ", " XOR ", " +", " -", " ( ", " ) ", " NEAR ",
	" ADJ ", " \"", "\" "
};
#endif
bool ftServer::turtleSearchRequest(
const std::string& matchString,
const std::function<void (const std::list<TurtleFileInfo>& results)>& multiCallback,
const std::function<void (const std::vector<TurtleFileInfoV2>& results)>& multiCallback,
rstime_t maxWait )
{
if(matchString.empty())
{
std::cerr << __PRETTY_FUNCTION__ << " match string can't be empty!"
RsWarn() << __PRETTY_FUNCTION__ << " match string can't be empty!"
<< std::endl;
return false;
}
TurtleRequestId sId = turtleSearch(matchString);
#ifdef RS_DEEP_FILES_INDEX
RsFileSearchRequestItem sItem;
sItem.queryString = matchString;
uint32_t iSize = RsServiceSerializer::size(&sItem);
uint8_t* iBuf = static_cast<uint8_t*>(malloc(iSize));
RsServiceSerializer::serialise(&sItem, iBuf, &iSize);
Dbg3() << __PRETTY_FUNCTION__ << " sending search request:" << sItem
<< std::endl;
TurtleRequestId xsId = mTurtleRouter->turtleSearch(iBuf, iSize, this);
{ RS_STACK_MUTEX(mSearchCallbacksMapMutex);
mSearchCallbacksMap.emplace(
xsId,
std::make_pair(
multiCallback,
std::chrono::system_clock::now() +
std::chrono::seconds(maxWait) ) );
} // RS_STACK_MUTEX(mSearchCallbacksMapMutex);
/* Trick to keep receiving more or less usable results from old peers */
std::string strippedQuery = matchString;
for(const std::string& xKeyword : xapianQueryKeywords)
{
std::string::size_type pos = std::string::npos;
while( (pos = strippedQuery.find(xKeyword)) != std::string::npos )
strippedQuery.replace(pos, xKeyword.length(), " ");
}
Dbg3() << __PRETTY_FUNCTION__ << " sending stripped query for "
<< "retro-compatibility: " << strippedQuery << std::endl;
TurtleRequestId sId = mTurtleRouter->turtleSearch(strippedQuery);
#else // def RS_DEEP_FILES_INDEX
TurtleRequestId sId = mTurtleRouter->turtleSearch(matchString);
#endif // def RS_DEEP_FILES_INDEX
{
RS_STACK_MUTEX(mSearchCallbacksMapMutex);
mSearchCallbacksMap.emplace(
sId,
@ -1878,6 +2055,7 @@ bool ftServer::turtleSearchRequest(
multiCallback,
std::chrono::system_clock::now() +
std::chrono::seconds(maxWait) ) );
}
return true;
}
@ -1913,4 +2091,13 @@ bool ftServer::isHashBanned(const RsFileHash& hash)
return mFileDatabase->isFileBanned(hash);
}
/// Out-of-line defaulted destructor
RsFileItem::~RsFileItem() = default;
/** Build an item of the FILE_TRANSFER service whose packet subtype is the
* given RsFileItemType value */
RsFileItem::RsFileItem(RsFileItemType subtype) :
RsItem( RS_PKT_VERSION_SERVICE,
static_cast<uint16_t>(RsServiceType::FILE_TRANSFER),
static_cast<uint8_t>(subtype) ) {}
/// Reset the item by emptying the query string
void RsFileSearchRequestItem::clear() { queryString.clear(); }
/// Reset the item by dropping all the stored results
void RsFileSearchResultItem::clear() { mResults.clear(); }

View File

@ -46,7 +46,8 @@
#include "turtle/turtleclientservice.h"
#include "services/p3service.h"
#include "retroshare/rsfiles.h"
#include "rsitems/rsitem.h"
#include "serialiser/rsserial.h"
#include "pqi/pqi.h"
#include "pqi/p3cfgmgr.h"
@ -67,7 +68,53 @@ class p3PeerMgr;
class p3ServiceControl;
class p3FileDatabase;
class ftServer: public p3Service, public RsFiles, public ftDataSend, public RsTurtleClientService, public RsServiceSerializer
/// Item subtypes of the file transfer service items declared below
enum class RsFileItemType : uint8_t
{
NONE = 0x00, ///< Only used to detect uninitialized items
FILE_SEARCH_REQUEST = 0x57,
FILE_SEARCH_RESULT = 0x58
};
/// Common base of the file transfer service items declared below
struct RsFileItem : RsItem
{
~RsFileItem() override;
protected:
/// Only derived items can be instantiated, each one passing its own subtype
RsFileItem(RsFileItemType subtype);
};
/// Item carrying a files search query
struct RsFileSearchRequestItem : RsFileItem
{
/// Uses the turtle search request priority level
RsFileSearchRequestItem() : RsFileItem(RsFileItemType::FILE_SEARCH_REQUEST)
{ setPriorityLevel(QOS_PRIORITY_RS_TURTLE_SEARCH_REQUEST); }
/// The search query
std::string queryString;
/* NOTE(review): RS_SERIAL_PROCESS presumably relies on the parameters being
* named j and ctx, confirm before renaming them */
void serial_process( RsGenericSerializer::SerializeJob j,
RsGenericSerializer::SerializeContext& ctx ) override
{ RS_SERIAL_PROCESS(queryString); }
void clear() override;
};
/// Item carrying the results of a files search
struct RsFileSearchResultItem : RsFileItem
{
/// Uses the turtle search result priority level
RsFileSearchResultItem() : RsFileItem(RsFileItemType::FILE_SEARCH_RESULT)
{ setPriorityLevel(QOS_PRIORITY_RS_TURTLE_SEARCH_RESULT); }
/// The matching files
std::vector<TurtleFileInfoV2> mResults;
/* NOTE(review): RS_SERIAL_PROCESS presumably relies on the parameters being
* named j and ctx, confirm before renaming them */
void serial_process( RsGenericSerializer::SerializeJob j,
RsGenericSerializer::SerializeContext& ctx ) override
{ RS_SERIAL_PROCESS(mResults); }
void clear() override;
};
class ftServer :
public p3Service, public RsFiles, public ftDataSend,
public RsTurtleClientService, public RsServiceSerializer
{
public:
@ -98,7 +145,21 @@ public:
uint16_t serviceId() const { return RS_SERVICE_TYPE_FILE_TRANSFER ; }
virtual bool handleTunnelRequest(const RsFileHash& hash,const RsPeerId& peer_id) ;
virtual void receiveTurtleData(const RsTurtleGenericTunnelItem *item,const RsFileHash& hash,const RsPeerId& virtual_peer_id,RsTurtleGenericTunnelItem::Direction direction) ;
virtual void ftReceiveSearchResult(RsTurtleFTSearchResultItem *item); // We dont use TurtleClientService::receiveSearchResult() because of backward compatibility.
/// We keep this for retro-compatibility @see RsTurtleClientService
virtual void ftReceiveSearchResult(RsTurtleFTSearchResultItem *item);
/// @see RsTurtleClientService
bool receiveSearchRequest(
unsigned char* searchRequestData, uint32_t searchRequestDataLen,
unsigned char*& search_result_data, uint32_t& searchResultDataLen,
uint32_t& maxAllowsHits ) override;
/// @see RsTurtleClientService
void receiveSearchResult(
TurtleSearchRequestId requestId, unsigned char* searchResultData,
uint32_t searchResultDataLen ) override;
virtual RsItem *create_item(uint16_t service,uint8_t item_type) const ;
virtual RsServiceSerializer *serializer() { return this ; }
@ -148,7 +209,7 @@ public:
/// @see RsFiles
virtual bool turtleSearchRequest(
const std::string& matchString,
const std::function<void (const std::list<TurtleFileInfo>& results)>& multiCallback,
const std::function<void (const std::vector<TurtleFileInfoV2>& results)>& multiCallback,
rstime_t maxWait = 300 );
virtual TurtleSearchRequestId turtleSearch(const std::string& string_to_match) ;
@ -337,13 +398,15 @@ private:
std::map<
TurtleRequestId,
std::pair<
std::function<void (const std::list<TurtleFileInfo>& results)>,
std::function<void (const std::vector<TurtleFileInfoV2>& results)>,
std::chrono::system_clock::time_point >
> mSearchCallbacksMap;
RsMutex mSearchCallbacksMapMutex;
/// Cleanup mSearchCallbacksMap
void cleanTimedOutSearches();
RS_SET_CONTEXT_DEBUG_LEVEL(1)
};

View File

@ -257,8 +257,8 @@
#include "util/rsmemory.h"
#include "util/stacktrace.h"
#ifdef RS_DEEP_SEARCH
# include "deep_search/deep_search.h"
#ifdef RS_DEEP_CHANNEL_INDEX
# include "deep_search/channelsindex.hpp"
#endif
/***
@ -5148,13 +5148,13 @@ TurtleRequestId RsGxsNetService::turtleSearchRequest(const std::string& match_st
return mGxsNetTunnel->turtleSearchRequest(match_string,this) ;
}
#ifndef RS_DEEP_SEARCH
#ifndef RS_DEEP_CHANNEL_INDEX
/// @return true if substring is found inside src, the match ignores case
static bool termSearch(const std::string& src, const std::string& substring)
{
	const std::string::const_iterator matchPos = std::search(
	            src.begin(), src.end(), substring.begin(), substring.end(),
	            RsRegularExpression::CompareCharIC() );
	return matchPos != src.end();
}
#endif // ndef RS_DEEP_SEARCH
#endif // ndef RS_DEEP_CHANNEL_INDEX
bool RsGxsNetService::retrieveDistantSearchResults(TurtleRequestId req,std::map<RsGxsGroupId,RsGxsGroupSummary>& group_infos)
{
@ -5209,11 +5209,11 @@ void RsGxsNetService::receiveTurtleSearchResults(
for (const RsGxsGroupSummary& gps : group_infos)
{
#ifndef RS_DEEP_SEARCH
#ifndef RS_DEEP_CHANNEL_INDEX
/* Only keep groups that are not locally known, and groups that are
* not already in the mDistantSearchResults structure. */
if(grpMeta[gps.mGroupId]) continue;
#else // ndef RS_DEEP_SEARCH
#else // ndef RS_DEEP_CHANNEL_INDEX
/* When deep search is enabled search results may bring more info
* then we already have also about post that are indexed by xapian,
* so we don't apply this filter in this case. */
@ -5302,9 +5302,9 @@ bool RsGxsNetService::search( const std::string& substring,
{
group_infos.clear();
#ifdef RS_DEEP_SEARCH
std::vector<DeepSearch::SearchResult> results;
DeepSearch::search(substring, results);
#ifdef RS_DEEP_CHANNEL_INDEX
std::vector<DeepChannelsSearchResult> results;
DeepChannelsIndex::search(substring, results);
for(auto dsr : results)
{
@ -5324,7 +5324,7 @@ bool RsGxsNetService::search( const std::string& substring,
if((rit = uQ.find("name")) != uQ.end())
s.mGroupName = rit->second;
if((rit = uQ.find("signFlags")) != uQ.end())
s.mSignFlags = std::stoul(rit->second);
s.mSignFlags = static_cast<uint32_t>(std::stoul(rit->second));
if((rit = uQ.find("publishTs")) != uQ.end())
s.mPublishTs = static_cast<rstime_t>(std::stoll(rit->second));
if((rit = uQ.find("authorId")) != uQ.end())
@ -5340,7 +5340,7 @@ bool RsGxsNetService::search( const std::string& substring,
}
}
}
#else // RS_DEEP_SEARCH
#else // RS_DEEP_CHANNEL_INDEX
RsGxsGrpMetaTemporaryMap grpMetaMap;
{
RS_STACK_MUTEX(mNxsMutex) ;
@ -5366,7 +5366,7 @@ bool RsGxsNetService::search( const std::string& substring,
group_infos.push_back(s);
}
#endif // RS_DEEP_SEARCH
#endif // RS_DEEP_CHANNEL_INDEX
#ifdef NXS_NET_DEBUG_8
GXSNETDEBUG___ << " performing local substring search in response to distant request. Found " << group_infos.size() << " responses." << std::endl;

View File

@ -29,8 +29,8 @@
#include "pqi/pqihash.h"
#include "gxs/rsgixs.h"
#ifdef RS_DEEP_SEARCH
# include "deep_search/deep_search.h"
#ifdef RS_DEEP_CHANNEL_INDEX
# include "deep_search/channelsindex.hpp"
# include "services/p3gxschannels.h"
# include "rsitems/rsgxschannelitems.h"
#endif
@ -148,12 +148,12 @@ bool RsGxsMessageCleanUp::clean()
RsGxsIntegrityCheck::RsGxsIntegrityCheck(
RsGeneralDataService* const dataService, RsGenExchange* genex,
RsSerialType&
#ifdef RS_DEEP_SEARCH
#ifdef RS_DEEP_CHANNEL_INDEX
serializer
#endif
, RsGixs* gixs )
: mDs(dataService), mGenExchangeClient(genex),
#ifdef RS_DEEP_SEARCH
#ifdef RS_DEEP_CHANNEL_INDEX
mSerializer(serializer),
#endif
mDone(false), mIntegrityMutex("integrity"), mGixs(gixs) {}
@ -168,7 +168,7 @@ void RsGxsIntegrityCheck::run()
bool RsGxsIntegrityCheck::check()
{
#ifdef RS_DEEP_SEARCH
#ifdef RS_DEEP_CHANNEL_INDEX
bool isGxsChannels = mGenExchangeClient->serviceType() == RS_SERVICE_GXS_TYPE_CHANNELS;
std::set<RsGxsGroupId> indexedGroups;
#endif
@ -221,7 +221,7 @@ bool RsGxsIntegrityCheck::check()
}
else msgIds.erase(msgIds.find(grp->grpId));
#ifdef RS_DEEP_SEARCH
#ifdef RS_DEEP_CHANNEL_INDEX
if( isGxsChannels
&& grp->metaData->mCircleType == GXS_CIRCLE_TYPE_PUBLIC
&& grp->metaData->mSubscribeFlags & GXS_SERV::GROUP_SUBSCRIBE_SUBSCRIBED )
@ -241,7 +241,7 @@ bool RsGxsIntegrityCheck::check()
cg.mMeta = meta;
indexedGroups.insert(grp->grpId);
DeepSearch::indexChannelGroup(cg);
DeepChannelsIndex::indexChannelGroup(cg);
}
else
{
@ -256,14 +256,15 @@ bool RsGxsIntegrityCheck::check()
delete rIt;
}
#endif
#endif // def RS_DEEP_CHANNEL_INDEX
}
else
{
grpsToDel.push_back(grp->grpId);
#ifdef RS_DEEP_SEARCH
if(isGxsChannels) DeepSearch::removeChannelFromIndex(grp->grpId);
#endif
#ifdef RS_DEEP_CHANNEL_INDEX
if(isGxsChannels)
DeepChannelsIndex::removeChannelFromIndex(grp->grpId);
#endif // def RS_DEEP_CHANNEL_INDEX
}
if( !(grp->metaData->mSubscribeFlags & GXS_SERV::GROUP_SUBSCRIBE_SUBSCRIBED) &&
@ -320,10 +321,10 @@ bool RsGxsIntegrityCheck::check()
if (nxsMsgIt == nxsMsgV.end())
{
msgsToDel[grpId].insert(msgId);
#ifdef RS_DEEP_SEARCH
#ifdef RS_DEEP_CHANNEL_INDEX
if(isGxsChannels)
DeepSearch::removeChannelPostFromIndex(grpId, msgId);
#endif
DeepChannelsIndex::removeChannelPostFromIndex(grpId, msgId);
#endif // def RS_DEEP_CHANNEL_INDEX
}
}
}
@ -348,14 +349,15 @@ bool RsGxsIntegrityCheck::check()
<< " with wrong hash or null meta data. meta="
<< (void*)msg->metaData << std::endl;
msgsToDel[msg->grpId].insert(msg->msgId);
#ifdef RS_DEEP_SEARCH
#ifdef RS_DEEP_CHANNEL_INDEX
if(isGxsChannels)
DeepSearch::removeChannelPostFromIndex(msg->grpId, msg->msgId);
#endif
DeepChannelsIndex::removeChannelPostFromIndex(
msg->grpId, msg->msgId );
#endif // def RS_DEEP_CHANNEL_INDEX
}
else if (subscribed_groups.count(msg->metaData->mGroupId))
{
#ifdef RS_DEEP_SEARCH
#ifdef RS_DEEP_CHANNEL_INDEX
if( isGxsChannels
&& indexedGroups.count(msg->metaData->mGroupId) )
{
@ -373,7 +375,7 @@ bool RsGxsIntegrityCheck::check()
cgIt->toChannelPost(cg, false);
cg.mMeta = meta;
DeepSearch::indexChannelPost(cg);
DeepChannelsIndex::indexChannelPost(cg);
}
else if(dynamic_cast<RsGxsCommentItem*>(rIt)) {}
else if(dynamic_cast<RsGxsVoteItem*>(rIt)) {}
@ -391,7 +393,7 @@ bool RsGxsIntegrityCheck::check()
delete rIt;
}
#endif
#endif // def RS_DEEP_CHANNEL_INDEX
if(!msg->metaData->mAuthorId.isNull())
{

View File

@ -213,7 +213,7 @@ private:
RsGeneralDataService* const mDs;
RsGenExchange *mGenExchangeClient;
#ifdef RS_DEEP_SEARCH
#ifdef RS_DEEP_CHANNEL_INDEX
RsSerialType& mSerializer;
#endif
bool mDone;

View File

@ -899,8 +899,32 @@ rs_jsonapi {
SOURCES += jsonapi/jsonapi.cpp
}
rs_deep_search {
HEADERS += deep_search/deep_search.h
rs_deep_channels_index {
HEADERS *= deep_search/commonutils.hpp
SOURCES *= deep_search/commonutils.cpp
HEADERS += deep_search/channelsindex.hpp
SOURCES += deep_search/channelsindex.cpp
}
rs_deep_files_index {
HEADERS *= deep_search/commonutils.hpp
SOURCES *= deep_search/commonutils.cpp
HEADERS += deep_search/filesindex.hpp
SOURCES += deep_search/filesindex.cpp
}
rs_deep_files_index_ogg {
HEADERS += deep_search/filesoggindexer.hpp
}
rs_deep_files_index_flac {
HEADERS += deep_search/filesflacindexer.hpp
}
rs_deep_files_index_taglib {
HEADERS += deep_search/filestaglibindexer.hpp
}
rs_broadcast_discovery {

View File

@ -202,6 +202,52 @@ struct BannedFileEntry : RsSerializable
}
};
struct DeepFilesSearchResult;
/**
* @brief File search result entry.
* Carries the same size, hash and name information as TurtleFileInfo, plus an
* optional Xapian weight and an optional Xapian snippet.
*/
struct TurtleFileInfoV2 : RsSerializable
{
/// Build an empty result: size and weight are set to 0
TurtleFileInfoV2() : fSize(0), fWeight(0) {}
/// Build from an old style TurtleFileInfo copying size, hash and name; the
/// weight is set to 0 and the snippet is left empty
TurtleFileInfoV2(const TurtleFileInfo& oldInfo) :
fSize(oldInfo.size), fHash(oldInfo.hash), fName(oldInfo.name),
fWeight(0) {}
#ifdef RS_DEEP_FILES_INDEX
/// Build from a deep files index search result. Only declared when
/// RS_DEEP_FILES_INDEX is defined, the definition is not in this header.
TurtleFileInfoV2(const DeepFilesSearchResult& dRes);
#endif // def RS_DEEP_FILES_INDEX
uint64_t fSize; ///< File size
RsFileHash fHash; ///< File hash
std::string fName; ///< File name
/** @brief Xapian weight of the file which matched the search criteria
* This field is optional (its value is 0 when not specified).
* Given that Xapian weight for the same file is usually different on
* different nodes, it should not be used as an absolute reference, but just
* as a hint of how much the given file matches the search criteria.
*/
float fWeight;
/** @brief Xapian snippet of the file which matched the search criteria
* This field is optional (its value is an empty string when not specified).
*/
std::string fSnippet;
/// @see RsSerializable::serial_process
/// Fields are processed in this fixed order: fSize, fHash, fName, fWeight,
/// fSnippet.
/// NOTE(review): RS_SERIAL_PROCESS presumably picks up `j` and `ctx` from the
/// enclosing scope, so the parameter names must stay as they are; confirm
/// against the macro definition.
void serial_process( RsGenericSerializer::SerializeJob j,
RsGenericSerializer::SerializeContext& ctx ) override
{
RS_SERIAL_PROCESS(fSize);
RS_SERIAL_PROCESS(fHash);
RS_SERIAL_PROCESS(fName);
RS_SERIAL_PROCESS(fWeight);
RS_SERIAL_PROCESS(fSnippet);
}
/// Declared here only, the definition is not in this header
~TurtleFileInfoV2() override;
};
class RsFiles
{
public:
@ -209,7 +255,7 @@ public:
virtual ~RsFiles() {}
/**
* @brief Provides file data for the gui, media streaming or rpc clients.
* @brief Provides file data for the GUI, media streaming or API clients.
* It may return unverified chunks. This allows streaming without having to
* wait for hashes or completion of the file.
* This function returns an unspecified amount of bytes. Either as much data
@ -217,8 +263,8 @@ public:
* To get more data, call this function repeatedly with different offsets.
*
* @jsonapi{development,manualwrapper}
* note the missing @ the wrapper for this is written manually not
* autogenerated @see JsonApiServer.
* note the wrapper for this is written manually not autogenerated
* @see JsonApiServer.
*
* @param[in] hash hash of the file. The file has to be available on this node
* or it has to be in downloading state.
@ -356,7 +402,9 @@ public:
/**
* @brief Request remote files search
* @jsonapi{development}
* @param[in] matchString string to look for in the search
* @param[in] matchString string to look for in the search. If deep files
* indexing is enabled at compile time, the advanced query syntax described
* at https://xapian.org/docs/queryparser.html is supported
* @param multiCallback function that will be called each time a search
* result is received
* @param[in] maxWait maximum wait time in seconds for search results
@ -364,7 +412,7 @@ public:
*/
virtual bool turtleSearchRequest(
const std::string& matchString,
const std::function<void (const std::list<TurtleFileInfo>& results)>& multiCallback,
const std::function<void (const std::vector<TurtleFileInfoV2>& results)>& multiCallback,
rstime_t maxWait = 300 ) = 0;
virtual TurtleRequestId turtleSearch(const std::string& string_to_match) = 0;
@ -627,7 +675,18 @@ public:
*/
virtual bool removeSharedDirectory(std::string dir) = 0;
virtual bool getIgnoreLists(std::list<std::string>& ignored_prefixes, std::list<std::string>& ignored_suffixes,uint32_t& flags) =0;
/**
* @brief Get list of ignored file name prefixes and suffixes
* @param[out] ignoredPrefixes storage for ignored prefixes
* @param[out] ignoredSuffixes storage for ignored suffixes
* @param[out] flags RS_FILE_SHARE_FLAGS_IGNORE_*
* @return false if something failed, true otherwise
*/
virtual bool getIgnoreLists(
std::list<std::string>& ignoredPrefixes,
std::list<std::string>& ignoredSuffixes,
uint32_t& flags ) = 0;
virtual void setIgnoreLists(const std::list<std::string>& ignored_prefixes, const std::list<std::string>& ignored_suffixes,uint32_t flags) =0;
virtual void setWatchPeriod(int minutes) =0;

View File

@ -44,12 +44,10 @@ extern RsTurtle* rsTurtle;
typedef uint32_t TurtleRequestId ;
typedef RsPeerId TurtleVirtualPeerId;
/**
* This is the structure used to send back results of the turtle search,
* to other peers, to the notifyBase class, to the search caller or to the GUI.
*/
struct TurtleFileInfo : RsSerializable
{
TurtleFileInfo() : size(0) {}
uint64_t size; /// File size
RsFileHash hash; /// File hash
std::string name; /// File name
@ -65,7 +63,7 @@ struct TurtleFileInfo : RsSerializable
RsTypeSerializer::serial_process(
j, ctx, TLV_TYPE_STR_NAME, name, "name" );
}
};
} RS_DEPRECATED_FOR(TurtleFileInfoV2);
struct TurtleTunnelRequestDisplayInfo
{
@ -120,10 +118,9 @@ public:
virtual void setSessionEnabled(bool) = 0 ;
virtual bool sessionEnabled() const = 0 ;
// Lauches a search request through the pipes, and immediately returns
// the request id, which will be further used by the gui to store results
// as they come back.
//
/** Launches a search request through the pipes, and immediately returns
* the request id, which will be further used by client services to
* handle results as they come back. */
virtual TurtleRequestId turtleSearch(
unsigned char *search_bin_data, uint32_t search_bin_data_len,
RsTurtleClientService* client_service ) = 0;

View File

@ -27,6 +27,7 @@
#include "serialiser/rsserializer.h"
#include "serialiser/rstypeserializer.h"
#include "util/stacktrace.h"
#include "util/rsdebug.h"
const SerializationFlags RsGenericSerializer::SERIALIZATION_FLAG_NONE ( 0x0000 );
const SerializationFlags RsGenericSerializer::SERIALIZATION_FLAG_CONFIG ( 0x0001 );
@ -36,6 +37,16 @@ const SerializationFlags RsGenericSerializer::SERIALIZATION_FLAG_YIELDING ( 0
RsItem *RsServiceSerializer::deserialise(void *data, uint32_t *size)
{
if(!data || !size || !*size)
{
RsErr() << __PRETTY_FUNCTION__ << " Called with null paramethers data: "
<< data << " size: " << static_cast<void*>(size) << " *size: "
<< (size ? *size : 0) << " this should never happen!"
<< std::endl;
print_stacktrace();
return nullptr;
}
if(mFlags & SERIALIZATION_FLAG_SKIP_HEADER)
{
std::cerr << "(EE) Cannot deserialise item with flags SERIALIZATION_FLAG_SKIP_HEADER. Check your code!" << std::endl;

View File

@ -44,9 +44,9 @@
#include "util/rsrandom.h"
#include "util/rsstring.h"
#ifdef RS_DEEP_SEARCH
# include "deep_search/deep_search.h"
#endif // RS_DEEP_SEARCH
#ifdef RS_DEEP_CHANNEL_INDEX
# include "deep_search/channelsindex.hpp"
#endif // RS_DEEP_CHANNEL_INDEX
/****
@ -1149,9 +1149,9 @@ bool p3GxsChannels::createChannelV2(
channelId = channel.mMeta.mGroupId;
#ifdef RS_DEEP_SEARCH
DeepSearch::indexChannelGroup(channel);
#endif // RS_DEEP_SEARCH
#ifdef RS_DEEP_CHANNEL_INDEX
DeepChannelsIndex::indexChannelGroup(channel);
#endif // RS_DEEP_CHANNEL_INDEX
return true;
}
@ -1180,9 +1180,9 @@ bool p3GxsChannels::createChannel(RsGxsChannelGroup& channel)
return false;
}
#ifdef RS_DEEP_SEARCH
DeepSearch::indexChannelGroup(channel);
#endif // RS_DEEP_SEARCH
#ifdef RS_DEEP_CHANNEL_INDEX
DeepChannelsIndex::indexChannelGroup(channel);
#endif // RS_DEEP_CHANNEL_INDEX
return true;
}
@ -1333,9 +1333,9 @@ bool p3GxsChannels::editChannel(RsGxsChannelGroup& channel)
return false;
}
#ifdef RS_DEEP_SEARCH
DeepSearch::indexChannelGroup(channel);
#endif // RS_DEEP_SEARCH
#ifdef RS_DEEP_CHANNEL_INDEX
DeepChannelsIndex::indexChannelGroup(channel);
#endif // RS_DEEP_CHANNEL_INDEX
return true;
}
@ -1401,9 +1401,9 @@ bool p3GxsChannels::createPostV2(
if(RsGenExchange::getPublishedMsgMeta(token,post.mMeta))
{
#ifdef RS_DEEP_SEARCH
DeepSearch::indexChannelPost(post);
#endif // RS_DEEP_SEARCH
#ifdef RS_DEEP_CHANNEL_INDEX
DeepChannelsIndex::indexChannelPost(post);
#endif // RS_DEEP_CHANNEL_INDEX
postId = post.mMeta.mMsgId;
return true;
@ -1787,9 +1787,9 @@ bool p3GxsChannels::createPost(RsGxsChannelPost& post)
if(RsGenExchange::getPublishedMsgMeta(token,post.mMeta))
{
#ifdef RS_DEEP_SEARCH
DeepSearch::indexChannelPost(post);
#endif // RS_DEEP_SEARCH
#ifdef RS_DEEP_CHANNEL_INDEX
DeepChannelsIndex::indexChannelPost(post);
#endif // RS_DEEP_CHANNEL_INDEX
return true;
}

View File

@ -865,6 +865,8 @@ int p3turtle::handleIncoming()
//
void p3turtle::handleSearchRequest(RsTurtleSearchRequestItem *item)
{
Dbg3() << __PRETTY_FUNCTION__ << " " << *item << std::endl;
// take a look at the item and test against inconsistent values
// - If the item destimation is
@ -877,11 +879,12 @@ void p3turtle::handleSearchRequest(RsTurtleSearchRequestItem *item)
if(item_size > TURTLE_MAX_SEARCH_REQ_ACCEPTED_SERIAL_SIZE)
{
#ifdef P3TURTLE_DEBUG
std::cerr << " Dropping, because the serial size exceeds the accepted limit." << std::endl ;
#endif
std::cerr << " Caught a turtle search item with arbitrary large size from " << item->PeerId() << " of size " << item_size << " and depth " << item->depth << ". This is not allowed => dropping." << std::endl;
return ;
RsWarn() << __PRETTY_FUNCTION__
<< " Got a turtle search item with arbitrary large size from "
<< item->PeerId() << " of size " << item_size << " and depth "
<< item->depth << ". This is not allowed => dropping."
<< std::endl;
return;
}
{
@ -889,22 +892,20 @@ void p3turtle::handleSearchRequest(RsTurtleSearchRequestItem *item)
if(_search_requests_origins.size() > MAX_ALLOWED_SR_IN_CACHE)
{
#ifdef P3TURTLE_DEBUG
std::cerr << " Dropping, because the search request cache is full." << std::endl ;
#endif
std::cerr << " More than " << MAX_ALLOWED_SR_IN_CACHE << " search request in cache. A peer is probably trying to flood your network See the depth charts to find him." << std::endl;
return ;
RsWarn() << __PRETTY_FUNCTION__ << " More than "
<< MAX_ALLOWED_SR_IN_CACHE << " search request in cache. "
<< "A peer is probably trying to flood your network See "
"the depth charts to find him." << std::endl;
return;
}
// If the item contains an already handled search request, give up. This
// happens when the same search request gets relayed by different peers
//
if(_search_requests_origins.find(item->request_id) != _search_requests_origins.end())
if( _search_requests_origins.find(item->request_id) !=
_search_requests_origins.end() )
{
#ifdef P3TURTLE_DEBUG
std::cerr << " This is a bouncing request. Ignoring and deleting it." << std::endl ;
#endif
return ;
/* If the item contains an already handled search request, give up.
* This happens when the same search request gets relayed by
* different peers */
return;
}
}
@ -1013,13 +1014,21 @@ void p3turtle::handleSearchRequest(RsTurtleSearchRequestItem *item)
// This function should be removed in the future, when file search will also use generic search items.
void p3turtle::performLocalSearch(RsTurtleSearchRequestItem *item,uint32_t& req_result_count,std::list<RsTurtleSearchResultItem*>& search_results,uint32_t& max_allowed_hits)
void p3turtle::performLocalSearch(
RsTurtleSearchRequestItem *item, uint32_t& req_result_count,
std::list<RsTurtleSearchResultItem*>& search_results,
uint32_t& max_allowed_hits )
{
RsTurtleFileSearchRequestItem *ftsearch = dynamic_cast<RsTurtleFileSearchRequestItem*>(item) ;
Dbg3() << __PRETTY_FUNCTION__ << " " << item << std::endl;
RsTurtleFileSearchRequestItem* ftsearch =
dynamic_cast<RsTurtleFileSearchRequestItem*>(item);
if(ftsearch != NULL)
{
performLocalSearch_files(ftsearch,req_result_count,search_results,max_allowed_hits) ;
performLocalSearch_files(
ftsearch, req_result_count, search_results,
max_allowed_hits );
return ;
}
@ -1060,12 +1069,13 @@ void p3turtle::performLocalSearch_generic(RsTurtleGenericSearchRequestItem *item
}
}
void p3turtle::performLocalSearch_files(RsTurtleFileSearchRequestItem *item,uint32_t& req_result_count,std::list<RsTurtleSearchResultItem*>& result,uint32_t& max_allowed_hits)
void p3turtle::performLocalSearch_files(
RsTurtleFileSearchRequestItem *item, uint32_t& req_result_count,
std::list<RsTurtleSearchResultItem*>& result,
uint32_t& max_allowed_hits )
{
#ifdef P3TURTLE_DEBUG
std::cerr << "Performing rsFiles->search()" << std::endl ;
#endif
// now, search!
Dbg3() << __PRETTY_FUNCTION__ << " " << *item << std::endl;
std::list<TurtleFileInfo> initialResults ;
item->search(initialResults) ;
@ -1104,6 +1114,9 @@ void p3turtle::performLocalSearch_files(RsTurtleFileSearchRequestItem *item,uint
res_item = NULL ; // forces creation of a new item.
}
}
Dbg3() << __PRETTY_FUNCTION__ << " found " << req_result_count << " results"
<< std::endl;
}
void p3turtle::handleSearchResult(RsTurtleSearchResultItem *item)

View File

@ -19,6 +19,7 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>. *
* *
*******************************************************************************/
#pragma once
//====================================== General setup of the router ===================================//
//
@ -130,10 +131,6 @@
// - should tunnels be re-used ? nope. The only useful case would be when two peers are exchanging files, which happens quite rarely.
//
#ifndef MRK_PQI_TURTLE_H
#define MRK_PQI_TURTLE_H
#include <string>
#include <list>
#include <set>
@ -464,6 +461,8 @@ class p3turtle: public p3Service, public RsTurtle, public p3Config
uint32_t _service_type ;
RS_SET_CONTEXT_DEBUG_LEVEL(1)
#ifdef P3TURTLE_DEBUG
// debug function
void dumpState() ;
@ -472,5 +471,3 @@ class p3turtle: public p3Service, public RsTurtle, public p3Config
void TS_dumpState();
#endif
};
#endif

View File

@ -68,11 +68,23 @@ linux-* {
mLibs += dl
}
rs_deep_search {
rs_deep_channels_index | rs_deep_files_index {
mLibs += xapian
win32-g++:mLibs += rpcrt4
}
rs_deep_files_index_ogg {
mLibs += vorbisfile
}
rs_deep_files_index_flac {
mLibs += FLAC++
}
rs_deep_files_index_taglib {
mLibs += tag
}
rs_broadcast_discovery {
no_rs_cross_compiling {
UDP_DISCOVERY_SRC_PATH=$$clean_path($${RS_SRC_PATH}/supportlibs/udp-discovery-cpp/)

View File

@ -165,10 +165,30 @@ rs_macos10.14:CONFIG -= rs_macos10.11
CONFIG *= no_rs_jsonapi
rs_jsonapi:CONFIG -= no_rs_jsonapi
# To enable deep search append the following assignation to qmake command line
# CONFIG *= rs_deep_search
CONFIG *= no_rs_deep_search
rs_deep_search:CONFIG -= no_rs_deep_search
# To enable channel indexing append the following assignment to qmake command
# line "CONFIG+=rs_deep_channels_index"
# The option name must be spelled exactly like the rs_deep_channels_index scope
# which defines RS_DEEP_CHANNEL_INDEX, otherwise passing the documented option
# would leave channel indexing silently disabled.
CONFIG *= no_rs_deep_channels_index
rs_deep_channels_index:CONFIG -= no_rs_deep_channels_index
# To enable deep files indexing append the following assignment to qmake
# command line "CONFIG+=rs_deep_files_index"
CONFIG *= no_rs_deep_files_index
rs_deep_files_index:CONFIG -= no_rs_deep_files_index
# To enable Ogg files deep indexing append the following assignation to qmake
# command line "CONFIG+=rs_deep_files_index_ogg"
CONFIG *= no_rs_deep_files_index_ogg
rs_deep_files_index_ogg:CONFIG -= no_rs_deep_files_index_ogg
# To enable FLAC files deep indexing append the following assignation to qmake
# command line "CONFIG+=rs_deep_files_index_flac"
CONFIG *= no_rs_deep_files_index_flac
rs_deep_files_index_flac:CONFIG -= no_rs_deep_files_index_flac
# To enable taglib files deep indexing append the following assignation to qmake
# command line "CONFIG+=rs_deep_files_index_taglib"
CONFIG *= no_rs_deep_files_index_taglib
rs_deep_files_index_taglib:CONFIG -= no_rs_deep_files_index_taglib
# To enable native dialogs append the following assignation to qmake command
# line "CONFIG+=rs_use_native_dialogs"
@ -564,15 +584,12 @@ retroshare_qml_app {
warning("QMAKE: you have enabled retroshare_qml_app which is deprecated")
}
rs_deep_search {
DEFINES *= RS_DEEP_SEARCH
rs_deep_channels_index:DEFINES *= RS_DEEP_CHANNEL_INDEX
linux {
exists("/usr/include/xapian-1.3") {
INCLUDEPATH += /usr/include/xapian-1.3
}
}
}
rs_deep_files_index:DEFINES *= RS_DEEP_FILES_INDEX
rs_deep_files_index_ogg:DEFINES *= RS_DEEP_FILES_INDEX_OGG
rs_deep_files_index_flac:DEFINES *= RS_DEEP_FILES_INDEX_FLAC
rs_deep_files_index_taglib:DEFINES *= RS_DEEP_FILES_INDEX_TAGLIB
rs_use_native_dialogs:DEFINES *= RS_NATIVEDIALOGS