From 3a26ccf6a54e96a5813bc1d651774a4a5844d398 Mon Sep 17 00:00:00 2001 From: Gioacchino Mazzurco Date: Thu, 20 Jun 2019 17:24:18 +0200 Subject: [PATCH] Implement deep indexing for files through Xapian ATM it supports extracting metadata only from OGG files. The system has been designed to be easily extensible to more file formats by registering more indexer functions which just need to extract metadata from a certain type of file and feed it to Xapian. The system has been integrated into the existing file search system through generic search requests and results; it keeps a good level of retro-compatibility due to some tricks. The indexing system is released under AGPLv3 so when libretroshare is compiled with deep search enabled AGPLv3 must be honored instead of LGPLv3-or-later. Cleaned up the debian copyright file using non-deprecated license code-names. --- build_scripts/Debian/debian/copyright | 53 +++- .../src/deep_search/channelsindex.cpp | 230 +++++++++++++++ .../src/deep_search/channelsindex.hpp | 77 +++++ libretroshare/src/deep_search/commonutils.cpp | 93 ++++++ libretroshare/src/deep_search/commonutils.hpp | 45 +++ libretroshare/src/deep_search/deep_search.h | 276 ------------------ libretroshare/src/deep_search/filesindex.cpp | 143 +++++++++ libretroshare/src/deep_search/filesindex.hpp | 95 ++++++ .../src/deep_search/filesoggindexer.hpp | 97 ++++++ .../src/file_sharing/dir_hierarchy.cc | 15 +- .../src/file_sharing/directory_storage.cc | 44 ++- libretroshare/src/ft/ftserver.cc | 229 +++++++++++++-- libretroshare/src/ft/ftserver.h | 73 ++++- libretroshare/src/gxs/rsgxsnetservice.cc | 24 +- libretroshare/src/gxs/rsgxsutil.cc | 42 +-- libretroshare/src/gxs/rsgxsutil.h | 2 +- libretroshare/src/libretroshare.pro | 20 +- libretroshare/src/retroshare/rsfiles.h | 73 ++++- libretroshare/src/retroshare/rsturtle.h | 15 +- libretroshare/src/serialiser/rsserializer.cc | 11 + libretroshare/src/services/p3gxschannels.cc | 36 +-- libretroshare/src/turtle/p3turtle.cc | 65 +++-- 
libretroshare/src/turtle/p3turtle.h | 9 +- libretroshare/src/use_libretroshare.pri | 6 +- retroshare.pri | 29 +- 25 files changed, 1364 insertions(+), 438 deletions(-) create mode 100644 libretroshare/src/deep_search/channelsindex.cpp create mode 100644 libretroshare/src/deep_search/channelsindex.hpp create mode 100644 libretroshare/src/deep_search/commonutils.cpp create mode 100644 libretroshare/src/deep_search/commonutils.hpp delete mode 100644 libretroshare/src/deep_search/deep_search.h create mode 100644 libretroshare/src/deep_search/filesindex.cpp create mode 100644 libretroshare/src/deep_search/filesindex.hpp create mode 100644 libretroshare/src/deep_search/filesoggindexer.hpp diff --git a/build_scripts/Debian/debian/copyright b/build_scripts/Debian/debian/copyright index c94948a36..fe0eb5010 100644 --- a/build_scripts/Debian/debian/copyright +++ b/build_scripts/Debian/debian/copyright @@ -1,19 +1,23 @@ Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ Upstream-Name: retroshare -Upstream-Contact: retroshare.team@gmail.com +Upstream-Contact: contact@retroshare.cc Source: https://github.com/retroshare/retroshare Files: openpgpsdk/* -Copyright: 2005-2008 Ben Laurie, Rachel Willmer, Retroshare Team +Copyright: 2005-2008 Ben Laurie, Rachel Willmer, Retroshare Team License: Apache-2.0 Files: jsonapi-generator/* libretroshare/src/jsonapi/* Copyright: 2018-2019 Gioacchino Mazzurco -License: AGPL-3+ +License: AGPL-3.0-or-later + +Files: libretroshare/src/deep_search/* +Copyright: 2018-2019 Gioacchino Mazzurco +License: AGPL-3.0-only Files: libretroshare/* -Copyright: 2007-2018, Retroshare Team -License: LGPL-3+ +Copyright: 2007-2019, Retroshare Team +License: LGPL-3.0-or-later Files: src/retroshare-gui/src/TorControl/ Copyright: 2014, John Brooks @@ -28,8 +32,8 @@ Copyright: 2013 Jeff Weinstein License: MIT Files: * -Copyright: 2007-2018, Retroshare Team -License: AGPL-3+ +Copyright: 2007-2019, Retroshare Team +License: AGPL-3.0-only ####### # 
TODO @@ -56,7 +60,7 @@ License: Apache-2.0 See the License for the specific language governing permissions and limitations under the License. -License: LGPL-3+ +License: LGPL-3.0-or-later This program is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the @@ -75,7 +79,7 @@ License: LGPL-3+ OpenSSL that use the same license as OpenSSL), and distribute linked combinations including the two. -License: AGPL-3+ +License: AGPL-3.0-or-later This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the @@ -86,11 +90,36 @@ License: AGPL-3+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. . - You should have received a copy of the GNU Lesser General Public License - along with this program. If not, see . + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . . As a special exception, the copyright holders give permission to link the - code of portions of this program with the OpenSSL library under certain + code or portions of this program with the OpenSSL library under certain + conditions as described in each individual source file and distribute + linked combinations including the program with the OpenSSL library. You + must comply with the GNU Affero General Public License in all respects for + all of the code used other than as permitted herein. If you modify file(s) + with this exception, you may extend this exception to your version of the + file(s), but you are not obligated to do so. If you do not wish to do so, + delete this exception statement from your version. 
If you delete this + exception statement from all source files in the program, then also delete + it in the license file. + +License: AGPL-3.0-only + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License version 3 as + published by the Free Software Foundation. + . + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + . + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + . + As a special exception, the copyright holders give permission to link the + code or portions of this program with the OpenSSL library under certain conditions as described in each individual source file and distribute linked combinations including the program with the OpenSSL library. You must comply with the GNU Affero General Public License in all respects for diff --git a/libretroshare/src/deep_search/channelsindex.cpp b/libretroshare/src/deep_search/channelsindex.cpp new file mode 100644 index 000000000..cd1c374fc --- /dev/null +++ b/libretroshare/src/deep_search/channelsindex.cpp @@ -0,0 +1,230 @@ +/******************************************************************************* + * RetroShare full text indexing and search implementation based on Xapian * + * * + * Copyright (C) 2018-2019 Gioacchino Mazzurco * + * Copyright (C) 2019 Asociación Civil Altermundi * + * * + * This program is free software: you can redistribute it and/or modify * + * it under the terms of the GNU Affero General Public License version 3 as * + * published by the Free Software Foundation. 
* + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Affero General Public License for more details. * + * * + * You should have received a copy of the GNU Affero General Public License * + * along with this program. If not, see . * + * * + *******************************************************************************/ + +#include "deep_search/channelsindex.hpp" +#include "deep_search/commonutils.hpp" + +uint32_t DeepChannelsIndex::search( + const std::string& queryStr, + std::vector& results, uint32_t maxResults ) +{ + results.clear(); + + std::unique_ptr dbPtr( + DeepSearch::openReadOnlyDatabase(dbPath()) ); + if(!dbPtr) return 0; + + Xapian::Database& db(*dbPtr); + + // Set up a QueryParser with a stemmer and suitable prefixes. + Xapian::QueryParser queryparser; + //queryparser.set_stemmer(Xapian::Stem("en")); + queryparser.set_stemming_strategy(queryparser.STEM_SOME); + // Start of prefix configuration. + //queryparser.add_prefix("title", "S"); + //queryparser.add_prefix("description", "XD"); + // End of prefix configuration. + + // And parse the query. + Xapian::Query query = queryparser.parse_query(queryStr); + + // Use an Enquire object on the database to run the query. + Xapian::Enquire enquire(db); + enquire.set_query(query); + + Xapian::MSet mset = enquire.get_mset( + 0, maxResults ? 
maxResults : db.get_doccount() ); + + for ( Xapian::MSetIterator m = mset.begin(); m != mset.end(); ++m ) + { + const Xapian::Document& doc = m.get_document(); + DeepChannelsSearchResult s; + s.mUrl = doc.get_value(URL_VALUENO); +#if XAPIAN_AT_LEAST(1,3,5) + s.mSnippet = mset.snippet(doc.get_data()); +#endif // XAPIAN_AT_LEAST(1,3,5) + results.push_back(s); + } + + return static_cast(results.size()); +} + +void DeepChannelsIndex::indexChannelGroup(const RsGxsChannelGroup& chan) +{ + std::unique_ptr dbPtr( + DeepSearch::openWritableDatabase( + dbPath(), Xapian::DB_CREATE_OR_OPEN ) ); + if(!dbPtr) return; + + Xapian::WritableDatabase& db(*dbPtr); + + // Set up a TermGenerator that we'll use in indexing. + Xapian::TermGenerator termgenerator; + //termgenerator.set_stemmer(Xapian::Stem("en")); + + // We make a document and tell the term generator to use this. + Xapian::Document doc; + termgenerator.set_document(doc); + + // Index each field with a suitable prefix. + termgenerator.index_text(chan.mMeta.mGroupName, 1, "G"); + termgenerator.index_text( + DeepSearch::timetToXapianDate(chan.mMeta.mPublishTs), 1, "D" ); + termgenerator.index_text(chan.mDescription, 1, "XD"); + + // Index fields without prefixes for general search. 
+ termgenerator.index_text(chan.mMeta.mGroupName); + termgenerator.increase_termpos(); + termgenerator.index_text(chan.mDescription); + + RsUrl chanUrl; chanUrl + .setScheme("retroshare").setPath("/channel") + .setQueryKV("id", chan.mMeta.mGroupId.toStdString()); + const std::string idTerm("Q" + chanUrl.toString()); + + chanUrl.setQueryKV("publishTs", std::to_string(chan.mMeta.mPublishTs)); + chanUrl.setQueryKV("name", chan.mMeta.mGroupName); + if(!chan.mMeta.mAuthorId.isNull()) + chanUrl.setQueryKV("authorId", chan.mMeta.mAuthorId.toStdString()); + if(chan.mMeta.mSignFlags) + chanUrl.setQueryKV( "signFlags", + std::to_string(chan.mMeta.mSignFlags) ); + std::string rsLink(chanUrl.toString()); + + // store the RS link so we are able to retrive it on matching search + doc.add_value(URL_VALUENO, rsLink); + + // Store some fields for display purposes. + doc.set_data(chan.mMeta.mGroupName + "\n" + chan.mDescription); + + // We use the identifier to ensure each object ends up in the + // database only once no matter how many times we run the + // indexer. "Q" prefix is a Xapian convention for unique id term. + doc.add_boolean_term(idTerm); + db.replace_document(idTerm, doc); +} + +void DeepChannelsIndex::removeChannelFromIndex(RsGxsGroupId grpId) +{ + // "Q" prefix is a Xapian convention for unique id term. + RsUrl chanUrl; chanUrl + .setScheme("retroshare").setPath("/channel") + .setQueryKV("id", grpId.toStdString()); + std::string idTerm("Q" + chanUrl.toString()); + + std::unique_ptr dbPtr( + DeepSearch::openWritableDatabase( + dbPath(), Xapian::DB_CREATE_OR_OPEN ) ); + if(!dbPtr) return; + + Xapian::WritableDatabase& db(*dbPtr); + db.delete_document(idTerm); +} + +void DeepChannelsIndex::indexChannelPost(const RsGxsChannelPost& post) +{ + std::unique_ptr dbPtr( + DeepSearch::openWritableDatabase( + dbPath(), Xapian::DB_CREATE_OR_OPEN ) ); + if(!dbPtr) return; + + Xapian::WritableDatabase& db(*dbPtr); + + // Set up a TermGenerator that we'll use in indexing. 
+ Xapian::TermGenerator termgenerator; + //termgenerator.set_stemmer(Xapian::Stem("en")); + + // We make a document and tell the term generator to use this. + Xapian::Document doc; + termgenerator.set_document(doc); + + // Index each field with a suitable prefix. + termgenerator.index_text(post.mMeta.mMsgName, 1, "S"); + termgenerator.index_text( + DeepSearch::timetToXapianDate(post.mMeta.mPublishTs), 1, "D" ); + + // TODO: we should strip out HTML tags instead of skipping indexing + // Avoid indexing HTML + bool isPlainMsg = + post.mMsg[0] != '<' || post.mMsg[post.mMsg.size() - 1] != '>'; + + if(isPlainMsg) + termgenerator.index_text(post.mMsg, 1, "XD"); + + // Index fields without prefixes for general search. + termgenerator.index_text(post.mMeta.mMsgName); + if(isPlainMsg) + { + termgenerator.increase_termpos(); + termgenerator.index_text(post.mMsg); + } + + for(const RsGxsFile& attachment : post.mFiles) + { + termgenerator.index_text(attachment.mName, 1, "F"); + + termgenerator.increase_termpos(); + termgenerator.index_text(attachment.mName); + } + + // We use the identifier to ensure each object ends up in the + // database only once no matter how many times we run the + // indexer. + RsUrl postUrl; postUrl + .setScheme("retroshare").setPath("/channel") + .setQueryKV("id", post.mMeta.mGroupId.toStdString()) + .setQueryKV("msgid", post.mMeta.mMsgId.toStdString()); + std::string idTerm("Q" + postUrl.toString()); + + postUrl.setQueryKV("publishTs", std::to_string(post.mMeta.mPublishTs)); + postUrl.setQueryKV("name", post.mMeta.mMsgName); + postUrl.setQueryKV("authorId", post.mMeta.mAuthorId.toStdString()); + std::string rsLink(postUrl.toString()); + + // store the RS link so we are able to retrive it on matching search + doc.add_value(URL_VALUENO, rsLink); + + // Store some fields for display purposes. 
+ if(isPlainMsg) + doc.set_data(post.mMeta.mMsgName + "\n" + post.mMsg); + else doc.set_data(post.mMeta.mMsgName); + + doc.add_boolean_term(idTerm); + db.replace_document(idTerm, doc); +} + +void DeepChannelsIndex::removeChannelPostFromIndex( + RsGxsGroupId grpId, RsGxsMessageId msgId ) +{ + RsUrl postUrl; postUrl + .setScheme("retroshare").setPath("/channel") + .setQueryKV("id", grpId.toStdString()) + .setQueryKV("msgid", msgId.toStdString()); + // "Q" prefix is a Xapian convention for unique id term. + std::string idTerm("Q" + postUrl.toString()); + + std::unique_ptr dbPtr( + DeepSearch::openWritableDatabase( + dbPath(), Xapian::DB_CREATE_OR_OPEN ) ); + if(!dbPtr) return; + + Xapian::WritableDatabase& db(*dbPtr); + db.delete_document(idTerm); +} diff --git a/libretroshare/src/deep_search/channelsindex.hpp b/libretroshare/src/deep_search/channelsindex.hpp new file mode 100644 index 000000000..e9e015b7b --- /dev/null +++ b/libretroshare/src/deep_search/channelsindex.hpp @@ -0,0 +1,77 @@ +/******************************************************************************* + * RetroShare full text indexing and search implementation based on Xapian * + * * + * Copyright (C) 2018-2019 Gioacchino Mazzurco * + * Copyright (C) 2019 Asociación Civil Altermundi * + * * + * This program is free software: you can redistribute it and/or modify * + * it under the terms of the GNU Affero General Public License version 3 as * + * published by the Free Software Foundation. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Affero General Public License for more details. * + * * + * You should have received a copy of the GNU Affero General Public License * + * along with this program. If not, see . 
* + * * + *******************************************************************************/ +#pragma once + +#include +#include + +#include "util/rstime.h" +#include "retroshare/rsgxschannels.h" +#include "retroshare/rsinit.h" +#include "util/rsurl.h" + +struct DeepChannelsSearchResult +{ + std::string mUrl; + double mWeight; + std::string mSnippet; +}; + +struct DeepChannelsIndex +{ + /** + * @brief Search indexed GXS groups and messages + * @param[in] maxResults maximum number of acceptable search results, 0 for + * no limits + * @return search results count + */ + static uint32_t search( const std::string& queryStr, + std::vector& results, + uint32_t maxResults = 100 ); + + static void indexChannelGroup(const RsGxsChannelGroup& chan); + + static void removeChannelFromIndex(RsGxsGroupId grpId); + + static void indexChannelPost(const RsGxsChannelPost& post); + + static void removeChannelPostFromIndex( + RsGxsGroupId grpId, RsGxsMessageId msgId ); + + static uint32_t indexFile(const std::string& path); + +private: + + enum : Xapian::valueno + { + /// Used to store retroshare url of indexed documents + URL_VALUENO, + + /// @see Xapian::BAD_VALUENO + BAD_VALUENO = Xapian::BAD_VALUENO + }; + + static const std::string& dbPath() + { + static const std::string dbDir = + RsAccounts::AccountDirectory() + "/deep_search_xapian_db"; + return dbDir; + } +}; diff --git a/libretroshare/src/deep_search/commonutils.cpp b/libretroshare/src/deep_search/commonutils.cpp new file mode 100644 index 000000000..eecbd4ec6 --- /dev/null +++ b/libretroshare/src/deep_search/commonutils.cpp @@ -0,0 +1,93 @@ +/******************************************************************************* + * RetroShare full text indexing and search implementation based on Xapian * + * * + * Copyright (C) 2018-2019 Gioacchino Mazzurco * + * Copyright (C) 2019 Asociación Civil Altermundi * + * * + * This program is free software: you can redistribute it and/or modify * + * it under the terms of the GNU Affero 
General Public License version 3 as * + * published by the Free Software Foundation. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Affero General Public License for more details. * + * * + * You should have received a copy of the GNU Affero General Public License * + * along with this program. If not, see <https://www.gnu.org/licenses/>. * + * * + *******************************************************************************/ + +#include "deep_search/commonutils.hpp" +#include "util/stacktrace.h" +#include "util/rsdebug.h" + +namespace DeepSearch +{ + +std::unique_ptr<Xapian::WritableDatabase> openWritableDatabase( + const std::string& path, int flags, int blockSize ) +{ /* Returns the opened writable database; on any exception logs and returns nullptr. */ + try + { + std::unique_ptr<Xapian::WritableDatabase> dbPtr( + new Xapian::WritableDatabase(path, flags, blockSize) ); + return dbPtr; + } + catch(const Xapian::DatabaseLockError&) /* caught by const reference: no copy, no slicing */ + { + RsErr() << __PRETTY_FUNCTION__ << " Failed aquiring Xapian DB lock " + << path << std::endl; + print_stacktrace(); + } + catch(...) /* every other failure is reported as probable corruption */ + { + RsErr() << __PRETTY_FUNCTION__ << " Xapian DB is apparently corrupted " + << "deleting it might help without causing any harm: " + << path << std::endl; + print_stacktrace(); + } + + return nullptr; /* reached only when one of the handlers above ran */ +} + +std::unique_ptr<Xapian::Database> openReadOnlyDatabase( + const std::string& path, int flags ) +{ /* Returns the opened read-only database; on any exception logs and returns nullptr. */ + try + { + std::unique_ptr<Xapian::Database> dbPtr( + new Xapian::Database(path, flags) ); + return dbPtr; + } + catch(const Xapian::DatabaseOpeningError& e) /* treated as "nothing indexed yet", hence only a warning */ + { + RsWarn() << __PRETTY_FUNCTION__ << " " << e.get_msg() + << ", probably nothing has been indexed yet." << std::endl; + } + catch(const Xapian::DatabaseLockError&) /* caught by const reference: no copy, no slicing */ + { + RsErr() << __PRETTY_FUNCTION__ << " Failed aquiring Xapian DB lock " + << path << std::endl; + print_stacktrace(); + } + catch(...) 
+ { + RsErr() << __PRETTY_FUNCTION__ << " Xapian DB is apparently corrupted " + << "deleting it might help without causing any harm: " + << path << std::endl; + print_stacktrace(); + } + + return nullptr; +} + +std::string timetToXapianDate(const rstime_t& time) +{ + char date[] = "YYYYMMDD\0"; + time_t tTime = static_cast(time); + std::strftime(date, 9, "%Y%m%d", std::gmtime(&tTime)); + return date; +} + +} diff --git a/libretroshare/src/deep_search/commonutils.hpp b/libretroshare/src/deep_search/commonutils.hpp new file mode 100644 index 000000000..28961bc09 --- /dev/null +++ b/libretroshare/src/deep_search/commonutils.hpp @@ -0,0 +1,45 @@ +/******************************************************************************* + * RetroShare full text indexing and search implementation based on Xapian * + * * + * Copyright (C) 2018-2019 Gioacchino Mazzurco * + * Copyright (C) 2019 Asociación Civil Altermundi * + * * + * This program is free software: you can redistribute it and/or modify * + * it under the terms of the GNU Affero General Public License version 3 as * + * published by the Free Software Foundation. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Affero General Public License for more details. * + * * + * You should have received a copy of the GNU Affero General Public License * + * along with this program. If not, see . 
* + * * + *******************************************************************************/ +#pragma once + +#include +#include + +#include "util/rstime.h" + +#ifndef XAPIAN_AT_LEAST +#define XAPIAN_AT_LEAST(A,B,C) (XAPIAN_MAJOR_VERSION > (A) || \ + (XAPIAN_MAJOR_VERSION == (A) && \ + (XAPIAN_MINOR_VERSION > (B) || \ + (XAPIAN_MINOR_VERSION == (B) && XAPIAN_REVISION >= (C))))) +#endif // ndef XAPIAN_AT_LEAST + +namespace DeepSearch +{ + +std::unique_ptr openWritableDatabase( + const std::string& path, int flags = 0, int blockSize = 0 ); + +std::unique_ptr openReadOnlyDatabase( + const std::string& path, int flags = 0 ); + +std::string timetToXapianDate(const rstime_t& time); + +} diff --git a/libretroshare/src/deep_search/deep_search.h b/libretroshare/src/deep_search/deep_search.h deleted file mode 100644 index b67c93055..000000000 --- a/libretroshare/src/deep_search/deep_search.h +++ /dev/null @@ -1,276 +0,0 @@ -/******************************************************************************* - * libretroshare/src/crypto: crypto.h * - * * - * libretroshare: retroshare core library * - * * - * Copyright (C) 2018 Gioacchino Mazzurco * - * * - * This program is free software: you can redistribute it and/or modify * - * it under the terms of the GNU Lesser General Public License as * - * published by the Free Software Foundation, either version 3 of the * - * License, or (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, * - * but WITHOUT ANY WARRANTY; without even the implied warranty of * - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * - * GNU Lesser General Public License for more details. * - * * - * You should have received a copy of the GNU Lesser General Public License * - * along with this program. If not, see . 
* - * * - *******************************************************************************/ -#pragma once - -#include "util/rstime.h" -#include -#include - -#include "retroshare/rsgxschannels.h" -#include "retroshare/rsinit.h" -#include "util/rsurl.h" - -#ifndef XAPIAN_AT_LEAST -#define XAPIAN_AT_LEAST(A,B,C) (XAPIAN_MAJOR_VERSION > (A) || \ - (XAPIAN_MAJOR_VERSION == (A) && \ - (XAPIAN_MINOR_VERSION > (B) || \ - (XAPIAN_MINOR_VERSION == (B) && XAPIAN_REVISION >= (C))))) -#endif // ndef XAPIAN_AT_LEAST - -struct DeepSearch -{ - struct SearchResult - { - std::string mUrl; - std::string mSnippet; - }; - - /** - * @param[in] maxResults maximum number of acceptable search results, 0 for - * no limits - * @return search results count - */ - static uint32_t search( const std::string& queryStr, - std::vector& results, - uint32_t maxResults = 100 ) - { - results.clear(); - - Xapian::Database db; - - // Open the database we're going to search. - try { db = Xapian::Database(dbPath()); } - catch(Xapian::DatabaseOpeningError e) - { - std::cerr << __PRETTY_FUNCTION__ << " " << e.get_msg() - << ", probably nothing has been indexed yet."<< std::endl; - return 0; - } - catch(Xapian::DatabaseError e) - { - std::cerr << __PRETTY_FUNCTION__ << " " << e.get_msg() - << " this is fishy, maybe " << dbPath() - << " has been corrupted (deleting it may help in that " - << "case without loosing data)" << std::endl; - return 0; - } - - // Set up a QueryParser with a stemmer and suitable prefixes. - Xapian::QueryParser queryparser; - //queryparser.set_stemmer(Xapian::Stem("en")); - queryparser.set_stemming_strategy(queryparser.STEM_SOME); - // Start of prefix configuration. - //queryparser.add_prefix("title", "S"); - //queryparser.add_prefix("description", "XD"); - // End of prefix configuration. - - // And parse the query. - Xapian::Query query = queryparser.parse_query(queryStr); - - // Use an Enquire object on the database to run the query. 
- Xapian::Enquire enquire(db); - enquire.set_query(query); - - Xapian::MSet mset = enquire.get_mset( - 0, maxResults ? maxResults : db.get_doccount() ); - - for ( Xapian::MSetIterator m = mset.begin(); m != mset.end(); ++m ) - { - const Xapian::Document& doc = m.get_document(); - SearchResult s; - s.mUrl = doc.get_value(URL_VALUENO); -#if XAPIAN_AT_LEAST(1,3,5) - s.mSnippet = mset.snippet(doc.get_data()); -#endif // XAPIAN_AT_LEAST(1,3,5) - results.push_back(s); - } - - return results.size(); - } - - - static void indexChannelGroup(const RsGxsChannelGroup& chan) - { - Xapian::WritableDatabase db(dbPath(), Xapian::DB_CREATE_OR_OPEN); - - // Set up a TermGenerator that we'll use in indexing. - Xapian::TermGenerator termgenerator; - //termgenerator.set_stemmer(Xapian::Stem("en")); - - // We make a document and tell the term generator to use this. - Xapian::Document doc; - termgenerator.set_document(doc); - - // Index each field with a suitable prefix. - termgenerator.index_text(chan.mMeta.mGroupName, 1, "G"); - termgenerator.index_text(timetToXapianDate(chan.mMeta.mPublishTs), 1, "D"); - termgenerator.index_text(chan.mDescription, 1, "XD"); - - // Index fields without prefixes for general search. 
- termgenerator.index_text(chan.mMeta.mGroupName); - termgenerator.increase_termpos(); - termgenerator.index_text(chan.mDescription); - - RsUrl chanUrl; chanUrl - .setScheme("retroshare").setPath("/channel") - .setQueryKV("id", chan.mMeta.mGroupId.toStdString()); - const std::string idTerm("Q" + chanUrl.toString()); - - chanUrl.setQueryKV("publishTs", std::to_string(chan.mMeta.mPublishTs)); - chanUrl.setQueryKV("name", chan.mMeta.mGroupName); - if(!chan.mMeta.mAuthorId.isNull()) - chanUrl.setQueryKV("authorId", chan.mMeta.mAuthorId.toStdString()); - if(chan.mMeta.mSignFlags) - chanUrl.setQueryKV( "signFlags", - std::to_string(chan.mMeta.mSignFlags) ); - std::string rsLink(chanUrl.toString()); - - // store the RS link so we are able to retrive it on matching search - doc.add_value(URL_VALUENO, rsLink); - - // Store some fields for display purposes. - doc.set_data(chan.mMeta.mGroupName + "\n" + chan.mDescription); - - // We use the identifier to ensure each object ends up in the - // database only once no matter how many times we run the - // indexer. "Q" prefix is a Xapian convention for unique id term. - doc.add_boolean_term(idTerm); - db.replace_document(idTerm, doc); - } - - static void removeChannelFromIndex(RsGxsGroupId grpId) - { - // "Q" prefix is a Xapian convention for unique id term. - RsUrl chanUrl; chanUrl - .setScheme("retroshare").setPath("/channel") - .setQueryKV("id", grpId.toStdString()); - std::string idTerm("Q" + chanUrl.toString()); - - Xapian::WritableDatabase db(dbPath(), Xapian::DB_CREATE_OR_OPEN); - db.delete_document(idTerm); - } - - static void indexChannelPost(const RsGxsChannelPost& post) - { - Xapian::WritableDatabase db(dbPath(), Xapian::DB_CREATE_OR_OPEN); - - // Set up a TermGenerator that we'll use in indexing. - Xapian::TermGenerator termgenerator; - //termgenerator.set_stemmer(Xapian::Stem("en")); - - // We make a document and tell the term generator to use this. 
- Xapian::Document doc; - termgenerator.set_document(doc); - - // Index each field with a suitable prefix. - termgenerator.index_text(post.mMeta.mMsgName, 1, "S"); - termgenerator.index_text(timetToXapianDate(post.mMeta.mPublishTs), 1, "D"); - - // Avoid indexing HTML - bool isPlainMsg = post.mMsg[0] != '<' || post.mMsg[post.mMsg.size() - 1] != '>'; - - if(isPlainMsg) - termgenerator.index_text(post.mMsg, 1, "XD"); - - // Index fields without prefixes for general search. - termgenerator.index_text(post.mMeta.mMsgName); - if(isPlainMsg) - { - termgenerator.increase_termpos(); - termgenerator.index_text(post.mMsg); - } - - for(const RsGxsFile& attachment : post.mFiles) - { - termgenerator.index_text(attachment.mName, 1, "F"); - - termgenerator.increase_termpos(); - termgenerator.index_text(attachment.mName); - } - - // We use the identifier to ensure each object ends up in the - // database only once no matter how many times we run the - // indexer. - RsUrl postUrl; postUrl - .setScheme("retroshare").setPath("/channel") - .setQueryKV("id", post.mMeta.mGroupId.toStdString()) - .setQueryKV("msgid", post.mMeta.mMsgId.toStdString()); - std::string idTerm("Q" + postUrl.toString()); - - postUrl.setQueryKV("publishTs", std::to_string(post.mMeta.mPublishTs)); - postUrl.setQueryKV("name", post.mMeta.mMsgName); - postUrl.setQueryKV("authorId", post.mMeta.mAuthorId.toStdString()); - std::string rsLink(postUrl.toString()); - - // store the RS link so we are able to retrive it on matching search - doc.add_value(URL_VALUENO, rsLink); - - // Store some fields for display purposes. 
- if(isPlainMsg) - doc.set_data(post.mMeta.mMsgName + "\n" + post.mMsg); - else doc.set_data(post.mMeta.mMsgName); - - doc.add_boolean_term(idTerm); - db.replace_document(idTerm, doc); - } - - static void removeChannelPostFromIndex( - RsGxsGroupId grpId, RsGxsMessageId msgId ) - { - RsUrl postUrl; postUrl - .setScheme("retroshare").setPath("/channel") - .setQueryKV("id", grpId.toStdString()) - .setQueryKV("msgid", msgId.toStdString()); - // "Q" prefix is a Xapian convention for unique id term. - std::string idTerm("Q" + postUrl.toString()); - - Xapian::WritableDatabase db(dbPath(), Xapian::DB_CREATE_OR_OPEN); - db.delete_document(idTerm); - } - -private: - - enum : Xapian::valueno - { - /// Used to store retroshare url of indexed documents - URL_VALUENO, - - /// @see Xapian::BAD_VALUENO - BAD_VALUENO = Xapian::BAD_VALUENO - }; - - static const std::string& dbPath() - { - static const std::string dbDir = - RsAccounts::AccountDirectory() + "/deep_search_xapian_db"; - return dbDir; - } - - static std::string timetToXapianDate(const rstime_t& time) - { - char date[] = "YYYYMMDD\0"; - time_t tTime = static_cast(time); - std::strftime(date, 9, "%Y%m%d", std::gmtime(&tTime)); - return date; - } -}; - diff --git a/libretroshare/src/deep_search/filesindex.cpp b/libretroshare/src/deep_search/filesindex.cpp new file mode 100644 index 000000000..f59069a39 --- /dev/null +++ b/libretroshare/src/deep_search/filesindex.cpp @@ -0,0 +1,143 @@ +/******************************************************************************* + * RetroShare full text indexing and search implementation based on Xapian * + * * + * Copyright (C) 2018-2019 Gioacchino Mazzurco * + * Copyright (C) 2019 Asociación Civil Altermundi * + * * + * This program is free software: you can redistribute it and/or modify * + * it under the terms of the GNU Affero General Public License version 3 as * + * published by the Free Software Foundation. 
* + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Affero General Public License for more details. * + * * + * You should have received a copy of the GNU Affero General Public License * + * along with this program. If not, see . * + * * + *******************************************************************************/ + +#include "deep_search/filesindex.hpp" +#include "deep_search/commonutils.hpp" +#include "util/rsdebug.h" +#include "retroshare/rsinit.h" + +#include + +/*static*/ std::multimap +DeepFilesIndex::indexersRegister = {}; + +bool DeepFilesIndex::indexFile( + const std::string& path, const std::string& name, + const RsFileHash& hash ) +{ + auto dbPtr = DeepSearch::openWritableDatabase( + mDbPath, Xapian::DB_CREATE_OR_OPEN ); + if(!dbPtr) return false; + Xapian::WritableDatabase& db(*dbPtr); + + if(db.term_exists("Q" + hash.toStdString())) + { + Dbg3() << __PRETTY_FUNCTION__ << " skipping laready indexed file: " + << hash << " " << name << std::endl; + return true; + } + + // Set up a TermGenerator that we'll use in indexing. + Xapian::TermGenerator termgenerator; + //termgenerator.set_stemmer(Xapian::Stem("en")); + + // We make a document and tell the term generator to use this. 
+ Xapian::Document doc; + termgenerator.set_document(doc); + + for(auto& indexerPair : indexersRegister) + if(indexerPair.second(path, name, termgenerator, doc) > 50) + break; + + const std::string hashString = hash.toStdString(); + const std::string idTerm("Q" + hashString); + doc.add_boolean_term(idTerm); + termgenerator.index_text(name, 1, "N"); + termgenerator.index_text(name); + doc.add_value(FILE_HASH_VALUENO, hashString); + db.replace_document(idTerm, doc); + + return true; +} + +bool DeepFilesIndex::removeFileFromIndex(const RsFileHash& hash) +{ + Dbg3() << __PRETTY_FUNCTION__ << " removing file from index: " + << hash << std::endl; + + std::unique_ptr db = + DeepSearch::openWritableDatabase(mDbPath, Xapian::DB_CREATE_OR_OPEN); + if(!db) return false; + + db->delete_document("Q" + hash.toStdString()); + return true; +} + +/*static*/ std::string DeepFilesIndex::dbDefaultPath() +{ return RsAccounts::AccountDirectory() + "/deep_files_index_xapian_db"; } + +/*static*/ bool DeepFilesIndex::registerIndexer( + int order, const DeepFilesIndex::IndexerFunType& indexerFun ) +{ + Dbg1() << __PRETTY_FUNCTION__ << " " << order << std::endl; + + indexersRegister.insert(std::make_pair(order, indexerFun)); + return true; +} + +uint32_t DeepFilesIndex::search( + const std::string& queryStr, + std::vector& results, uint32_t maxResults ) +{ + results.clear(); + + auto dbPtr = DeepSearch::openReadOnlyDatabase(mDbPath); + if(!dbPtr) return 0; + Xapian::Database& db(*dbPtr); + + // Set up a QueryParser with a stemmer and suitable prefixes. + Xapian::QueryParser queryparser; + //queryparser.set_stemmer(Xapian::Stem("en")); + queryparser.set_stemming_strategy(queryparser.STEM_SOME); + // Start of prefix configuration. + //queryparser.add_prefix("title", "S"); + //queryparser.add_prefix("description", "XD"); + // End of prefix configuration. + + // And parse the query. 
+ Xapian::Query query = queryparser.parse_query(queryStr); + + // Use an Enquire object on the database to run the query. + Xapian::Enquire enquire(db); + enquire.set_query(query); + + Xapian::MSet mset = enquire.get_mset( + 0, maxResults ? maxResults : db.get_doccount() ); + + for ( Xapian::MSetIterator m = mset.begin(); m != mset.end(); ++m ) + { + const Xapian::Document& doc = m.get_document(); + DeepFilesSearchResult s; + s.mFileHash = RsFileHash(doc.get_value(FILE_HASH_VALUENO)); + s.mWeight = m.get_weight(); +#if XAPIAN_AT_LEAST(1,3,5) + s.mSnippet = mset.snippet(doc.get_data()); +#endif // XAPIAN_AT_LEAST(1,3,5) + results.push_back(s); + } + + return static_cast(results.size()); +} + + +#ifdef RS_DEEP_FILES_INDEX_OGG +# include "deep_search/filesoggindexer.hpp" +static RsDeepOggFileIndexer oggFileIndexer; +#endif // def RS_DEEP_FILES_INDEX_OGG diff --git a/libretroshare/src/deep_search/filesindex.hpp b/libretroshare/src/deep_search/filesindex.hpp new file mode 100644 index 000000000..feb172ddc --- /dev/null +++ b/libretroshare/src/deep_search/filesindex.hpp @@ -0,0 +1,95 @@ +/******************************************************************************* + * RetroShare full text indexing and search implementation based on Xapian * + * * + * Copyright (C) 2018-2019 Gioacchino Mazzurco * + * Copyright (C) 2019 Asociación Civil Altermundi * + * * + * This program is free software: you can redistribute it and/or modify * + * it under the terms of the GNU Affero General Public License version 3 as * + * published by the Free Software Foundation. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Affero General Public License for more details. * + * * + * You should have received a copy of the GNU Affero General Public License * + * along with this program. If not, see . 
* + * * + *******************************************************************************/ +#pragma once + +#include "retroshare/rstypes.h" +#include "util/rsdebug.h" + +#include +#include +#include +#include +#include +#include + +struct DeepFilesSearchResult +{ + DeepFilesSearchResult() : mWeight(0) {} + + RsFileHash mFileHash; + double mWeight; + std::string mSnippet; +}; + +class DeepFilesIndex +{ +public: + DeepFilesIndex(const std::string& dbPath) : mDbPath(dbPath) {} + + /** + * @brief Search indexed files + * @param[in] maxResults maximum number of acceptable search results, 0 for + * no limits + * @return search results count + */ + uint32_t search( const std::string& queryStr, + std::vector& results, + uint32_t maxResults = 100 ); + + /** + * @return false if file could not be indexed because of error or + * unsupported type, true otherwise. + */ + bool indexFile( + const std::string& path, const std::string& name, + const RsFileHash& hash ); + + /** + * @brief Remove file entry from database + * @return false on error, true otherwise. 
+ */ + bool removeFileFromIndex(const RsFileHash& hash); + + static std::string dbDefaultPath(); + + using IndexerFunType = std::function< + uint32_t( const std::string& path, const std::string& name, + Xapian::TermGenerator& xTG, Xapian::Document& xDoc ) >; + + static bool registerIndexer( + int order, const IndexerFunType& indexerFun ); + +private: + enum : Xapian::valueno + { + /// Used to store RsFileHash of indexed documents + FILE_HASH_VALUENO, + + /// @see Xapian::BAD_VALUENO + BAD_VALUENO = Xapian::BAD_VALUENO + }; + + const std::string mDbPath; + + /** Storage for indexers function by order */ + static std::multimap indexersRegister; + + RS_SET_CONTEXT_DEBUG_LEVEL(4) +}; diff --git a/libretroshare/src/deep_search/filesoggindexer.hpp b/libretroshare/src/deep_search/filesoggindexer.hpp new file mode 100644 index 000000000..babaa2b71 --- /dev/null +++ b/libretroshare/src/deep_search/filesoggindexer.hpp @@ -0,0 +1,97 @@ +/******************************************************************************* + * RetroShare full text indexing and search implementation based on Xapian * + * * + * Copyright (C) 2018-2019 Gioacchino Mazzurco * + * Copyright (C) 2019 Asociación Civil Altermundi * + * * + * This program is free software: you can redistribute it and/or modify * + * it under the terms of the GNU Affero General Public License version 3 as * + * published by the Free Software Foundation. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Affero General Public License for more details. * + * * + * You should have received a copy of the GNU Affero General Public License * + * along with this program. If not, see . 
* + * * + *******************************************************************************/ + +#include "deep_search/filesindex.hpp" +#include "util/rsdebug.h" + +#include +#include +#include +#include + +struct RsDeepOggFileIndexer +{ + RsDeepOggFileIndexer() + { + DeepFilesIndex::registerIndexer(30, indexOggFile); + } + + static uint32_t indexOggFile( + const std::string& path, const std::string& /*name*/, + Xapian::TermGenerator& xTG, Xapian::Document& xDoc ) + { + Dbg3() << __PRETTY_FUNCTION__ << " " << path << std::endl; + + OggVorbis_File vf; + int ret = ov_fopen(path.c_str(), &vf); + + if(ret == 0 && vf.vc) + { + vorbis_comment& vc = *vf.vc; + std::string docData = xDoc.get_data(); + for (int i = 0; i < vc.comments; ++i) + { + using szt = std::string::size_type; + std::string userComment( + vc.user_comments[i], + static_cast(vc.comment_lengths[i]) ); + + if(userComment.empty()) continue; + + szt equalPos = userComment.find('='); + if(equalPos == std::string::npos) continue; + + std::string tagName = userComment.substr(0, equalPos); + if(tagName.empty()) continue; + + std::string tagValue = userComment.substr(equalPos + 1); + if(tagValue.empty()) continue; + + /* Ogg tags should be uppercases but not all the softwares + * enforce it */ + for (auto& c: tagName) c = static_cast(toupper(c)); + + if(tagName == "ARTIST") + xTG.index_text(tagValue, 1, "A"); + else if (tagName == "DESCRIPTION") + xTG.index_text(tagValue, 1, "XD"); + else if (tagName == "TITLE") + xTG.index_text(tagValue, 1, "S"); + if(tagName.find("COVERART") != tagName.npos) + continue; // Avoid polluting the index with binary data + else if (tagName.find("METADATA_BLOCK_PICTURE") != tagName.npos) + continue; // Avoid polluting the index with binary data + + // Index fields without prefixes for general search. 
+ xTG.increase_termpos(); + xTG.index_text(userComment); + docData += userComment + "\n"; + } + xDoc.set_data(docData); + + ov_clear(&vf); + return 99; + } + + return 0; + } + + RS_SET_CONTEXT_DEBUG_LEVEL(2) +}; diff --git a/libretroshare/src/file_sharing/dir_hierarchy.cc b/libretroshare/src/file_sharing/dir_hierarchy.cc index 6522da2cd..7d7864165 100644 --- a/libretroshare/src/file_sharing/dir_hierarchy.cc +++ b/libretroshare/src/file_sharing/dir_hierarchy.cc @@ -21,15 +21,19 @@ ******************************************************************************/ #include #include + #include "util/rstime.h" #include "util/rsdir.h" #include "util/rsprint.h" #include "retroshare/rsexpr.h" - #include "dir_hierarchy.h" #include "filelist_io.h" #include "file_sharing_defaults.h" +#ifdef RS_DEEP_FILES_INDEX +# include "deep_search/filesindex.hpp" +#endif // def RS_DEEP_FILES_INDEX + //#define DEBUG_DIRECTORY_STORAGE 1 typedef FileListIO::read_error read_error; @@ -391,6 +395,11 @@ void InternalFileHierarchyStorage::deleteFileNode(uint32_t index) { FileEntry& fe(*static_cast(mNodes[index])) ; +#ifdef RS_DEEP_FILES_INDEX + DeepFilesIndex tfi(DeepFilesIndex::dbDefaultPath()); + tfi.removeFileFromIndex(fe.file_hash); +#endif + if(mTotalSize >= fe.file_size) mTotalSize -= fe.file_size ; @@ -753,7 +762,9 @@ int InternalFileHierarchyStorage::searchBoolExp(RsRegularExpression::Expression return 0; } -int InternalFileHierarchyStorage::searchTerms(const std::list& terms, std::list &results) const +int InternalFileHierarchyStorage::searchTerms( + const std::list& terms, + std::list& results ) const { // most entries are likely to be files, so we could do a linear search over the entries tab. // instead we go through the table of hashes. 
diff --git a/libretroshare/src/file_sharing/directory_storage.cc b/libretroshare/src/file_sharing/directory_storage.cc index 2e328e61e..fbbb27e4a 100644 --- a/libretroshare/src/file_sharing/directory_storage.cc +++ b/libretroshare/src/file_sharing/directory_storage.cc @@ -20,6 +20,7 @@ * * ******************************************************************************/ #include + #include "util/rstime.h" #include "serialiser/rstlvbinary.h" #include "retroshare/rspeers.h" @@ -30,6 +31,10 @@ #include "dir_hierarchy.h" #include "filelist_io.h" +#ifdef RS_DEEP_FILES_INDEX +# include "deep_search/filesindex.hpp" +#endif // def RS_DEEP_FILES_INDEX + //#define DEBUG_REMOTE_DIRECTORY_STORAGE 1 /******************************************************************************************************************/ @@ -180,7 +185,9 @@ void DirectoryStorage::print() mFileHierarchy->print(); } -int DirectoryStorage::searchTerms(const std::list& terms, std::list &results) const +int DirectoryStorage::searchTerms( + const std::list& terms, + std::list& results ) const { RS_STACK_MUTEX(mDirStorageMtx) ; return mFileHierarchy->searchTerms(terms,results); @@ -501,18 +508,39 @@ void LocalDirectoryStorage::updateTimeStamps() #endif } } -bool LocalDirectoryStorage::updateHash(const EntryIndex& index, const RsFileHash& hash, bool update_internal_hierarchy) -{ - RS_STACK_MUTEX(mDirStorageMtx) ; - mEncryptedHashes[makeEncryptedHash(hash)] = hash ; - mChanged = true ; +bool LocalDirectoryStorage::updateHash( + const EntryIndex& index, const RsFileHash& hash, + bool update_internal_hierarchy ) +{ + bool ret = false; + + { + RS_STACK_MUTEX(mDirStorageMtx); + + mEncryptedHashes[makeEncryptedHash(hash)] = hash ; + mChanged = true ; #ifdef DEBUG_LOCAL_DIRECTORY_STORAGE - std::cerr << "Updating index of hash " << hash << " update_internal=" << update_internal_hierarchy << std::endl; + std::cerr << "Updating index of hash " << hash << " update_internal=" + << update_internal_hierarchy << std::endl; 
#endif - return (!update_internal_hierarchy)|| mFileHierarchy->updateHash(index,hash); + ret = (!update_internal_hierarchy) || + mFileHierarchy->updateHash(index,hash); + } // RS_STACK_MUTEX(mDirStorageMtx); + +#ifdef RS_DEEP_FILES_INDEX + FileInfo fInfo; + if( ret && getFileInfo(index, fInfo) && + fInfo.storage_permission_flags & DIR_FLAGS_ANONYMOUS_SEARCH ) + { + DeepFilesIndex dfi(DeepFilesIndex::dbDefaultPath()); + ret &= dfi.indexFile(fInfo.path, fInfo.fname, hash); + } +#endif // def RS_DEEP_FILES_INDEX + + return ret; } std::string LocalDirectoryStorage::locked_findRealRootFromVirtualFilename(const std::string& virtual_rootdir) const { diff --git a/libretroshare/src/ft/ftserver.cc b/libretroshare/src/ft/ftserver.cc index 132973bfd..d52340de0 100644 --- a/libretroshare/src/ft/ftserver.cc +++ b/libretroshare/src/ft/ftserver.cc @@ -54,6 +54,10 @@ #include #include "util/rstime.h" +#ifdef RS_DEEP_FILES_INDEX +# include "deep_search/filesindex.hpp" +#endif // def RS_DEEP_FILES_INDEX + /*** * #define SERVER_DEBUG 1 * #define SERVER_DEBUG_CACHE 1 @@ -65,9 +69,26 @@ static const rstime_t FILE_TRANSFER_LOW_PRIORITY_TASKS_PERIOD = 5 ; // low priority tasks handling every 5 seconds static const rstime_t FILE_TRANSFER_MAX_DELAY_BEFORE_DROP_USAGE_RECORD = 10 ; // keep usage records for 10 secs at most. 
+#ifdef RS_DEEP_FILES_INDEX +TurtleFileInfoV2::TurtleFileInfoV2(const DeepFilesSearchResult& dRes) : + fHash(dRes.mFileHash), fWeight(static_cast(dRes.mWeight)), + fSnippet(dRes.mSnippet) +{ + FileInfo fInfo; + rsFiles->FileDetails(fHash, RS_FILE_HINTS_LOCAL, fInfo); + + fSize = fInfo.size; + fName = fInfo.fname; +} +#endif // def RS_DEEP_FILES_INDEX + +TurtleFileInfoV2::~TurtleFileInfoV2() = default; + /* Setup */ -ftServer::ftServer(p3PeerMgr *pm, p3ServiceControl *sc) - : p3Service(),RsServiceSerializer(RS_SERVICE_TYPE_TURTLE), // should be FT, but this is for backward compatibility +ftServer::ftServer(p3PeerMgr *pm, p3ServiceControl *sc): + p3Service(), + // should be FT, but this is for backward compatibility + RsServiceSerializer(RS_SERVICE_TYPE_TURTLE), mPeerMgr(pm), mServiceCtrl(sc), mFileDatabase(NULL), mFtController(NULL), mFtExtra(NULL), @@ -500,15 +521,24 @@ bool ftServer::FileDetails(const RsFileHash &hash, FileSearchFlags hintflags, Fi return false; } -RsItem *ftServer::create_item(uint16_t service,uint8_t item_type) const +RsItem *ftServer::create_item(uint16_t service, uint8_t item_type) const { #ifdef SERVER_DEBUG FTSERVER_DEBUG() << "p3turtle: deserialising packet: " << std::endl ; #endif - if (RS_SERVICE_TYPE_TURTLE != service) + + RsServiceType serviceType = static_cast(service); + switch (serviceType) { - FTSERVER_ERROR() << " Wrong type !!" 
<< std::endl ; - return NULL; /* wrong type */ + /* This one is here for retro-compatibility as turtle routing and file + * trasfer services were just one service before turle service was + * generalized */ + case RsServiceType::TURTLE: break; + case RsServiceType::FILE_TRANSFER: break; + default: + RsErr() << __PRETTY_FUNCTION__ << " Wrong service type: " << service + << std::endl; + return nullptr; } try @@ -521,16 +551,19 @@ RsItem *ftServer::create_item(uint16_t service,uint8_t item_type) const case RS_TURTLE_SUBTYPE_FILE_MAP : return new RsTurtleFileMapItem(); case RS_TURTLE_SUBTYPE_CHUNK_CRC_REQUEST : return new RsTurtleChunkCrcRequestItem(); case RS_TURTLE_SUBTYPE_CHUNK_CRC : return new RsTurtleChunkCrcItem(); - + case static_cast(RsFileItemType::FILE_SEARCH_REQUEST): + return new RsFileSearchRequestItem(); + case static_cast(RsFileItemType::FILE_SEARCH_RESULT): + return new RsFileSearchResultItem(); default: - return NULL ; + return nullptr; } } catch(std::exception& e) { FTSERVER_ERROR() << "(EE) deserialisation error in " << __PRETTY_FUNCTION__ << ": " << e.what() << std::endl; - return NULL ; + return nullptr; } } @@ -1837,7 +1870,12 @@ void ftServer::ftReceiveSearchResult(RsTurtleFTSearchResultItem *item) if(cbpt != mSearchCallbacksMap.end()) { hasCallback = true; - cbpt->second.first(item->result); + + std::vector cRes; + for( const auto& tfiold : item->result) + cRes.push_back(tfiold); + + cbpt->second.first(cRes); } } // end RS_STACK_MUTEX(mSearchCallbacksMapMutex); @@ -1845,6 +1883,99 @@ void ftServer::ftReceiveSearchResult(RsTurtleFTSearchResultItem *item) RsServer::notify()->notifyTurtleSearchResult(item->PeerId(),item->request_id, item->result ); } +bool ftServer::receiveSearchRequest( + unsigned char* searchRequestData, uint32_t searchRequestDataLen, + unsigned char*& searchResultData, uint32_t& searchResultDataLen, + uint32_t& maxAllowsHits ) +{ +#ifdef RS_DEEP_FILES_INDEX + std::unique_ptr recvItem( + RsServiceSerializer::deserialise( + 
searchRequestData, &searchRequestDataLen ) ); + + if(!recvItem) + { + RsWarn() << __PRETTY_FUNCTION__ << " Search request deserialization " + << "failed" << std::endl; + return false; + } + + std::unique_ptr sReqItPtr( + dynamic_cast(recvItem.get()) ); + if(!sReqItPtr) + { + RsWarn() << __PRETTY_FUNCTION__ << " Received an invalid search request" + << " " << *recvItem << std::endl; + return false; + } + recvItem.release(); + + RsFileSearchRequestItem& searchReq(*sReqItPtr); + + std::vector dRes; + DeepFilesIndex dfi(DeepFilesIndex::dbDefaultPath()); + if(dfi.search(searchReq.queryString, dRes, maxAllowsHits) > 0) + { + RsFileSearchResultItem resIt; + + for(const auto& dMatch : dRes) + resIt.mResults.push_back(TurtleFileInfoV2(dMatch)); + + searchResultDataLen = RsServiceSerializer::size(&resIt); + searchResultData = static_cast(malloc(searchResultDataLen)); + return RsServiceSerializer::serialise( + &resIt, searchResultData, &searchResultDataLen ); + } +#endif // def RS_DEEP_FILES_INDEX + + searchResultData = nullptr; + searchResultDataLen = 0; + return false; +} + +void ftServer::receiveSearchResult( + TurtleSearchRequestId requestId, unsigned char* searchResultData, + uint32_t searchResultDataLen ) +{ + if(!searchResultData || !searchResultDataLen) + { + RsWarn() << __PRETTY_FUNCTION__ << " got null paramethers " + << "searchResultData: " << static_cast(searchResultData) + << " searchResultDataLen: " << searchResultDataLen + << " seems someone else in the network have a buggy RetroShare" + << " implementation" << std::endl; + return; + } + + RS_STACK_MUTEX(mSearchCallbacksMapMutex); + auto cbpt = mSearchCallbacksMap.find(requestId); + if(cbpt != mSearchCallbacksMap.end()) + { + RsItem* recvItem = RsServiceSerializer::deserialise( + searchResultData, &searchResultDataLen ); + + if(!recvItem) + { + RsWarn() << __PRETTY_FUNCTION__ << " Search result deserialization " + << "failed" << std::endl; + return; + } + + std::unique_ptr resItPtr( + dynamic_cast(recvItem) ); 
+ + if(!resItPtr) + { + RsWarn() << __PRETTY_FUNCTION__ << " Received invalid search result" + << std::endl; + delete recvItem; + return; + } + + cbpt->second.first(resItPtr->mResults); + } +} + /***************************** CONFIG ****************************/ bool ftServer::addConfiguration(p3ConfigMgr *cfgmgr) @@ -1857,27 +1988,74 @@ bool ftServer::addConfiguration(p3ConfigMgr *cfgmgr) return true; } +#ifdef RS_DEEP_FILES_INDEX +static std::vector xapianQueryKeywords = +{ + " AND ", " OR ", " NOT ", " XOR ", " +", " -", " ( ", " ) ", " NEAR ", + " ADJ ", " \"", "\" " +}; +#endif + bool ftServer::turtleSearchRequest( const std::string& matchString, - const std::function& results)>& multiCallback, + const std::function& results)>& multiCallback, rstime_t maxWait ) { if(matchString.empty()) { - std::cerr << __PRETTY_FUNCTION__ << " match string can't be empty!" - << std::endl; + RsWarn() << __PRETTY_FUNCTION__ << " match string can't be empty!" + << std::endl; return false; } - TurtleRequestId sId = turtleSearch(matchString); +#ifdef RS_DEEP_FILES_INDEX + RsFileSearchRequestItem sItem; + sItem.queryString = matchString; - RS_STACK_MUTEX(mSearchCallbacksMapMutex); - mSearchCallbacksMap.emplace( - sId, - std::make_pair( - multiCallback, - std::chrono::system_clock::now() + - std::chrono::seconds(maxWait) ) ); + uint32_t iSize = RsServiceSerializer::size(&sItem); + uint8_t* iBuf = static_cast(malloc(iSize)); + RsServiceSerializer::serialise(&sItem, iBuf, &iSize); + + Dbg3() << __PRETTY_FUNCTION__ << " sending search request:" << sItem + << std::endl; + + TurtleRequestId xsId = mTurtleRouter->turtleSearch(iBuf, iSize, this); + + { RS_STACK_MUTEX(mSearchCallbacksMapMutex); + mSearchCallbacksMap.emplace( + xsId, + std::make_pair( + multiCallback, + std::chrono::system_clock::now() + + std::chrono::seconds(maxWait) ) ); + } // RS_STACK_MUTEX(mSearchCallbacksMapMutex); + + /* Trick to keep receiving more or less usable results from old peers */ + std::string strippedQuery 
= matchString; + for(const std::string& xKeyword : xapianQueryKeywords) + { + std::string::size_type pos = std::string::npos; + while( (pos = strippedQuery.find(xKeyword)) != std::string::npos ) + strippedQuery.replace(pos, xKeyword.length(), " "); + } + + Dbg3() << __PRETTY_FUNCTION__ << " sending stripped query for " + << "retro-compatibility: " << strippedQuery << std::endl; + + TurtleRequestId sId = mTurtleRouter->turtleSearch(strippedQuery); +#else // def RS_DEEP_FILES_INDEX + TurtleRequestId sId = mTurtleRouter->turtleSearch(matchString); +#endif // def RS_DEEP_FILES_INDEX + + { + RS_STACK_MUTEX(mSearchCallbacksMapMutex); + mSearchCallbacksMap.emplace( + sId, + std::make_pair( + multiCallback, + std::chrono::system_clock::now() + + std::chrono::seconds(maxWait) ) ); + } return true; } @@ -1913,4 +2091,13 @@ bool ftServer::isHashBanned(const RsFileHash& hash) return mFileDatabase->isFileBanned(hash); } +RsFileItem::~RsFileItem() = default; +RsFileItem::RsFileItem(RsFileItemType subtype) : + RsItem( RS_PKT_VERSION_SERVICE, + static_cast(RsServiceType::FILE_TRANSFER), + static_cast(subtype) ) {} + +void RsFileSearchRequestItem::clear() { queryString.clear(); } + +void RsFileSearchResultItem::clear() { mResults.clear(); } diff --git a/libretroshare/src/ft/ftserver.h b/libretroshare/src/ft/ftserver.h index a8879541a..0cc88d6aa 100644 --- a/libretroshare/src/ft/ftserver.h +++ b/libretroshare/src/ft/ftserver.h @@ -46,7 +46,8 @@ #include "turtle/turtleclientservice.h" #include "services/p3service.h" #include "retroshare/rsfiles.h" - +#include "rsitems/rsitem.h" +#include "serialiser/rsserial.h" #include "pqi/pqi.h" #include "pqi/p3cfgmgr.h" @@ -67,7 +68,53 @@ class p3PeerMgr; class p3ServiceControl; class p3FileDatabase; -class ftServer: public p3Service, public RsFiles, public ftDataSend, public RsTurtleClientService, public RsServiceSerializer +enum class RsFileItemType : uint8_t +{ + NONE = 0x00, /// Only to detect ununitialized + FILE_SEARCH_REQUEST = 0x57, + 
FILE_SEARCH_RESULT = 0x58 +}; + +struct RsFileItem : RsItem +{ + ~RsFileItem() override; + +protected: + RsFileItem(RsFileItemType subtype); +}; + +struct RsFileSearchRequestItem : RsFileItem +{ + RsFileSearchRequestItem() : RsFileItem(RsFileItemType::FILE_SEARCH_REQUEST) + { setPriorityLevel(QOS_PRIORITY_RS_TURTLE_SEARCH_REQUEST); } + + std::string queryString; + + void serial_process( RsGenericSerializer::SerializeJob j, + RsGenericSerializer::SerializeContext& ctx ) override + { RS_SERIAL_PROCESS(queryString); } + + void clear() override; +}; + +struct RsFileSearchResultItem : RsFileItem +{ + RsFileSearchResultItem() : RsFileItem(RsFileItemType::FILE_SEARCH_RESULT) + { setPriorityLevel(QOS_PRIORITY_RS_TURTLE_SEARCH_RESULT); } + + std::vector mResults; + + void serial_process( RsGenericSerializer::SerializeJob j, + RsGenericSerializer::SerializeContext& ctx ) override + { RS_SERIAL_PROCESS(mResults); } + + void clear() override; +}; + + +class ftServer : + public p3Service, public RsFiles, public ftDataSend, + public RsTurtleClientService, public RsServiceSerializer { public: @@ -98,7 +145,21 @@ public: uint16_t serviceId() const { return RS_SERVICE_TYPE_FILE_TRANSFER ; } virtual bool handleTunnelRequest(const RsFileHash& hash,const RsPeerId& peer_id) ; virtual void receiveTurtleData(const RsTurtleGenericTunnelItem *item,const RsFileHash& hash,const RsPeerId& virtual_peer_id,RsTurtleGenericTunnelItem::Direction direction) ; - virtual void ftReceiveSearchResult(RsTurtleFTSearchResultItem *item); // We dont use TurtleClientService::receiveSearchResult() because of backward compatibility. 
+ + /// We keep this for retro-compatibility @see RsTurtleClientService + virtual void ftReceiveSearchResult(RsTurtleFTSearchResultItem *item); + + /// @see RsTurtleClientService + bool receiveSearchRequest( + unsigned char* searchRequestData, uint32_t searchRequestDataLen, + unsigned char*& search_result_data, uint32_t& searchResultDataLen, + uint32_t& maxAllowsHits ) override; + + /// @see RsTurtleClientService + void receiveSearchResult( + TurtleSearchRequestId requestId, unsigned char* searchResultData, + uint32_t searchResultDataLen ) override; + virtual RsItem *create_item(uint16_t service,uint8_t item_type) const ; virtual RsServiceSerializer *serializer() { return this ; } @@ -148,7 +209,7 @@ public: /// @see RsFiles virtual bool turtleSearchRequest( const std::string& matchString, - const std::function& results)>& multiCallback, + const std::function& results)>& multiCallback, rstime_t maxWait = 300 ); virtual TurtleSearchRequestId turtleSearch(const std::string& string_to_match) ; @@ -337,13 +398,15 @@ private: std::map< TurtleRequestId, std::pair< - std::function& results)>, + std::function& results)>, std::chrono::system_clock::time_point > > mSearchCallbacksMap; RsMutex mSearchCallbacksMapMutex; /// Cleanup mSearchCallbacksMap void cleanTimedOutSearches(); + + RS_SET_CONTEXT_DEBUG_LEVEL(1) }; diff --git a/libretroshare/src/gxs/rsgxsnetservice.cc b/libretroshare/src/gxs/rsgxsnetservice.cc index 989ddf305..6a73ac092 100644 --- a/libretroshare/src/gxs/rsgxsnetservice.cc +++ b/libretroshare/src/gxs/rsgxsnetservice.cc @@ -257,8 +257,8 @@ #include "util/rsmemory.h" #include "util/stacktrace.h" -#ifdef RS_DEEP_SEARCH -# include "deep_search/deep_search.h" +#ifdef RS_DEEP_CHANNEL_INDEX +# include "deep_search/channelsindex.hpp" #endif /*** @@ -5148,13 +5148,13 @@ TurtleRequestId RsGxsNetService::turtleSearchRequest(const std::string& match_st return mGxsNetTunnel->turtleSearchRequest(match_string,this) ; } -#ifndef RS_DEEP_SEARCH +#ifndef RS_DEEP_CHANNEL_INDEX 
static bool termSearch(const std::string& src, const std::string& substring) { /* always ignore case */ return src.end() != std::search( src.begin(), src.end(), substring.begin(), substring.end(), RsRegularExpression::CompareCharIC() ); } -#endif // ndef RS_DEEP_SEARCH +#endif // ndef RS_DEEP_CHANNEL_INDEX bool RsGxsNetService::retrieveDistantSearchResults(TurtleRequestId req,std::map& group_infos) { @@ -5209,11 +5209,11 @@ void RsGxsNetService::receiveTurtleSearchResults( for (const RsGxsGroupSummary& gps : group_infos) { -#ifndef RS_DEEP_SEARCH +#ifndef RS_DEEP_CHANNEL_INDEX /* Only keep groups that are not locally known, and groups that are * not already in the mDistantSearchResults structure. */ if(grpMeta[gps.mGroupId]) continue; -#else // ndef RS_DEEP_SEARCH +#else // ndef RS_DEEP_CHANNEL_INDEX /* When deep search is enabled search results may bring more info * then we already have also about post that are indexed by xapian, * so we don't apply this filter in this case. */ @@ -5302,9 +5302,9 @@ bool RsGxsNetService::search( const std::string& substring, { group_infos.clear(); -#ifdef RS_DEEP_SEARCH - std::vector results; - DeepSearch::search(substring, results); +#ifdef RS_DEEP_CHANNEL_INDEX + std::vector results; + DeepChannelsIndex::search(substring, results); for(auto dsr : results) { @@ -5324,7 +5324,7 @@ bool RsGxsNetService::search( const std::string& substring, if((rit = uQ.find("name")) != uQ.end()) s.mGroupName = rit->second; if((rit = uQ.find("signFlags")) != uQ.end()) - s.mSignFlags = std::stoul(rit->second); + s.mSignFlags = static_cast(std::stoul(rit->second)); if((rit = uQ.find("publishTs")) != uQ.end()) s.mPublishTs = static_cast(std::stoll(rit->second)); if((rit = uQ.find("authorId")) != uQ.end()) @@ -5340,7 +5340,7 @@ bool RsGxsNetService::search( const std::string& substring, } } } -#else // RS_DEEP_SEARCH +#else // RS_DEEP_CHANNEL_INDEX RsGxsGrpMetaTemporaryMap grpMetaMap; { RS_STACK_MUTEX(mNxsMutex) ; @@ -5366,7 +5366,7 @@ bool 
RsGxsNetService::search( const std::string& substring, group_infos.push_back(s); } -#endif // RS_DEEP_SEARCH +#endif // RS_DEEP_CHANNEL_INDEX #ifdef NXS_NET_DEBUG_8 GXSNETDEBUG___ << " performing local substring search in response to distant request. Found " << group_infos.size() << " responses." << std::endl; diff --git a/libretroshare/src/gxs/rsgxsutil.cc b/libretroshare/src/gxs/rsgxsutil.cc index 14b93cda8..2396618a8 100644 --- a/libretroshare/src/gxs/rsgxsutil.cc +++ b/libretroshare/src/gxs/rsgxsutil.cc @@ -29,8 +29,8 @@ #include "pqi/pqihash.h" #include "gxs/rsgixs.h" -#ifdef RS_DEEP_SEARCH -# include "deep_search/deep_search.h" +#ifdef RS_DEEP_CHANNEL_INDEX +# include "deep_search/channelsindex.hpp" # include "services/p3gxschannels.h" # include "rsitems/rsgxschannelitems.h" #endif @@ -148,12 +148,12 @@ bool RsGxsMessageCleanUp::clean() RsGxsIntegrityCheck::RsGxsIntegrityCheck( RsGeneralDataService* const dataService, RsGenExchange* genex, RsSerialType& -#ifdef RS_DEEP_SEARCH +#ifdef RS_DEEP_CHANNEL_INDEX serializer #endif , RsGixs* gixs ) : mDs(dataService), mGenExchangeClient(genex), -#ifdef RS_DEEP_SEARCH +#ifdef RS_DEEP_CHANNEL_INDEX mSerializer(serializer), #endif mDone(false), mIntegrityMutex("integrity"), mGixs(gixs) {} @@ -168,7 +168,7 @@ void RsGxsIntegrityCheck::run() bool RsGxsIntegrityCheck::check() { -#ifdef RS_DEEP_SEARCH +#ifdef RS_DEEP_CHANNEL_INDEX bool isGxsChannels = mGenExchangeClient->serviceType() == RS_SERVICE_GXS_TYPE_CHANNELS; std::set indexedGroups; #endif @@ -221,7 +221,7 @@ bool RsGxsIntegrityCheck::check() } else msgIds.erase(msgIds.find(grp->grpId)); -#ifdef RS_DEEP_SEARCH +#ifdef RS_DEEP_CHANNEL_INDEX if( isGxsChannels && grp->metaData->mCircleType == GXS_CIRCLE_TYPE_PUBLIC && grp->metaData->mSubscribeFlags & GXS_SERV::GROUP_SUBSCRIBE_SUBSCRIBED ) @@ -241,7 +241,7 @@ bool RsGxsIntegrityCheck::check() cg.mMeta = meta; indexedGroups.insert(grp->grpId); - DeepSearch::indexChannelGroup(cg); + 
DeepChannelsIndex::indexChannelGroup(cg); } else { @@ -256,14 +256,15 @@ bool RsGxsIntegrityCheck::check() delete rIt; } -#endif +#endif // def RS_DEEP_CHANNEL_INDEX } else { grpsToDel.push_back(grp->grpId); -#ifdef RS_DEEP_SEARCH - if(isGxsChannels) DeepSearch::removeChannelFromIndex(grp->grpId); -#endif +#ifdef RS_DEEP_CHANNEL_INDEX + if(isGxsChannels) + DeepChannelsIndex::removeChannelFromIndex(grp->grpId); +#endif // def RS_DEEP_CHANNEL_INDEX } if( !(grp->metaData->mSubscribeFlags & GXS_SERV::GROUP_SUBSCRIBE_SUBSCRIBED) && @@ -320,10 +321,10 @@ bool RsGxsIntegrityCheck::check() if (nxsMsgIt == nxsMsgV.end()) { msgsToDel[grpId].insert(msgId); -#ifdef RS_DEEP_SEARCH +#ifdef RS_DEEP_CHANNEL_INDEX if(isGxsChannels) - DeepSearch::removeChannelPostFromIndex(grpId, msgId); -#endif + DeepChannelsIndex::removeChannelPostFromIndex(grpId, msgId); +#endif // def RS_DEEP_CHANNEL_INDEX } } } @@ -348,14 +349,15 @@ bool RsGxsIntegrityCheck::check() << " with wrong hash or null meta data. meta=" << (void*)msg->metaData << std::endl; msgsToDel[msg->grpId].insert(msg->msgId); -#ifdef RS_DEEP_SEARCH +#ifdef RS_DEEP_CHANNEL_INDEX if(isGxsChannels) - DeepSearch::removeChannelPostFromIndex(msg->grpId, msg->msgId); -#endif + DeepChannelsIndex::removeChannelPostFromIndex( + msg->grpId, msg->msgId ); +#endif // def RS_DEEP_CHANNEL_INDEX } else if (subscribed_groups.count(msg->metaData->mGroupId)) { -#ifdef RS_DEEP_SEARCH +#ifdef RS_DEEP_CHANNEL_INDEX if( isGxsChannels && indexedGroups.count(msg->metaData->mGroupId) ) { @@ -373,7 +375,7 @@ bool RsGxsIntegrityCheck::check() cgIt->toChannelPost(cg, false); cg.mMeta = meta; - DeepSearch::indexChannelPost(cg); + DeepChannelsIndex::indexChannelPost(cg); } else if(dynamic_cast(rIt)) {} else if(dynamic_cast(rIt)) {} @@ -391,7 +393,7 @@ bool RsGxsIntegrityCheck::check() delete rIt; } -#endif +#endif // def RS_DEEP_CHANNEL_INDEX if(!msg->metaData->mAuthorId.isNull()) { diff --git a/libretroshare/src/gxs/rsgxsutil.h 
b/libretroshare/src/gxs/rsgxsutil.h index c1a610a3d..5acbd1f0b 100644 --- a/libretroshare/src/gxs/rsgxsutil.h +++ b/libretroshare/src/gxs/rsgxsutil.h @@ -213,7 +213,7 @@ private: RsGeneralDataService* const mDs; RsGenExchange *mGenExchangeClient; -#ifdef RS_DEEP_SEARCH +#ifdef RS_DEEP_CHANNEL_INDEX RsSerialType& mSerializer; #endif bool mDone; diff --git a/libretroshare/src/libretroshare.pro b/libretroshare/src/libretroshare.pro index 39e4b20f7..017be8593 100644 --- a/libretroshare/src/libretroshare.pro +++ b/libretroshare/src/libretroshare.pro @@ -899,8 +899,24 @@ rs_jsonapi { SOURCES += jsonapi/jsonapi.cpp } -rs_deep_search { - HEADERS += deep_search/deep_search.h +rs_deep_channels_index { + HEADERS *= deep_search/commonutils.hpp + SOURCES *= deep_search/commonutils.cpp + + HEADERS += deep_search/channelsindex.hpp + SOURCES += deep_search/channelsindex.cpp +} + +rs_deep_files_index { + HEADERS *= deep_search/commonutils.hpp + SOURCES *= deep_search/commonutils.cpp + + HEADERS += deep_search/filesindex.hpp + SOURCES += deep_search/filesindex.cpp +} + +rs_deep_files_index_ogg { + HEADERS += deep_search/filesoggindexer.hpp } rs_broadcast_discovery { diff --git a/libretroshare/src/retroshare/rsfiles.h b/libretroshare/src/retroshare/rsfiles.h index 47c464052..daa83d3bc 100644 --- a/libretroshare/src/retroshare/rsfiles.h +++ b/libretroshare/src/retroshare/rsfiles.h @@ -202,6 +202,52 @@ struct BannedFileEntry : RsSerializable } }; +struct DeepFilesSearchResult; + +struct TurtleFileInfoV2 : RsSerializable +{ + TurtleFileInfoV2() : fSize(0), fWeight(0) {} + + TurtleFileInfoV2(const TurtleFileInfo& oldInfo) : + fSize(oldInfo.size), fHash(oldInfo.hash), fName(oldInfo.name), + fWeight(0) {} + +#ifdef RS_DEEP_FILES_INDEX + TurtleFileInfoV2(const DeepFilesSearchResult& dRes); +#endif // def RS_DEEP_FILES_INDEX + + uint64_t fSize; /// File size + RsFileHash fHash; /// File hash + std::string fName; /// File name + + /** @brief Xapian weight of the file which matched the search 
criteria + * This field is optional (its value is 0 when not specified). + * Given that Xapian weight for the same file is usually different on + * different nodes, it should not be used as an absolute reference, but just + * as a hint of how much the given file matches the search criteria. + */ + float fWeight; + + /** @brief Xapian snippet of the file which matched the search criteria + * This field is optional (its value is an empty string when not specified). + */ + std::string fSnippet; + + + /// @see RsSerializable::serial_process + void serial_process( RsGenericSerializer::SerializeJob j, + RsGenericSerializer::SerializeContext& ctx ) override + { + RS_SERIAL_PROCESS(fSize); + RS_SERIAL_PROCESS(fHash); + RS_SERIAL_PROCESS(fName); + RS_SERIAL_PROCESS(fWeight); + RS_SERIAL_PROCESS(fSnippet); + } + + ~TurtleFileInfoV2() override; +}; + class RsFiles { public: @@ -209,7 +255,7 @@ public: virtual ~RsFiles() {} /** - * @brief Provides file data for the gui, media streaming or rpc clients. + * @brief Provides file data for the GUI, media streaming or API clients. * It may return unverified chunks. This allows streaming without having to * wait for hashes or completion of the file. * This function returns an unspecified amount of bytes. Either as much data @@ -217,8 +263,8 @@ public: * To get more data, call this function repeatedly with different offsets. * * @jsonapi{development,manualwrapper} - * note the missing @ the wrapper for this is written manually not - * autogenerated @see JsonApiServer. + * note the wrapper for this is written manually not autogenerated + * @see JsonApiServer. * * @param[in] hash hash of the file. The file has to be available on this node * or it has to be in downloading state. @@ -356,7 +402,9 @@ public: /** * @brief Request remote files search * @jsonapi{development} - * @param[in] matchString string to look for in the search + * @param[in] matchString string to look for in the search.
If files deep + * indexing is enabled at compile time support advanced features described + * at https://xapian.org/docs/queryparser.html * @param multiCallback function that will be called each time a search * result is received * @param[in] maxWait maximum wait time in seconds for search results @@ -364,7 +412,7 @@ public: */ virtual bool turtleSearchRequest( const std::string& matchString, - const std::function& results)>& multiCallback, + const std::function& results)>& multiCallback, rstime_t maxWait = 300 ) = 0; virtual TurtleRequestId turtleSearch(const std::string& string_to_match) = 0; @@ -627,8 +675,19 @@ public: */ virtual bool removeSharedDirectory(std::string dir) = 0; - virtual bool getIgnoreLists(std::list& ignored_prefixes, std::list& ignored_suffixes,uint32_t& flags) =0; - virtual void setIgnoreLists(const std::list& ignored_prefixes, const std::list& ignored_suffixes,uint32_t flags) =0; + /** + * @brief Get list of ignored file name prefixes and suffixes + * @param[out] ignoredPrefixes storage for ignored prefixes + * @param[out] ignoredSuffixes storage for ignored suffixes + * @param flags RS_FILE_SHARE_FLAGS_IGNORE_* + * @return false if something failed, true otherwise + */ + virtual bool getIgnoreLists( + std::list& ignoredPrefixes, + std::list& ignoredSuffixes, + uint32_t& flags ) = 0; + + virtual void setIgnoreLists(const std::list& ignored_prefixes, const std::list& ignored_suffixes,uint32_t flags) =0; virtual void setWatchPeriod(int minutes) =0; virtual void setWatchEnabled(bool b) =0; diff --git a/libretroshare/src/retroshare/rsturtle.h b/libretroshare/src/retroshare/rsturtle.h index 6f5ac469c..76120e85b 100644 --- a/libretroshare/src/retroshare/rsturtle.h +++ b/libretroshare/src/retroshare/rsturtle.h @@ -44,12 +44,10 @@ extern RsTurtle* rsTurtle; typedef uint32_t TurtleRequestId ; typedef RsPeerId TurtleVirtualPeerId; -/** - * This is the structure used to send back results of the turtle search, - * to other peers, to the notifyBase
class, to the search caller or to the GUI. - */ struct TurtleFileInfo : RsSerializable { + TurtleFileInfo() : size(0) {} + uint64_t size; /// File size RsFileHash hash; /// File hash std::string name; /// File name @@ -65,7 +63,7 @@ struct TurtleFileInfo : RsSerializable RsTypeSerializer::serial_process( j, ctx, TLV_TYPE_STR_NAME, name, "name" ); } -}; +} RS_DEPRECATED_FOR(TurtleFileInfoV2); struct TurtleTunnelRequestDisplayInfo { @@ -120,10 +118,9 @@ public: virtual void setSessionEnabled(bool) = 0 ; virtual bool sessionEnabled() const = 0 ; - // Lauches a search request through the pipes, and immediately returns - // the request id, which will be further used by the gui to store results - // as they come back. - // + /** Launches a search request through the pipes, and immediately returns + * the request id, which will be further used by client services to + * handle results as they come back. */ virtual TurtleRequestId turtleSearch( unsigned char *search_bin_data, uint32_t search_bin_data_len, RsTurtleClientService* client_service ) = 0; diff --git a/libretroshare/src/serialiser/rsserializer.cc b/libretroshare/src/serialiser/rsserializer.cc index c1385fddf..71d7abea3 100644 --- a/libretroshare/src/serialiser/rsserializer.cc +++ b/libretroshare/src/serialiser/rsserializer.cc @@ -27,6 +27,7 @@ #include "serialiser/rsserializer.h" #include "serialiser/rstypeserializer.h" #include "util/stacktrace.h" +#include "util/rsdebug.h" const SerializationFlags RsGenericSerializer::SERIALIZATION_FLAG_NONE ( 0x0000 ); const SerializationFlags RsGenericSerializer::SERIALIZATION_FLAG_CONFIG ( 0x0001 ); @@ -36,6 +37,16 @@ const SerializationFlags RsGenericSerializer::SERIALIZATION_FLAG_YIELDING ( 0 RsItem *RsServiceSerializer::deserialise(void *data, uint32_t *size) { + if(!data || !size || !*size) + { + RsErr() << __PRETTY_FUNCTION__ << " Called with null paramethers data: " + << data << " size: " << static_cast(size) << " *size: " + << (size ?
*size : 0) << " this should never happen!" + << std::endl; + print_stacktrace(); + return nullptr; + } + if(mFlags & SERIALIZATION_FLAG_SKIP_HEADER) { std::cerr << "(EE) Cannot deserialise item with flags SERIALIZATION_FLAG_SKIP_HEADER. Check your code!" << std::endl; diff --git a/libretroshare/src/services/p3gxschannels.cc b/libretroshare/src/services/p3gxschannels.cc index 753430fca..70e0763bf 100644 --- a/libretroshare/src/services/p3gxschannels.cc +++ b/libretroshare/src/services/p3gxschannels.cc @@ -44,9 +44,9 @@ #include "util/rsrandom.h" #include "util/rsstring.h" -#ifdef RS_DEEP_SEARCH -# include "deep_search/deep_search.h" -#endif // RS_DEEP_SEARCH +#ifdef RS_DEEP_CHANNEL_INDEX +# include "deep_search/channelsindex.hpp" +#endif // RS_DEEP_CHANNEL_INDEX /**** @@ -1149,9 +1149,9 @@ bool p3GxsChannels::createChannelV2( channelId = channel.mMeta.mGroupId; -#ifdef RS_DEEP_SEARCH - DeepSearch::indexChannelGroup(channel); -#endif // RS_DEEP_SEARCH +#ifdef RS_DEEP_CHANNEL_INDEX + DeepChannelsIndex::indexChannelGroup(channel); +#endif // RS_DEEP_CHANNEL_INDEX return true; } @@ -1180,9 +1180,9 @@ bool p3GxsChannels::createChannel(RsGxsChannelGroup& channel) return false; } -#ifdef RS_DEEP_SEARCH - DeepSearch::indexChannelGroup(channel); -#endif // RS_DEEP_SEARCH +#ifdef RS_DEEP_CHANNEL_INDEX + DeepChannelsIndex::indexChannelGroup(channel); +#endif // RS_DEEP_CHANNEL_INDEX return true; } @@ -1333,9 +1333,9 @@ bool p3GxsChannels::editChannel(RsGxsChannelGroup& channel) return false; } -#ifdef RS_DEEP_SEARCH - DeepSearch::indexChannelGroup(channel); -#endif // RS_DEEP_SEARCH +#ifdef RS_DEEP_CHANNEL_INDEX + DeepChannelsIndex::indexChannelGroup(channel); +#endif // RS_DEEP_CHANNEL_INDEX return true; } @@ -1401,9 +1401,9 @@ bool p3GxsChannels::createPostV2( if(RsGenExchange::getPublishedMsgMeta(token,post.mMeta)) { -#ifdef RS_DEEP_SEARCH - DeepSearch::indexChannelPost(post); -#endif // RS_DEEP_SEARCH +#ifdef RS_DEEP_CHANNEL_INDEX + 
DeepChannelsIndex::indexChannelPost(post); +#endif // RS_DEEP_CHANNEL_INDEX postId = post.mMeta.mMsgId; return true; @@ -1787,9 +1787,9 @@ bool p3GxsChannels::createPost(RsGxsChannelPost& post) if(RsGenExchange::getPublishedMsgMeta(token,post.mMeta)) { -#ifdef RS_DEEP_SEARCH - DeepSearch::indexChannelPost(post); -#endif // RS_DEEP_SEARCH +#ifdef RS_DEEP_CHANNEL_INDEX + DeepChannelsIndex::indexChannelPost(post); +#endif // RS_DEEP_CHANNEL_INDEX return true; } diff --git a/libretroshare/src/turtle/p3turtle.cc b/libretroshare/src/turtle/p3turtle.cc index f1e96a8a4..328b04772 100644 --- a/libretroshare/src/turtle/p3turtle.cc +++ b/libretroshare/src/turtle/p3turtle.cc @@ -865,6 +865,8 @@ int p3turtle::handleIncoming() // void p3turtle::handleSearchRequest(RsTurtleSearchRequestItem *item) { + Dbg3() << __PRETTY_FUNCTION__ << " " << *item << std::endl; + // take a look at the item and test against inconsistent values // - If the item destimation is @@ -877,11 +879,12 @@ void p3turtle::handleSearchRequest(RsTurtleSearchRequestItem *item) if(item_size > TURTLE_MAX_SEARCH_REQ_ACCEPTED_SERIAL_SIZE) { -#ifdef P3TURTLE_DEBUG - std::cerr << " Dropping, because the serial size exceeds the accepted limit." << std::endl ; -#endif - std::cerr << " Caught a turtle search item with arbitrary large size from " << item->PeerId() << " of size " << item_size << " and depth " << item->depth << ". This is not allowed => dropping." << std::endl; - return ; + RsWarn() << __PRETTY_FUNCTION__ + << " Got a turtle search item with arbitrary large size from " + << item->PeerId() << " of size " << item_size << " and depth " + << item->depth << ". This is not allowed => dropping." + << std::endl; + return; } { @@ -889,22 +892,20 @@ void p3turtle::handleSearchRequest(RsTurtleSearchRequestItem *item) if(_search_requests_origins.size() > MAX_ALLOWED_SR_IN_CACHE) { -#ifdef P3TURTLE_DEBUG - std::cerr << " Dropping, because the search request cache is full." 
<< std::endl ; -#endif - std::cerr << " More than " << MAX_ALLOWED_SR_IN_CACHE << " search request in cache. A peer is probably trying to flood your network See the depth charts to find him." << std::endl; - return ; + RsWarn() << __PRETTY_FUNCTION__ << " More than " + << MAX_ALLOWED_SR_IN_CACHE << " search request in cache. " + << "A peer is probably trying to flood your network See " + "the depth charts to find him." << std::endl; + return; } - // If the item contains an already handled search request, give up. This - // happens when the same search request gets relayed by different peers - // - if(_search_requests_origins.find(item->request_id) != _search_requests_origins.end()) + if( _search_requests_origins.find(item->request_id) != + _search_requests_origins.end() ) { -#ifdef P3TURTLE_DEBUG - std::cerr << " This is a bouncing request. Ignoring and deleting it." << std::endl ; -#endif - return ; + /* If the item contains an already handled search request, give up. + * This happens when the same search request gets relayed by + * different peers */ + return; } } @@ -1013,13 +1014,21 @@ void p3turtle::handleSearchRequest(RsTurtleSearchRequestItem *item) // This function should be removed in the future, when file search will also use generic search items. 
-void p3turtle::performLocalSearch(RsTurtleSearchRequestItem *item,uint32_t& req_result_count,std::list& search_results,uint32_t& max_allowed_hits) +void p3turtle::performLocalSearch( + RsTurtleSearchRequestItem *item, uint32_t& req_result_count, + std::list& search_results, + uint32_t& max_allowed_hits ) { - RsTurtleFileSearchRequestItem *ftsearch = dynamic_cast(item) ; + Dbg3() << __PRETTY_FUNCTION__ << " " << item << std::endl; + + RsTurtleFileSearchRequestItem* ftsearch = + dynamic_cast(item); if(ftsearch != NULL) { - performLocalSearch_files(ftsearch,req_result_count,search_results,max_allowed_hits) ; + performLocalSearch_files( + ftsearch, req_result_count, search_results, + max_allowed_hits ); return ; } @@ -1060,12 +1069,13 @@ void p3turtle::performLocalSearch_generic(RsTurtleGenericSearchRequestItem *item } } -void p3turtle::performLocalSearch_files(RsTurtleFileSearchRequestItem *item,uint32_t& req_result_count,std::list& result,uint32_t& max_allowed_hits) +void p3turtle::performLocalSearch_files( + RsTurtleFileSearchRequestItem *item, uint32_t& req_result_count, + std::list& result, + uint32_t& max_allowed_hits ) { -#ifdef P3TURTLE_DEBUG - std::cerr << "Performing rsFiles->search()" << std::endl ; -#endif - // now, search! + Dbg3() << __PRETTY_FUNCTION__ << " " << *item << std::endl; + std::list initialResults ; item->search(initialResults) ; @@ -1104,6 +1114,9 @@ void p3turtle::performLocalSearch_files(RsTurtleFileSearchRequestItem *item,uint res_item = NULL ; // forces creation of a new item. } } + + Dbg3() << __PRETTY_FUNCTION__ << " found " << req_result_count << " results" + << std::endl; } void p3turtle::handleSearchResult(RsTurtleSearchResultItem *item) diff --git a/libretroshare/src/turtle/p3turtle.h b/libretroshare/src/turtle/p3turtle.h index a80025c28..6c0b0012d 100644 --- a/libretroshare/src/turtle/p3turtle.h +++ b/libretroshare/src/turtle/p3turtle.h @@ -19,6 +19,7 @@ * along with this program. If not, see . 
* * * *******************************************************************************/ +#pragma once //====================================== General setup of the router ===================================// // @@ -130,10 +131,6 @@ // - should tunnels be re-used ? nope. The only useful case would be when two peers are exchanging files, which happens quite rarely. // - -#ifndef MRK_PQI_TURTLE_H -#define MRK_PQI_TURTLE_H - #include #include #include @@ -464,6 +461,8 @@ class p3turtle: public p3Service, public RsTurtle, public p3Config uint32_t _service_type ; + RS_SET_CONTEXT_DEBUG_LEVEL(1) + #ifdef P3TURTLE_DEBUG // debug function void dumpState() ; @@ -472,5 +471,3 @@ class p3turtle: public p3Service, public RsTurtle, public p3Config void TS_dumpState(); #endif }; - -#endif diff --git a/libretroshare/src/use_libretroshare.pri b/libretroshare/src/use_libretroshare.pri index 9a74d209c..7f70c0185 100644 --- a/libretroshare/src/use_libretroshare.pri +++ b/libretroshare/src/use_libretroshare.pri @@ -68,11 +68,15 @@ linux-* { mLibs += dl } -rs_deep_search { +rs_deep_channels_index | rs_deep_files_index { mLibs += xapian win32-g++:mLibs += rpcrt4 } +rs_deep_files_index_ogg { + mLibs += vorbisfile +} + rs_broadcast_discovery { no_rs_cross_compiling { UDP_DISCOVERY_SRC_PATH=$$clean_path($${RS_SRC_PATH}/supportlibs/udp-discovery-cpp/) diff --git a/retroshare.pri b/retroshare.pri index dcb9dc4cb..9ba5f0dcc 100644 --- a/retroshare.pri +++ b/retroshare.pri @@ -165,10 +165,20 @@ rs_macos10.14:CONFIG -= rs_macos10.11 CONFIG *= no_rs_jsonapi rs_jsonapi:CONFIG -= no_rs_jsonapi -# To enable deep search append the following assignation to qmake command line -# CONFIG *= rs_deep_search -CONFIG *= no_rs_deep_search -rs_deep_search:CONFIG -= no_rs_deep_search +# To enable channel indexing append the following assignation to qmake command +# line "CONFIG+=rs_deep_channel_index" +CONFIG *= no_rs_deep_channel_index +rs_deep_channel_index:CONFIG -= no_rs_deep_channel_index + +# To enable 
file indexing append the following assignation to qmake command +# line "CONFIG+=rs_deep_files_index" +CONFIG *= no_rs_deep_files_index +rs_deep_files_index:CONFIG -= no_rs_deep_files_index + +# To enable Ogg file indexing append the following assignation to qmake command +# line "CONFIG+=rs_deep_files_index_ogg" +CONFIG *= no_rs_deep_files_index_ogg +rs_deep_files_index_ogg::CONFIG -= no_rs_deep_files_index_ogg # To enable native dialogs append the following assignation to qmake command # line "CONFIG+=rs_use_native_dialogs" @@ -564,15 +574,10 @@ retroshare_qml_app { warning("QMAKE: you have enabled retroshare_qml_app which is deprecated") } -rs_deep_search { - DEFINES *= RS_DEEP_SEARCH +rs_deep_channels_index:DEFINES *= RS_DEEP_CHANNEL_INDEX - linux { - exists("/usr/include/xapian-1.3") { - INCLUDEPATH += /usr/include/xapian-1.3 - } - } -} +rs_deep_files_index:DEFINES *= RS_DEEP_FILES_INDEX +rs_deep_files_index_ogg:DEFINES *= RS_DEEP_FILES_INDEX_OGG rs_use_native_dialogs:DEFINES *= RS_NATIVEDIALOGS