RetroShare/libretroshare/src/deep_search/commonutils.cpp
2021-10-28 09:53:55 +02:00

220 lines
6.3 KiB
C++

/*******************************************************************************
* RetroShare full text indexing and search implementation based on Xapian *
* *
* Copyright (C) 2018-2021 Gioacchino Mazzurco <gio@eigenlab.org> *
* Copyright (C) 2019-2021 Asociación Civil Altermundi <info@altermundi.net> *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Affero General Public License version 3 as *
* published by the Free Software Foundation. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Affero General Public License for more details. *
* *
* You should have received a copy of the GNU Affero General Public License *
* along with this program. If not, see <https://www.gnu.org/licenses/>. *
* *
*******************************************************************************/
#include <algorithm>
#include <thread>
#include "deep_search/commonutils.hpp"
#include "util/stacktrace.h"
#include "util/rsthreads.h"
#include "util/rsdebuglevel0.h"
#ifndef XAPIAN_AT_LEAST
/// Added in Xapian 1.4.2.
/// Fallback used only when the macro is not already defined: evaluates to
/// true when the version described by XAPIAN_MAJOR_VERSION,
/// XAPIAN_MINOR_VERSION and XAPIAN_REVISION is greater than or equal to A.B.C
#define XAPIAN_AT_LEAST(A,B,C) \
(XAPIAN_MAJOR_VERSION > (A) || \
(XAPIAN_MAJOR_VERSION == (A) && \
(XAPIAN_MINOR_VERSION > (B) || \
(XAPIAN_MINOR_VERSION == (B) && XAPIAN_REVISION >= (C)))))
#endif
namespace DeepSearch
{
std::unique_ptr<Xapian::Database> openReadOnlyDatabase(
const std::string& path, int flags )
{
try
{
#if XAPIAN_AT_LEAST(1,3,2)
std::unique_ptr<Xapian::Database> dbPtr(
new Xapian::Database(path, flags) );
#else
std::unique_ptr<Xapian::Database> dbPtr(new Xapian::Database(path));
if(flags)
{
RS_WARN( "Xapian DB flags: ", flags, " ignored due to old Xapian "
"library version: ", XAPIAN_VERSION, " < 1.3.2" );
}
#endif
return dbPtr;
}
catch(Xapian::DatabaseOpeningError& e)
{
RsWarn() << __PRETTY_FUNCTION__ << " " << e.get_msg()
<< ", probably nothing has been indexed yet." << std::endl;
}
catch(Xapian::DatabaseLockError&)
{
RsErr() << __PRETTY_FUNCTION__ << " Failed aquiring Xapian DB lock "
<< path << std::endl;
print_stacktrace();
}
catch(...)
{
RsErr() << __PRETTY_FUNCTION__ << " Xapian DB is apparently corrupted "
<< "deleting it might help without causing any harm: "
<< path << std::endl;
print_stacktrace();
}
return nullptr;
}
/**
 * @brief Format a timestamp as an UTC calendar date in the "YYYYMMDD" form.
 * @param time timestamp to convert, it is cast to time_t before conversion
 * @return the eight characters date, or an empty string if the timestamp
 *	cannot be converted to a calendar date or does not fit the format (as an
 *	example a year with more than four digits)
 */
std::string timetToXapianDate(const rstime_t& time)
{
	const time_t tTime = static_cast<time_t>(time);

	/* std::gmtime returns nullptr when the conversion fails, passing that to
	 * std::strftime would be undefined behaviour.
	 * NOTE(review): std::gmtime may return a pointer to shared static storage,
	 * so concurrent callers might race on it -- TODO confirm if that matters
	 * here. */
	const std::tm* const utcTime = std::gmtime(&tTime);
	if(!utcTime) return std::string();

	// Eight date characters plus the terminating null character
	char date[sizeof("YYYYMMDD")] = {};

	/* std::strftime returns 0 when the result does not fit the buffer, in
	 * that case the buffer content is unspecified and must not be returned */
	const auto written = std::strftime(date, sizeof(date), "%Y%m%d", utcTime);
	if(written == 0) return std::string();

	return std::string(date, written);
}
/**
 * Attempt one last flush of the pending write operations, passing 0 as
 * accepted delay. If the flush reports an error, log how many operations were
 * still queued together with the error, then print a stack trace.
 */
StubbornWriteOpQueue::~StubbornWriteOpQueue()
{
	std::error_condition flushStatus = flush(0);

	// Nothing to report if the flush went fine
	if(!flushStatus) return;

	RS_FATAL( "Flush failed on destruction ", mOpStore.size(),
	          " operations irreparably lost ", flushStatus );
	print_stacktrace();
}
/**
 * @brief Enqueue a write operation, then try to flush the queue.
 * @param op operation appended to the pending operations queue
 */
void StubbornWriteOpQueue::push(write_op op)
{
	RS_DBG4("");

	{
		// Hold the queue mutex only while appending, flush() locks it again
		std::lock_guard<std::mutex> queueGuard(mQueueMutex);
		mOpStore.push(op);
	}

	flush();
}
/**
 * @brief Apply all pending write operations to the Xapian database.
 *
 * Returns immediately if the queue is empty. Otherwise the database at mDbPath
 * is opened (created if missing) and every queued operation is executed on it,
 * in order, while holding the queue mutex.
 *
 * If the database write lock cannot be acquired and acceptDelay is not zero, a
 * new attempt is scheduled asynchronously as long as the deadline
 * callTS + acceptDelay has not passed yet.
 *
 * @param acceptDelay seconds after callTS during which retrying is acceptable,
 *	0 disables retrying
 * @param callTS timestamp of the original call, kept unchanged across retries
 * @return
 *	- success (empty error condition) if the queue was empty, if all the
 *	  operations were executed, or if a retry has been scheduled
 *	- std::errc::timed_out if the lock was not available and the deadline
 *	  has passed
 *	- std::errc::resource_unavailable_try_again if the lock was not available
 *	  and acceptDelay is 0
 *	- std::errc::io_error if opening the database failed for any other reason
 */
std::error_condition StubbornWriteOpQueue::flush(
        rstime_t acceptDelay, rstime_t callTS )
{
	RS_DBG4("");

	{
		// Return without attempt to open the database if the queue is empty
		std::unique_lock<std::mutex> lock(mQueueMutex);
		if(mOpStore.empty()) return std::error_condition();
	}

	std::unique_ptr<Xapian::WritableDatabase> dbPtr;
	try
	{
		dbPtr = std::make_unique<Xapian::WritableDatabase>(
		            mDbPath, Xapian::DB_CREATE_OR_OPEN );
	}
	catch(const Xapian::DatabaseLockError&)
	{
		// The caller doesn't accept any delay, so no retry is scheduled
		if(!acceptDelay) return std::errc::resource_unavailable_try_again;

		const rstime_t tNow = time(nullptr);

		/* Seconds left before the deadline. It must be deadline minus now,
		 * the opposite subtraction is negative until the deadline has passed,
		 * which made the first attempts time out instead of retrying. */
		const rstime_t maxRemaining = (callTS + acceptDelay) - tNow;
		if(maxRemaining <= 0)
		{
			RS_ERR(std::errc::timed_out, acceptDelay, callTS, tNow);
			return std::errc::timed_out;
		}

		// Wait a fifth of the remaining time, but never less than 50ms
		std::chrono::milliseconds interval(
		            std::max(rstime_t(50), maxRemaining*1000/5) );
		RS_DBG3( "Cannot acquire database write lock, retrying in:",
		         interval.count(), "ms" );

		/* NOTE(review): the retry task captures this, so it assumes the queue
		 * is still alive when the task runs -- TODO confirm */
		RsThread::async([this, acceptDelay, callTS, interval]()
		{
			std::this_thread::sleep_for(interval);
			flush(acceptDelay, callTS);
		});

		// The outcome of the scheduled retry is not reported to this caller
		return std::error_condition();
	}
	catch(...)
	{
		RS_ERR("Xapian DB ", mDbPath, " is apparently corrupted");
		print_stacktrace();
		return std::errc::io_error;
	}

	// Operations are removed from the queue before being executed
	std::unique_lock<std::mutex> lock(mQueueMutex);
	while(!mOpStore.empty())
	{
		auto op = mOpStore.front(); mOpStore.pop();
		op(*dbPtr);
	}
	return std::error_condition();
}
/**
 * @brief Extract the text content from a simple HTML document.
 *
 * The input is returned unchanged if it is empty, if it doesn't both start
 * with '<' and end with '>', or if it has no complete "<body" opening tag.
 * Otherwise everything up to and including the body opening tag is dropped,
 * the first <style>...</style> element found is removed together with its
 * content, and finally every remaining "<...>" tag is removed while the text
 * around the tags is kept as is.
 *
 * A '>' that doesn't close a tag is plain text and is kept. If a '<' is never
 * followed by a '>', it and everything after it are kept verbatim.
 *
 * @param rsHtmlDoc document to process
 * @return the extracted text
 */
std::string simpleTextHtmlExtract(const std::string& rsHtmlDoc)
{
	const auto npos = std::string::npos;

	if(rsHtmlDoc.empty()) return rsHtmlDoc;

	// Not wrapped into a tag at both ends: treat it as a plain text message
	if(rsHtmlDoc.front() != '<' || rsHtmlDoc.back() != '>') return rsHtmlDoc;

	const auto bodyTagBegin = rsHtmlDoc.find("<body");
	if(bodyTagBegin == npos) return rsHtmlDoc;

	const auto bodyTagEnd = rsHtmlDoc.find('>', bodyTagBegin);
	if(bodyTagEnd == npos) return rsHtmlDoc;

	std::string body(rsHtmlDoc.substr(bodyTagEnd + 1));

	// strip also CSS inside <style></style>
	const auto styleTagBegin = body.find("<style");
	if(styleTagBegin != npos)
	{
		const std::string styleClose("</style>");
		const auto styleEnd = body.find(styleClose, styleTagBegin);
		if(styleEnd != npos)
			body.erase( styleTagBegin,
			            styleEnd + styleClose.size() - styleTagBegin );
	}

	/* Copy the text between tags in a single forward pass. The closing '>' is
	 * searched starting from the opening '<', so a literal '>' in the text is
	 * never mistaken for the end of a later tag. The cursor always advances,
	 * so the loop ends for any input without needing an iteration limit. */
	std::string retVal;
	retVal.reserve(body.size());

	std::string::size_type cursor = 0;
	while(cursor < body.size())
	{
		const auto oPos = body.find('<', cursor);
		const auto cPos = oPos == npos ? npos : body.find('>', oPos);
		if(cPos == npos)
		{
			// No complete tag left: keep whatever remains untouched
			retVal.append(body, cursor, npos);
			break;
		}

		retVal.append(body, cursor, oPos - cursor);
		cursor = cPos + 1;
	}

	return retVal;
}
}