mirror of
https://github.com/RetroShare/RetroShare.git
synced 2025-01-13 08:29:32 -05:00
1562 lines
43 KiB
C++
1562 lines
43 KiB
C++
/*******************************************************************************
|
|
* plugins/FeedReader/services/p3FeedReaderThread.cc *
|
|
* *
|
|
* Copyright (C) 2012 by Thunder <retroshare.project@gmail.com> *
|
|
* *
|
|
* This program is free software: you can redistribute it and/or modify *
|
|
* it under the terms of the GNU Affero General Public License as *
|
|
* published by the Free Software Foundation, either version 3 of the *
|
|
* License, or (at your option) any later version. *
|
|
* *
|
|
* This program is distributed in the hope that it will be useful, *
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
|
|
* GNU Affero General Public License for more details. *
|
|
* *
|
|
* You should have received a copy of the GNU Affero General Public License *
|
|
* along with this program. If not, see <https://www.gnu.org/licenses/>. *
|
|
* *
|
|
*******************************************************************************/
|
|
|
|
#include "p3FeedReaderThread.h"
|
|
#include "rsFeedReaderItems.h"
|
|
#include "util/rsstring.h"
|
|
#include "util/rstime.h"
|
|
#include "util/CURLWrapper.h"
|
|
#include "util/XMLWrapper.h"
|
|
#include "util/HTMLWrapper.h"
|
|
#include "util/XPathWrapper.h"
|
|
|
|
#include <openssl/evp.h>
|
|
#include <unistd.h> // for usleep
|
|
|
|
/* Feed flavours distinguished while processing a downloaded document */
enum FeedFormat {
	FORMAT_RSS,
	FORMAT_RDF,
	FORMAT_ATOM
};
|
|
|
|
/*********
|
|
* #define FEEDREADER_DEBUG
|
|
*********/
|
|
|
|
/**
 * Creates a feed reader worker thread.
 *
 * @param feedReader  feed reader the thread fetches work from and reports to (kept in mFeedReader)
 * @param type        kind of work done on every tick (kept in mType)
 * @param feedId      id handed to the feed reader when asking for a feed (kept in mFeedId)
 */
p3FeedReaderThread::p3FeedReaderThread(p3FeedReader *feedReader, Type type, uint32_t feedId)
	: RsTickingThread()
	, mFeedReader(feedReader)
	, mType(type)
	, mFeedId(feedId)
{
	/* nothing else to set up */
}
|
|
|
|
/** Destructor; the thread object has nothing of its own to release. */
p3FeedReaderThread::~p3FeedReaderThread()
{
	/* intentionally empty */
}
|
|
|
|
/***************************************************************************/
|
|
/****************************** Thread *************************************/
|
|
/***************************************************************************/
|
|
|
|
void p3FeedReaderThread::threadTick()
|
|
{
|
|
rstime::rs_usleep(1000000);
|
|
|
|
/* every second */
|
|
|
|
switch (mType) {
|
|
case DOWNLOAD:
|
|
{
|
|
RsFeedReaderFeed feed;
|
|
if (mFeedReader->getFeedToDownload(feed, mFeedId)) {
|
|
std::string content;
|
|
std::string icon;
|
|
std::string errorString;
|
|
|
|
RsFeedReaderErrorState result = download(feed, content, icon, errorString);
|
|
if (result == RS_FEED_ERRORSTATE_OK) {
|
|
/* trim */
|
|
XMLWrapper::trimString(content);
|
|
|
|
mFeedReader->onDownloadSuccess(feed.feedId, content, icon);
|
|
} else {
|
|
mFeedReader->onDownloadError(feed.feedId, result, errorString);
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case PROCESS:
|
|
{
|
|
RsFeedReaderFeed feed;
|
|
if (mFeedReader->getFeedToProcess(feed, mFeedId)) {
|
|
std::list<RsFeedReaderMsg*> msgs;
|
|
std::string errorString;
|
|
std::list<RsFeedReaderMsg*>::iterator it;
|
|
|
|
RsFeedReaderErrorState result = process(feed, msgs, errorString);
|
|
if (result == RS_FEED_ERRORSTATE_OK) {
|
|
/* first, filter the messages */
|
|
mFeedReader->onProcessSuccess_filterMsg(feed.feedId, msgs);
|
|
if (isRunning()) {
|
|
/* second, process the descriptions */
|
|
for (it = msgs.begin(); it != msgs.end(); ) {
|
|
if (!isRunning()) {
|
|
break;
|
|
}
|
|
|
|
RsFeedReaderMsg *mi = *it;
|
|
result = processMsg(feed, mi, errorString);
|
|
if (result != RS_FEED_ERRORSTATE_OK) {
|
|
break;
|
|
}
|
|
|
|
if (feed.preview) {
|
|
/* add every message */
|
|
it = msgs.erase(it);
|
|
|
|
std::list<RsFeedReaderMsg*> msgSingle;
|
|
msgSingle.push_back(mi);
|
|
mFeedReader->onProcessSuccess_addMsgs(feed.feedId, msgSingle, true);
|
|
|
|
/* delete not accepted message */
|
|
std::list<RsFeedReaderMsg*>::iterator it1;
|
|
for (it1 = msgSingle.begin(); it1 != msgSingle.end(); ++it1) {
|
|
delete (*it1);
|
|
}
|
|
} else {
|
|
result = processTransformation(feed, mi, errorString);
|
|
if (result != RS_FEED_ERRORSTATE_OK) {
|
|
break;
|
|
}
|
|
++it;
|
|
}
|
|
}
|
|
if (isRunning()) {
|
|
if (result == RS_FEED_ERRORSTATE_OK) {
|
|
/* third, add messages */
|
|
mFeedReader->onProcessSuccess_addMsgs(feed.feedId, msgs, false);
|
|
} else {
|
|
mFeedReader->onProcessError(feed.feedId, result, errorString);
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
mFeedReader->onProcessError(feed.feedId, result, errorString);
|
|
}
|
|
|
|
/* delete not accepted messages */
|
|
for (it = msgs.begin(); it != msgs.end(); ++it) {
|
|
delete (*it);
|
|
}
|
|
msgs.clear();
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
/***************************************************************************/
|
|
/****************************** Download ***********************************/
|
|
/***************************************************************************/
|
|
|
|
/**
 * Checks whether a Content-Type value starts with the given media type.
 * The comparison ignores case and only looks at the first strlen(type)
 * characters, so parameters such as "; charset=..." do not matter.
 *
 * @param contentType  value to test
 * @param type         expected prefix, e.g. "text/html" or "image/"
 * @return true when contentType begins with type
 */
static bool isContentType(const std::string &contentType, const char *type)
{
	const size_t prefixLength = strlen(type);
	const int difference = strncasecmp(contentType.c_str(), type, prefixLength);

	return difference == 0;
}
|
|
|
|
static bool toBase64(const std::vector<unsigned char> &data, std::string &base64)
|
|
{
|
|
bool result = false;
|
|
|
|
/* Set up a base64 encoding BIO that writes to a memory BIO */
|
|
BIO *b64 = BIO_new(BIO_f_base64());
|
|
if (b64) {
|
|
BIO_set_flags(b64, BIO_FLAGS_BASE64_NO_NL);
|
|
BIO *bmem = BIO_new(BIO_s_mem());
|
|
if (bmem) {
|
|
BIO_set_flags(bmem, BIO_CLOSE); // probably redundant
|
|
b64 = BIO_push(b64, bmem);
|
|
/* Send the data */
|
|
BIO_write(b64, data.data(), data.size());
|
|
/* Collect the encoded data */
|
|
BIO_flush(b64);
|
|
char* temp;
|
|
int count = BIO_get_mem_data(bmem, &temp);
|
|
if (count && temp) {
|
|
base64.assign(temp, count);
|
|
result = true;
|
|
}
|
|
}
|
|
BIO_free_all(b64);
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/**
 * Returns the base of a link, i.e. the link up to and including the last '/'
 * of its path. Relative links found in a page are appended to this base.
 *
 * - Query and fragment ("?..." / "#...") are removed first, a '/' inside them
 *   is not a path separator.
 * - A link that consists of scheme and host only ("http://host") gets a
 *   trailing '/' instead of being cut down to "http://".
 * - A link without any '/' is returned as it is (minus query/fragment).
 *
 * @param link  absolute link of the page
 * @return base link
 */
static std::string getBaseLink(std::string link)
{
	const std::string::size_type pathEnd = link.find_first_of("?#");
	if (pathEnd != std::string::npos) {
		link.erase(pathEnd);
	}

	const std::string::size_type lastSlash = link.rfind('/');
	if (lastSlash == std::string::npos || lastSlash + 1 == link.size()) {
		/* no path separator at all or already a directory */
		return link;
	}

	const std::string::size_type scheme = link.find("://");
	if (scheme != std::string::npos && lastSlash <= scheme + 2) {
		/* the last '/' belongs to "://", the link has a host but no path */
		link += '/';
		return link;
	}

	link.erase(lastSlash + 1);

	return link;
}
|
|
|
|
/**
 * Builds an absolute link from a base link and a (possibly relative) link.
 *
 * - A link starting with "http://" or "https://" is absolute and returned as it is.
 * - A base link without scheme gets "http://" prepended.
 * - An empty link yields the base link.
 * - A link starting with '/' replaces the whole path of the base link.
 * - Any other link is appended to the base link, separated by exactly one '/'.
 *
 * @param baseLink  link the relative link refers to
 * @param link      link to resolve
 * @return absolute link
 */
static std::string calculateLink(const std::string &baseLink, const std::string &link)
{
	static const char HTTP[] = "http://";
	static const char HTTPS[] = "https://";

	if (link.compare(0, 7, HTTP) == 0 || link.compare(0, 8, HTTPS) == 0) {
		/* absolute link */
		return link;
	}

	/* calculate link of base link */
	std::string resultLink = baseLink;

	/* position of the host name behind the scheme */
	std::string::size_type hostStart = 7;
	if (resultLink.compare(0, 8, HTTPS) == 0) {
		hostStart = 8;
	} else if (resultLink.compare(0, 7, HTTP) != 0) {
		/* link should begin with a scheme */
		resultLink.insert(0, HTTP);
	}

	if (link.empty()) {
		/* no link */
		return resultLink;
	}

	if (link[0] == '/') {
		/* link begins with "/", keep scheme and host of the base link only */
		const std::string::size_type pathStart = resultLink.find('/', hostStart);
		if (pathStart != std::string::npos) {
			resultLink.erase(pathStart);
		}
	} else if (resultLink[resultLink.size() - 1] != '/') {
		/* resultLink is never empty here, it starts at least with the scheme */
		resultLink += '/';
	}

	resultLink += link;

	return resultLink;
}
|
|
|
|
/**
 * Downloads "/favicon.ico" from the host of the given url and returns it
 * base64 encoded.
 *
 * The icon is only accepted when the server answers with 200 and one of the
 * content types "image/x-icon", "image/vnd.microsoft.icon" (the type
 * registered for ICO files), "application/octet-stream" or "text/plain".
 *
 * @param CURL  wrapper used for the download
 * @param url   url of the feed, only scheme and host are used
 * @param icon  cleared first, receives the base64 encoded icon on success
 * @return true when an icon was downloaded and encoded
 */
static bool getFavicon(CURLWrapper &CURL, const std::string &url, std::string &icon)
{
	icon.clear();

	std::vector<unsigned char> vicon;
	const CURLcode code = CURL.downloadBinary(calculateLink(url, "/favicon.ico"), vicon);
	if (code != CURLE_OK || CURL.responseCode() != 200) {
		return false;
	}

	const std::string contentType = CURL.contentType();
	const bool isIcon = isContentType(contentType, "image/x-icon") ||
	                    isContentType(contentType, "image/vnd.microsoft.icon") ||
	                    isContentType(contentType, "application/octet-stream") ||
	                    isContentType(contentType, "text/plain");
	if (!isIcon || vicon.empty()) {
		return false;
	}

#warning p3FeedReaderThread.cc TODO thunder2: check it
	return toBase64(vicon, icon);
}
|
|
|
|
/**
 * Downloads the document of a feed and, when the server could be reached,
 * tries to fetch the favicon of the site as well.
 *
 * @param feed         feed to download (url and proxy settings are used)
 * @param content      cleared first, receives the downloaded text
 * @param icon         receives the base64 encoded favicon, empty when there is none
 * @param errorString  cleared first, receives details for some error states
 *                     (content type, response code or curl error text)
 * @return RS_FEED_ERRORSTATE_OK for a 200 response with an accepted content
 *         type, otherwise the matching RS_FEED_ERRORSTATE_DOWNLOAD_... state
 */
RsFeedReaderErrorState p3FeedReaderThread::download(const RsFeedReaderFeed &feed, std::string &content, std::string &icon, std::string &errorString)
{
#ifdef FEEDREADER_DEBUG
	std::cerr << "p3FeedReaderThread::download - feed " << feed.feedId << " (" << feed.name << ")" << std::endl;
#endif

	/* content types a feed may be delivered with */
	static const char *const acceptedTypes[] = {
		"text/xml",
		"text/html",
		"application/rss+xml",
		"application/xml",
		"application/xhtml+xml",
		"application/atom+xml"
	};
	static const size_t acceptedTypeCount = sizeof(acceptedTypes) / sizeof(acceptedTypes[0]);

	content.clear();
	errorString.clear();

	RsFeedReaderErrorState result;

	std::string proxy = getProxyForFeed(feed);
	CURLWrapper CURL(proxy);
	CURLcode code = CURL.downloadText(feed.url, content);

	if (code != CURLE_OK) {
		result = RS_FEED_ERRORSTATE_DOWNLOAD_ERROR;
		errorString = curl_easy_strerror(code);
	} else {
		const long responseCode = CURL.responseCode();

		if (responseCode == 200) {
			const std::string contentType = CURL.contentType();

			bool accepted = false;
			for (size_t typeIndex = 0; !accepted && typeIndex < acceptedTypeCount; ++typeIndex) {
				accepted = isContentType(contentType, acceptedTypes[typeIndex]);
			}

			if (accepted) {
				/* ok */
				result = RS_FEED_ERRORSTATE_OK;
			} else {
				result = RS_FEED_ERRORSTATE_DOWNLOAD_UNKNOWN_CONTENT_TYPE;
				errorString = contentType;
			}
		} else if (responseCode == 404) {
			result = RS_FEED_ERRORSTATE_DOWNLOAD_NOT_FOUND;
		} else {
			result = RS_FEED_ERRORSTATE_DOWNLOAD_UNKOWN_RESPONSE_CODE;
			rs_sprintf(errorString, "%ld", responseCode);
		}

		/* the icon is requested for every response of the server */
		getFavicon(CURL, feed.url, icon);
	}

#ifdef FEEDREADER_DEBUG
	std::cerr << "p3FeedReaderThread::download - feed " << feed.feedId << " (" << feed.name << "), result " << result << ", error = " << errorString << std::endl;
#endif

	return result;
}
|
|
|
|
/***************************************************************************/
|
|
/****************************** Process ************************************/
|
|
/***************************************************************************/
|
|
|
|
/**
 * Iterates over the entries of a feed.
 *
 * @param feedFormat  format of the feed; decides where the search starts and
 *                    whether "entry" (ATOM) or "item" elements are looked for
 * @param channel     channel node, NULL ends the iteration
 * @param item        entry returned by the previous call, NULL to start
 * @return next matching element node or NULL when there is none
 */
static xmlNodePtr getNextItem(FeedFormat feedFormat, xmlNodePtr channel, xmlNodePtr item)
{
	if (!channel) {
		return NULL;
	}

	xmlNodePtr candidate;
	if (item) {
		/* continue behind the previous entry */
		candidate = item->next;
	} else if (feedFormat == FORMAT_RSS || feedFormat == FORMAT_ATOM) {
		/* entries are children of the channel */
		candidate = channel->children;
	} else if (feedFormat == FORMAT_RDF) {
		/* entries are siblings of the channel */
		candidate = channel->next;
	} else {
		return NULL;
	}

	const xmlChar *wantedName = (feedFormat == FORMAT_ATOM) ? BAD_CAST"entry" : BAD_CAST"item";

	while (candidate) {
		if (candidate->type == XML_ELEMENT_NODE && xmlStrcasecmp(candidate->name, wantedName) == 0) {
			return candidate;
		}
		candidate = candidate->next;
	}

	return NULL;
}
|
|
|
|
/**
 * Splits a string at every occurrence of a delimiter.
 * Empty parts between two delimiters are kept, an empty part behind the last
 * delimiter is dropped.
 *
 * @param s  string to split
 * @param v  cleared first, receives the parts in order
 * @param d  delimiter
 */
static void splitString(std::string s, std::vector<std::string> &v, const char d)
{
	v.clear();

	std::string::size_type partStart = 0;
	for (;;) {
		const std::string::size_type delimiterPos = s.find(d, partStart);
		if (delimiterPos == std::string::npos) {
			break;
		}
		v.push_back(s.substr(partStart, delimiterPos - partStart));
		partStart = delimiterPos + 1;
	}

	/* rest behind the last delimiter */
	if (partStart < s.size()) {
		v.push_back(s.substr(partStart));
	}
}
|
|
|
|
/**
 * Converts a calendar date and time into seconds relative to
 * 1970-01-01 00:00:00 (no time zone handling).
 *
 * When time_t has 4 bytes, years from 2038 on (signed time_t) or from 2115 on
 * (unsigned time_t) are replaced by a fixed date of that limit year.
 *
 * @param year    full year, e.g. 2012
 * @param mon     month, 1 = January
 * @param day     day of month
 * @param hour    hour
 * @param minute  minute
 * @param second  second
 * @return seconds
 */
static unsigned int ymdhms_to_seconds(int year, int mon, int day, int hour, int minute, int second)
{
	if (sizeof(time_t) == 4) {
		/* limit the date for a 32 bit time_t */
		const bool signedTime = ((time_t)-1 < 0);
		const int limitYear = signedTime ? 2038 : 2115;
		if (year >= limitYear) {
			year = limitYear;
			mon = 0;
			day = 1;
			hour = 0;
			minute = 0;
			second = 0;
		}
	}

	/* -1 for January and February, 0 otherwise (integer division truncates) */
	const int monthShift = (mon - 14) / 12;

	/* days */
	unsigned int total = (day - 32075)
	                   + 1461L * (year + 4800L + monthShift) / 4
	                   + 367 * (mon - 2 - monthShift * 12) / 12
	                   - 3 * ((year + 4900L + monthShift) / 100) / 4
	                   - 2440588;

	total = total * 24 + hour;   /* hours */
	total = total * 60 + minute; /* minutes */
	total = total * 60 + second; /* seconds */

	return total;
}
|
|
|
|
/* lower case three letter month names in calendar order,
 * the month index is the position of a name divided by 3 */
static const char haystack[37] =
	"jan" "feb" "mar" "apr" "may" "jun"
	"jul" "aug" "sep" "oct" "nov" "dec";

// we follow the recommendation of rfc2822 to consider all
// obsolete time zones not listed here equivalent to "-0000"
static const struct {
	const char tzName[4]; /* zone name, an empty name ends the table */
	int tzOffset;         /* offset of the zone in minutes */
} known_zones[] = {
	{ "UT",     0 },
	{ "GMT",    0 },
	{ "EST", -300 },
	{ "EDT", -240 },
	{ "CST", -360 },
	{ "CDT", -300 },
	{ "MST", -420 },
	{ "MDT", -360 },
	{ "PST", -480 },
	{ "PDT", -420 },
	{ "",       0 }
};
|
|
|
|
// copied from KRFCDate::parseDate
|
|
static time_t parseRFC822Date(const std::string &pubDate)
|
|
{
|
|
if (pubDate.empty())
|
|
return 0;
|
|
|
|
// This parse a date in the form:
|
|
// Wednesday, 09-Nov-99 23:12:40 GMT
|
|
// or
|
|
// Sat, 01-Jan-2000 08:00:00 GMT
|
|
// or
|
|
// Sat, 01 Jan 2000 08:00:00 GMT
|
|
// or
|
|
// 01 Jan 99 22:00 +0100 (exceptions in rfc822/rfc2822)
|
|
//
|
|
// We ignore the weekday
|
|
//
|
|
time_t result = 0;
|
|
int offset = 0;
|
|
char *newPosStr;
|
|
const char *dateString = pubDate.c_str();
|
|
int day = 0;
|
|
char monthStr[4];
|
|
int month = -1;
|
|
int year = 0;
|
|
int hour = 0;
|
|
int minute = 0;
|
|
int second = 0;
|
|
|
|
// Strip leading space
|
|
while(*dateString && isspace(*dateString))
|
|
dateString++;
|
|
|
|
// Strip weekday
|
|
while(*dateString && !isdigit(*dateString) && !isspace(*dateString))
|
|
dateString++;
|
|
|
|
// Strip trailing space
|
|
while(*dateString && isspace(*dateString))
|
|
dateString++;
|
|
|
|
if (!*dateString)
|
|
return result; // Invalid date
|
|
|
|
if (isalpha(*dateString))
|
|
{
|
|
// ' Nov 5 1994 18:15:30 GMT'
|
|
// Strip leading space
|
|
while(*dateString && isspace(*dateString))
|
|
dateString++;
|
|
|
|
for(int i=0; i < 3;i++)
|
|
{
|
|
if (!*dateString || (*dateString == '-') || isspace(*dateString))
|
|
return result; // Invalid date
|
|
monthStr[i] = tolower(*dateString++);
|
|
}
|
|
monthStr[3] = '\0';
|
|
|
|
newPosStr = (char*)strstr(haystack, monthStr);
|
|
|
|
if (!newPosStr)
|
|
return result; // Invalid date
|
|
|
|
month = (newPosStr-haystack)/3; // Jan=00, Feb=01, Mar=02, ..
|
|
|
|
if ((month < 0) || (month > 11))
|
|
return result; // Invalid date
|
|
|
|
while (*dateString && isalpha(*dateString))
|
|
dateString++; // Skip rest of month-name
|
|
}
|
|
|
|
// ' 09-Nov-99 23:12:40 GMT'
|
|
// ' 5 1994 18:15:30 GMT'
|
|
day = strtol(dateString, &newPosStr, 10);
|
|
dateString = newPosStr;
|
|
|
|
if ((day < 1) || (day > 31))
|
|
return result; // Invalid date;
|
|
|
|
if (!*dateString)
|
|
return result; // Invalid date
|
|
|
|
while(*dateString && (isspace(*dateString) || (*dateString == '-')))
|
|
dateString++;
|
|
|
|
if (month == -1)
|
|
{
|
|
for(int i=0; i < 3;i++)
|
|
{
|
|
if (!*dateString || (*dateString == '-') || isspace(*dateString))
|
|
return result; // Invalid date
|
|
monthStr[i] = tolower(*dateString++);
|
|
}
|
|
monthStr[3] = '\0';
|
|
|
|
newPosStr = (char*)strstr(haystack, monthStr);
|
|
|
|
if (!newPosStr)
|
|
return result; // Invalid date
|
|
|
|
month = (newPosStr-haystack)/3; // Jan=00, Feb=01, Mar=02, ..
|
|
|
|
if ((month < 0) || (month > 11))
|
|
return result; // Invalid date
|
|
|
|
while (*dateString && isalpha(*dateString))
|
|
dateString++; // Skip rest of month-name
|
|
|
|
}
|
|
|
|
// '-99 23:12:40 GMT'
|
|
while(*dateString && (isspace(*dateString) || (*dateString == '-')))
|
|
dateString++;
|
|
|
|
if (!*dateString || !isdigit(*dateString))
|
|
return result; // Invalid date
|
|
|
|
// '99 23:12:40 GMT'
|
|
year = strtol(dateString, &newPosStr, 10);
|
|
dateString = newPosStr;
|
|
|
|
// Y2K: Solve 2 digit years
|
|
if ((year >= 0) && (year < 50))
|
|
year += 2000;
|
|
|
|
if ((year >= 50) && (year < 100))
|
|
year += 1900; // Y2K
|
|
|
|
if ((year < 1900) || (year > 2500))
|
|
return result; // Invalid date
|
|
|
|
// Don't fail if the time is missing.
|
|
if (*dateString)
|
|
{
|
|
// ' 23:12:40 GMT'
|
|
if (!isspace(*dateString++))
|
|
return result; // Invalid date
|
|
|
|
hour = strtol(dateString, &newPosStr, 10);
|
|
dateString = newPosStr;
|
|
|
|
if ((hour < 0) || (hour > 23))
|
|
return result; // Invalid date
|
|
|
|
if (!*dateString)
|
|
return result; // Invalid date
|
|
|
|
// ':12:40 GMT'
|
|
if (*dateString++ != ':')
|
|
return result; // Invalid date
|
|
|
|
minute = strtol(dateString, &newPosStr, 10);
|
|
dateString = newPosStr;
|
|
|
|
if ((minute < 0) || (minute > 59))
|
|
return result; // Invalid date
|
|
|
|
if (!*dateString)
|
|
return result; // Invalid date
|
|
|
|
// ':40 GMT'
|
|
if (*dateString != ':' && !isspace(*dateString))
|
|
return result; // Invalid date
|
|
|
|
// seconds are optional in rfc822 + rfc2822
|
|
if (*dateString ==':') {
|
|
dateString++;
|
|
|
|
second = strtol(dateString, &newPosStr, 10);
|
|
dateString = newPosStr;
|
|
|
|
if ((second < 0) || (second > 59))
|
|
return result; // Invalid date
|
|
} else {
|
|
dateString++;
|
|
}
|
|
|
|
while(*dateString && isspace(*dateString))
|
|
dateString++;
|
|
}
|
|
|
|
// don't fail if the time zone is missing, some
|
|
// broken mail-/news-clients omit the time zone
|
|
if (*dateString) {
|
|
if ((strncasecmp(dateString, "gmt", 3) == 0) ||
|
|
(strncasecmp(dateString, "utc", 3) == 0))
|
|
{
|
|
dateString += 3;
|
|
while(*dateString && isspace(*dateString))
|
|
dateString++;
|
|
}
|
|
|
|
if ((*dateString == '+') || (*dateString == '-')) {
|
|
offset = strtol(dateString, &newPosStr, 10);
|
|
if (abs(offset) < 30)
|
|
{
|
|
dateString = newPosStr;
|
|
|
|
offset = offset * 100;
|
|
|
|
if (*dateString && *(dateString+1))
|
|
{
|
|
dateString++;
|
|
int minutes = strtol(dateString, &newPosStr, 10);
|
|
if (offset > 0)
|
|
offset += minutes;
|
|
else
|
|
offset -= minutes;
|
|
}
|
|
}
|
|
|
|
if ((offset < -9959) || (offset > 9959))
|
|
return result; // Invalid date
|
|
|
|
int sgn = (offset < 0)? -1:1;
|
|
offset = abs(offset);
|
|
offset = ((offset / 100)*60 + (offset % 100))*sgn;
|
|
} else {
|
|
for (int i=0; known_zones[i].tzName[0] != 0; i++) {
|
|
if (0 == strncasecmp(dateString, known_zones[i].tzName, strlen(known_zones[i].tzName))) {
|
|
offset = known_zones[i].tzOffset;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
result = ymdhms_to_seconds(year, month+1, day, hour, minute, second);
|
|
|
|
// avoid negative time values
|
|
if ((offset > 0) && (offset > result))
|
|
offset = 0;
|
|
|
|
result -= offset*60;
|
|
|
|
// If epoch 0 return epoch +1 which is Thu, 01-Jan-70 00:00:01 GMT
|
|
// This is so that parse error and valid epoch 0 return values won't
|
|
// be the same for sensitive applications...
|
|
if (result < 1) result = 1;
|
|
|
|
return result;
|
|
}
|
|
|
|
// copied and converted to std::string from KRFCDate::parseDateISO8601
|
|
static time_t parseISO8601Date(const std::string &pubDate)
|
|
{
|
|
if (pubDate.empty()) {
|
|
return 0;
|
|
}
|
|
|
|
// These dates look like this:
|
|
// YYYY-MM-DDTHH:MM:SS
|
|
// But they may also have 0, 1 or 2 suffixes.
|
|
// Suffix 1: .secfrac (fraction of second)
|
|
// Suffix 2: Either 'Z' or +zone or -zone, where zone is HHMM
|
|
|
|
unsigned int year = 0;
|
|
unsigned int month = 0;
|
|
unsigned int mday = 0;
|
|
unsigned int hour = 0;
|
|
unsigned int min = 0;
|
|
unsigned int sec = 0;
|
|
|
|
int offset = 0;
|
|
|
|
std::string input = pubDate;
|
|
|
|
// First find the 'T' separator, if any.
|
|
int tPos = input.find('T');
|
|
|
|
// If there is no time, no month or no day specified, fill those missing
|
|
// fields so that 'input' matches YYYY-MM-DDTHH:MM:SS
|
|
if (-1 == tPos) {
|
|
int dashes = 0;
|
|
std::string::iterator it;
|
|
for (it = input.begin(); it != input.end(); ++it) {
|
|
if (*it == '-') {
|
|
++dashes;
|
|
}
|
|
}
|
|
if (0 == dashes) {
|
|
input += "-01-01";
|
|
} else if (1 == dashes) {
|
|
input += "-01";
|
|
}
|
|
tPos = input.length();
|
|
input += "T12:00:00";
|
|
}
|
|
|
|
// Now parse the date part.
|
|
|
|
std::string dateString = input.substr(0, tPos);//.stripWhiteSpace();
|
|
|
|
std::string timeString = input.substr(tPos + 1);//.stripWhiteSpace();
|
|
|
|
std::vector<std::string> l;
|
|
splitString(dateString, l, '-');
|
|
|
|
if (l.size() < 3)
|
|
return 0;
|
|
|
|
sscanf(l[0].c_str(), "%u", &year);
|
|
sscanf(l[1].c_str(), "%u", &month);
|
|
sscanf(l[2].c_str(), "%u", &mday);
|
|
|
|
// Z suffix means UTC.
|
|
if ('Z' == timeString[timeString.length() - 1]) {
|
|
timeString.erase(timeString.length() - 1, 1);
|
|
}
|
|
|
|
// +zone or -zone suffix (offset from UTC).
|
|
|
|
int plusPos = timeString.find_last_of('+');
|
|
|
|
if (-1 != plusPos) {
|
|
std::string offsetString = timeString.substr(plusPos + 1);
|
|
|
|
unsigned int offsetHour;
|
|
unsigned int offsetMinute;
|
|
|
|
sscanf(offsetString.substr(0, 1).c_str(), "%u", &offsetHour);
|
|
sscanf(offsetString.substr(offsetString.length() - 2).c_str(), "%u", &offsetMinute);
|
|
|
|
offset = offsetHour * 60 + offsetMinute;
|
|
|
|
timeString = timeString.substr(0, plusPos);
|
|
} else {
|
|
int minusPos = timeString.find_last_of('-');
|
|
|
|
if (-1 != minusPos) {
|
|
std::string offsetString = timeString.substr(minusPos + 1);
|
|
|
|
unsigned int offsetHour;
|
|
unsigned int offsetMinute;
|
|
|
|
sscanf(offsetString.substr(0, 1).c_str(), "%u", &offsetHour);
|
|
sscanf(offsetString.substr(offsetString.length() - 2).c_str(), "%u", &offsetMinute);
|
|
|
|
timeString = timeString.substr(0, minusPos);
|
|
}
|
|
}
|
|
|
|
// secfrac suffix.
|
|
int dotPos = timeString.find_last_of('.');
|
|
|
|
if (-1 != dotPos) {
|
|
timeString = timeString.substr(0, dotPos);
|
|
}
|
|
|
|
// Now parse the time part.
|
|
|
|
splitString(timeString, l, ':');
|
|
|
|
if (l.size() < 3)
|
|
return 0;
|
|
|
|
sscanf(l[0].c_str(), "%u", &hour);
|
|
sscanf(l[1].c_str(), "%u", &min);
|
|
sscanf(l[2].c_str(), "%u", &sec);
|
|
|
|
time_t result = ymdhms_to_seconds(year, month, mday, hour, min, sec);
|
|
|
|
// avoid negative time values
|
|
if ((offset > 0) && (offset > result))
|
|
offset = 0;
|
|
|
|
result -= offset*60;
|
|
|
|
// If epoch 0 return epoch +1 which is Thu, 01-Jan-70 00:00:01 GMT
|
|
// This is so that parse error and valid epoch 0 return values won't
|
|
// be the same for sensitive applications...
|
|
if (result < 1) result = 1;
|
|
|
|
return result;
|
|
}
|
|
|
|
/**
 * Parses the downloaded content of a feed (RSS, RDF or ATOM) and creates one
 * message per entry that has a title.
 *
 * When the feed has RS_FEED_FLAG_INFO_FROM_FEED set, title and description of
 * the channel are passed to the feed reader with setFeedInfo.
 *
 * @param feed         feed to process, feed.content holds the document
 * @param entries      the created messages are appended, the caller owns them
 * @param errorString  receives a description when the result is not
 *                     RS_FEED_ERRORSTATE_OK
 * @return RS_FEED_ERRORSTATE_OK,
 *         RS_FEED_ERRORSTATE_PROCESS_UNKNOWN_FORMAT (unknown root element,
 *         no channel or no root element) or
 *         RS_FEED_ERRORSTATE_PROCESS_INTERNAL_ERROR (document can not be read)
 */
RsFeedReaderErrorState p3FeedReaderThread::process(const RsFeedReaderFeed &feed, std::list<RsFeedReaderMsg*> &entries, std::string &errorString)
{
#ifdef FEEDREADER_DEBUG
	std::cerr << "p3FeedReaderThread::process - feed " << feed.feedId << " (" << feed.name << ")" << std::endl;
#endif

	RsFeedReaderErrorState result = RS_FEED_ERRORSTATE_OK;

	XMLWrapper xml;
	if (xml.readXML(feed.content.c_str())) {
		xmlNodePtr root = xml.getRootElement();
		if (root) {
			/* only used when the root element is known, the default
			 * just keeps the variable initialized */
			FeedFormat feedFormat = FORMAT_RSS;
			if (xmlStrcasecmp(root->name, BAD_CAST"rss") == 0) {
				feedFormat = FORMAT_RSS;
			} else if (xmlStrcasecmp (root->name, BAD_CAST"rdf") == 0) {
				feedFormat = FORMAT_RDF;
			} else if (xmlStrcasecmp (root->name, BAD_CAST"feed") == 0) {
				feedFormat = FORMAT_ATOM;
			} else {
				result = RS_FEED_ERRORSTATE_PROCESS_UNKNOWN_FORMAT;
				errorString = "Only RSS, RDF or ATOM supported";
			}

			if (result == RS_FEED_ERRORSTATE_OK) {
				xmlNodePtr channel = NULL;
				switch (feedFormat) {
				case FORMAT_RSS:
				case FORMAT_RDF:
					channel = xml.findNode(root->children, "channel");
					break;
				case FORMAT_ATOM:
					/* the entries are children of the root element */
					channel = root;
					break;
				}

				if (channel) {
					/* import header info */
					if (feed.flag & RS_FEED_FLAG_INFO_FROM_FEED) {
						std::string title;
						if (xml.getChildText(channel, "title", title) && !title.empty()) {
							std::string::size_type p;
							while ((p = title.find_first_of("\r\n")) != std::string::npos) {
								title.erase(p, 1);
							}
							std::string description;
							xml.getChildText(channel, (feedFormat == FORMAT_ATOM) ? "subtitle" : "description", description);
							mFeedReader->setFeedInfo(feed.feedId, title, description);
						}
					}

					/* process items */
					xmlNodePtr node;
					for (node = NULL; (node = getNextItem(feedFormat, channel, node)) != NULL; ) {
						if (!isRunning()) {
							break;
						}

						/* entries without title are skipped */
						std::string title;
						if (!xml.getChildText(node, "title", title) || title.empty()) {
							continue;
						}

						/* remove newlines */
						std::string::size_type p;
						while ((p = title.find_first_of("\r\n")) != std::string::npos) {
							title.erase(p, 1);
						}

						RsFeedReaderMsg *item = new RsFeedReaderMsg();
						item->msgId.clear(); // is calculated later
						item->feedId = feed.feedId;
						item->title = title;

						/* try feedburner:origLink */
						if (!xml.getChildText(node, "origLink", item->link) || item->link.empty()) {
							xml.getChildText(node, "link", item->link);
							if (item->link.empty()) {
								/* <link href="..."/>, there may be no link at all */
								xmlNodePtr linkNode = xml.findNode(node, "link", true);
								if (linkNode) {
									item->link = xml.getAttr(linkNode, "href");
								}
							}
						}

						// remove sid=
						std::string linkUpper;
						stringToUpperCase(item->link, linkUpper);
						std::string::size_type sidStart = linkUpper.find("SID=");
						if (sidStart != std::string::npos) {
							std::string::size_type sidEnd1 = linkUpper.find(";", sidStart);
							std::string::size_type sidEnd2 = linkUpper.find("#", sidStart);

							if (sidEnd1 == std::string::npos) {
								sidEnd1 = linkUpper.size();
							}
							if (sidEnd2 == std::string::npos) {
								sidEnd2 = linkUpper.size();
							}

							if (sidStart > 0 && linkUpper[sidStart - 1] == '&') {
								sidStart--;
							}

							std::string::size_type sidEnd = std::min(sidEnd1, sidEnd2);
							item->link.erase(sidStart, sidEnd - sidStart);
						}

						if (feedFormat == FORMAT_ATOM) {
							/* <author><name>...</name></author> */
							xmlNodePtr author = xml.findNode(node->children, "author", false);
							if (author) {
								/* the name is a child of the author, not of the entry */
								xml.getChildText(author, "name", item->author);
							}
						} else {
							if (!xml.getChildText(node, "author", item->author)) {
								xml.getChildText(node, "creator", item->author);
							}
						}

						switch (feedFormat) {
						case FORMAT_RSS:
						case FORMAT_RDF:
							/* try content:encoded */
							if (!xml.getChildText(node, "encoded", item->description)) {
								/* use description */
								xml.getChildText(node, "description", item->description);
							}
							break;
						case FORMAT_ATOM:
							/* try content */
							if (!xml.getChildText(node, "content", item->description)) {
								/* use summary */
								xml.getChildText(node, "summary", item->description);
							}
							break;
						}

						/* the last date element found wins */
						std::string pubDate;
						if (xml.getChildText(node, "pubDate", pubDate)) {
							item->pubDate = parseRFC822Date(pubDate);
						}
						if (xml.getChildText(node, "date", pubDate)) {
							item->pubDate = parseISO8601Date (pubDate);
						}
						if (xml.getChildText(node, "updated", pubDate)) {
							// atom
							item->pubDate = parseISO8601Date (pubDate);
						}

						if (item->pubDate == 0) {
							/* use current time */
							item->pubDate = time(NULL);
						}

						entries.push_back(item);
					}
				} else {
					result = RS_FEED_ERRORSTATE_PROCESS_UNKNOWN_FORMAT;
					errorString = "Channel not found";
				}
			}
		} else {
			result = RS_FEED_ERRORSTATE_PROCESS_UNKNOWN_FORMAT;
			errorString = "Can't read document";
		}
	} else {
		result = RS_FEED_ERRORSTATE_PROCESS_INTERNAL_ERROR;
		errorString = xml.lastError();
	}

#ifdef FEEDREADER_DEBUG
	std::cerr << "p3FeedReaderThread::process - feed " << feed.feedId << " (" << feed.name << "), result " << result << ", error = " << errorString << std::endl;
	if (result == RS_FEED_ERRORSTATE_PROCESS_INTERNAL_ERROR) {
		std::cerr << "  Error: " << errorString << std::endl;
	}
#endif

	return result;
}
|
|
|
|
std::string p3FeedReaderThread::getProxyForFeed(const RsFeedReaderFeed &feed)
|
|
{
|
|
std::string proxy;
|
|
if (feed.flag & RS_FEED_FLAG_STANDARD_PROXY) {
|
|
std::string standardProxyAddress;
|
|
uint16_t standardProxyPort;
|
|
if (mFeedReader->getStandardProxy(standardProxyAddress, standardProxyPort)) {
|
|
rs_sprintf(proxy, "%s:%u", standardProxyAddress.c_str(), standardProxyPort);
|
|
}
|
|
} else {
|
|
if (!feed.proxyAddress.empty() && feed.proxyPort) {
|
|
rs_sprintf(proxy, "%s:%u", feed.proxyAddress.c_str(), feed.proxyPort);
|
|
}
|
|
}
|
|
return proxy;
|
|
}
|
|
|
|
RsFeedReaderErrorState p3FeedReaderThread::processMsg(const RsFeedReaderFeed &feed, RsFeedReaderMsg *msg, std::string &errorString)
|
|
{
|
|
//long todo_fill_errorString;
|
|
|
|
if (!msg) {
|
|
return RS_FEED_ERRORSTATE_PROCESS_INTERNAL_ERROR;
|
|
}
|
|
|
|
RsFeedReaderErrorState result = RS_FEED_ERRORSTATE_OK;
|
|
std::string proxy = getProxyForFeed(feed);
|
|
|
|
std::string url;
|
|
if (feed.flag & RS_FEED_FLAG_SAVE_COMPLETE_PAGE) {
|
|
#ifdef FEEDREADER_DEBUG
|
|
std::cerr << "p3FeedReaderThread::processHTML - feed " << feed.feedId << " (" << feed.name << ") download page " << msg->link << std::endl;
|
|
#endif
|
|
std::string content;
|
|
CURLWrapper CURL(proxy);
|
|
CURLcode code = CURL.downloadText(msg->link, content);
|
|
|
|
if (code == CURLE_OK) {
|
|
long responseCode = CURL.responseCode();
|
|
|
|
switch (responseCode) {
|
|
case 200:
|
|
{
|
|
std::string contentType = CURL.contentType();
|
|
|
|
if (isContentType(CURL.contentType(), "text/html")) {
|
|
/* ok */
|
|
msg->description = content;
|
|
} else {
|
|
result = RS_FEED_ERRORSTATE_DOWNLOAD_UNKNOWN_CONTENT_TYPE;
|
|
errorString = contentType;
|
|
}
|
|
}
|
|
break;
|
|
case 404:
|
|
result = RS_FEED_ERRORSTATE_DOWNLOAD_NOT_FOUND;
|
|
break;
|
|
default:
|
|
result = RS_FEED_ERRORSTATE_DOWNLOAD_UNKOWN_RESPONSE_CODE;
|
|
rs_sprintf(errorString, "%ld", responseCode);
|
|
}
|
|
} else {
|
|
result = RS_FEED_ERRORSTATE_DOWNLOAD_ERROR;
|
|
errorString = curl_easy_strerror(code);
|
|
}
|
|
|
|
if (result != RS_FEED_ERRORSTATE_OK) {
|
|
#ifdef FEEDREADER_DEBUG
|
|
std::cerr << "p3FeedReaderThread::processHTML - feed " << feed.feedId << " (" << feed.name << ") cannot download page, CURLCode = " << code << ", error = " << errorString << std::endl;
|
|
#endif
|
|
return result;
|
|
}
|
|
|
|
/* get effective url (moved location) */
|
|
std::string effectiveUrl = CURL.effectiveUrl();
|
|
url = getBaseLink(effectiveUrl.empty() ? msg->link : effectiveUrl);
|
|
}
|
|
|
|
/* check if string contains xml chars (very simple test) */
|
|
if (msg->description.find('<') == std::string::npos && feed.transformationType == RS_FEED_TRANSFORMATION_TYPE_NONE) {
|
|
return result;
|
|
}
|
|
|
|
if (isRunning()) {
|
|
/* process description */
|
|
//long todo; // encoding
|
|
HTMLWrapper html;
|
|
if (html.readHTML(msg->description.c_str(), url.c_str())) {
|
|
xmlNodePtr root = html.getRootElement();
|
|
if (root) {
|
|
std::list<xmlNodePtr> nodesToDelete;
|
|
|
|
/* process all children */
|
|
std::list<xmlNodePtr> nodes;
|
|
nodes.push_back(root);
|
|
|
|
while (!nodes.empty()) {
|
|
if (!isRunning()) {
|
|
break;
|
|
}
|
|
xmlNodePtr node = nodes.front();
|
|
nodes.pop_front();
|
|
|
|
switch (node->type) {
|
|
case XML_ELEMENT_NODE:
|
|
if (xmlStrcasecmp(node->name, BAD_CAST"img") == 0) {
|
|
/* process images */
|
|
|
|
if ((feed.flag & RS_FEED_FLAG_EMBED_IMAGES) == 0) {
|
|
/* remove image */
|
|
xmlUnlinkNode(node);
|
|
nodesToDelete.push_back(node);
|
|
continue;
|
|
}
|
|
} else if (xmlStrcasecmp(node->name, BAD_CAST"script") == 0) {
|
|
/* remove script */
|
|
xmlUnlinkNode(node);
|
|
nodesToDelete.push_back(node);
|
|
continue;
|
|
}
|
|
|
|
xmlNodePtr child;
|
|
for (child = node->children; child; child = child->next) {
|
|
nodes.push_back(child);
|
|
}
|
|
break;
|
|
|
|
case XML_TEXT_NODE:
|
|
{
|
|
/* check for only space */
|
|
std::string content;
|
|
if (html.getContent(node, content, false)) {
|
|
std::string newContent = content;
|
|
|
|
/* trim */
|
|
XMLWrapper::trimString(newContent);
|
|
|
|
if (newContent.empty()) {
|
|
xmlUnlinkNode(node);
|
|
nodesToDelete.push_back(node);
|
|
} else {
|
|
if (content != newContent) {
|
|
html.setContent(node, newContent.c_str());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
|
|
case XML_COMMENT_NODE:
|
|
// xmlUnlinkNode(node);
|
|
// nodesToDelete.push_back(node);
|
|
break;
|
|
|
|
case XML_ATTRIBUTE_NODE:
|
|
case XML_CDATA_SECTION_NODE:
|
|
case XML_ENTITY_REF_NODE:
|
|
case XML_ENTITY_NODE:
|
|
case XML_PI_NODE:
|
|
case XML_DOCUMENT_NODE:
|
|
case XML_DOCUMENT_TYPE_NODE:
|
|
case XML_DOCUMENT_FRAG_NODE:
|
|
case XML_NOTATION_NODE:
|
|
case XML_HTML_DOCUMENT_NODE:
|
|
case XML_DTD_NODE:
|
|
case XML_ELEMENT_DECL:
|
|
case XML_ATTRIBUTE_DECL:
|
|
case XML_ENTITY_DECL:
|
|
case XML_NAMESPACE_DECL:
|
|
case XML_XINCLUDE_START:
|
|
case XML_XINCLUDE_END:
|
|
#ifdef LIBXML_DOCB_ENABLED
|
|
case XML_DOCB_DOCUMENT_NODE:
|
|
#endif
|
|
break;
|
|
}
|
|
}
|
|
|
|
std::list<xmlNodePtr>::iterator nodeIt;
|
|
for (nodeIt = nodesToDelete.begin(); nodeIt != nodesToDelete.end(); ++nodeIt) {
|
|
xmlFreeNode(*nodeIt);
|
|
}
|
|
nodesToDelete.clear();
|
|
|
|
if (isRunning() && result == RS_FEED_ERRORSTATE_OK) {
|
|
unsigned int xpathCount;
|
|
unsigned int xpathIndex;
|
|
XPathWrapper *xpath = html.createXPath();
|
|
if (xpath) {
|
|
/* process images */
|
|
if (xpath->compile("//img")) {
|
|
xpathCount = xpath->count();
|
|
for (xpathIndex = 0; xpathIndex < xpathCount; ++xpathIndex) {
|
|
if (!isRunning()) {
|
|
break;
|
|
}
|
|
xmlNodePtr node = xpath->node(xpathIndex);
|
|
|
|
if (node->type == XML_ELEMENT_NODE) {
|
|
bool removeImage = true;
|
|
|
|
if (feed.flag & RS_FEED_FLAG_EMBED_IMAGES) {
|
|
/* embed image */
|
|
std::string src = html.getAttr(node, "src");
|
|
if (!src.empty()) {
|
|
/* download image */
|
|
#ifdef FEEDREADER_DEBUG
|
|
std::cerr << "p3FeedReaderThread::processHTML - feed " << feed.feedId << " (" << feed.name << ") download image " << src << std::endl;
|
|
#endif
|
|
std::vector<unsigned char> data;
|
|
CURLWrapper CURL(proxy);
|
|
CURLcode code = CURL.downloadBinary(calculateLink(url, src), data);
|
|
if (code == CURLE_OK && CURL.responseCode() == 200) {
|
|
std::string contentType = CURL.contentType();
|
|
if (isContentType(contentType, "image/")) {
|
|
std::string base64;
|
|
if (toBase64(data, base64)) {
|
|
std::string imageBase64;
|
|
rs_sprintf(imageBase64, "data:%s;base64,%s", contentType.c_str(), base64.c_str());
|
|
if (html.setAttr(node, "src", imageBase64.c_str())) {
|
|
removeImage = false;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (removeImage) {
|
|
/* remove image */
|
|
xmlUnlinkNode(node);
|
|
nodesToDelete.push_back(node);
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
// unable to compile xpath expression
|
|
result = RS_FEED_ERRORSTATE_PROCESS_XPATH_INTERNAL_ERROR;
|
|
}
|
|
delete(xpath);
|
|
xpath = NULL;
|
|
} else {
|
|
// unable to create xpath object
|
|
result = RS_FEED_ERRORSTATE_PROCESS_XPATH_INTERNAL_ERROR;
|
|
std::cerr << "p3FeedReaderThread::process - feed " << feed.feedId << " (" << feed.name << "), unable to create xpath object" << std::endl;
|
|
}
|
|
}
|
|
|
|
for (nodeIt = nodesToDelete.begin(); nodeIt != nodesToDelete.end(); ++nodeIt) {
|
|
xmlFreeNode(*nodeIt);
|
|
}
|
|
nodesToDelete.clear();
|
|
|
|
if (result == RS_FEED_ERRORSTATE_OK) {
|
|
if (isRunning()) {
|
|
if (!html.saveHTML(msg->description)) {
|
|
errorString = html.lastError();
|
|
#ifdef FEEDREADER_DEBUG
|
|
std::cerr << "p3FeedReaderThread::processHTML - feed " << feed.feedId << " (" << feed.name << ") cannot dump html" << std::endl;
|
|
std::cerr << " Error: " << errorString << std::endl;
|
|
#endif
|
|
result = RS_FEED_ERRORSTATE_PROCESS_INTERNAL_ERROR;
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
#ifdef FEEDREADER_DEBUG
|
|
std::cerr << "p3FeedReaderThread::processHTML - feed " << feed.feedId << " (" << feed.name << ") no root element" << std::endl;
|
|
#endif
|
|
result = RS_FEED_ERRORSTATE_PROCESS_HTML_ERROR;
|
|
}
|
|
} else {
|
|
errorString = html.lastError();
|
|
#ifdef FEEDREADER_DEBUG
|
|
std::cerr << "p3FeedReaderThread::processHTML - feed " << feed.feedId << " (" << feed.name << ") cannot read html" << std::endl;
|
|
std::cerr << " Error: " << errorString << std::endl;
|
|
#endif
|
|
result = RS_FEED_ERRORSTATE_PROCESS_HTML_ERROR;
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/**
 * Run the transformation configured for a feed on a single message.
 *
 * For the XPath and XSLT transformation types the message description is
 * copied to descriptionTransformed and the copy is handed to processXPath()
 * respectively processXslt(). For the "none" type nothing is transformed.
 * Finally descriptionTransformed is cleared when it is identical to the
 * original description.
 *
 * @param feed        feed providing the transformation type and its settings
 * @param msg         message whose descriptionTransformed is (re)written
 * @param errorString passed through to the transformation functions
 * @return state returned by the executed transformation,
 *         RS_FEED_ERRORSTATE_OK when no transformation was executed
 */
RsFeedReaderErrorState p3FeedReaderThread::processTransformation(const RsFeedReaderFeed &feed, RsFeedReaderMsg *msg, std::string &errorString)
{
	RsFeedReaderErrorState state = RS_FEED_ERRORSTATE_OK;

	std::string &transformed = msg->descriptionTransformed;
	const std::string &original = msg->description;

	if (feed.transformationType == RS_FEED_TRANSFORMATION_TYPE_XPATH) {
		/* work on a copy of the description */
		transformed = original;
		state = processXPath(feed.xpathsToUse.ids, feed.xpathsToRemove.ids, transformed, errorString);
	} else if (feed.transformationType == RS_FEED_TRANSFORMATION_TYPE_XSLT) {
		/* work on a copy of the description */
		transformed = original;
		state = processXslt(feed.xslt, transformed, errorString);
	}
	/* RS_FEED_TRANSFORMATION_TYPE_NONE (and anything else): nothing to transform */

	/* keep the transformed text only when it differs from the original */
	if (transformed == original) {
		transformed.clear();
	}

	return state;
}
|
|
|
|
/**
 * Apply the "use" and "remove" xpath lists to an already parsed html document.
 *
 * Use list: every node matched by an expression of xpathsToUse is unlinked
 * from its current position and appended to the body of a newly created
 * document. When all expressions succeeded, that new document is assigned
 * to html.
 * Remove list: every node matched by an expression of xpathsToRemove is
 * unlinked and freed.
 *
 * Processing stops at the first expression that cannot be compiled or that
 * matches nothing. Note that nodes moved by earlier expressions of the use
 * list have already been unlinked from html at that point.
 *
 * @param xpathsToUse    xpath expressions selecting the nodes to keep
 * @param xpathsToRemove xpath expressions selecting the nodes to drop
 * @param html           document to process
 * @param errorString    receives the failing xpath expression
 * @return RS_FEED_ERRORSTATE_OK on success, otherwise
 *         RS_FEED_ERRORSTATE_PROCESS_XPATH_INTERNAL_ERROR (no xpath object),
 *         RS_FEED_ERRORSTATE_PROCESS_XPATH_WRONG_EXPRESSION (compile failed),
 *         RS_FEED_ERRORSTATE_PROCESS_XPATH_NO_RESULT (expression matched nothing) or
 *         RS_FEED_ERRORSTATE_PROCESS_HTML_ERROR (new document could not be created)
 */
RsFeedReaderErrorState p3FeedReaderThread::processXPath(const std::list<std::string> &xpathsToUse, const std::list<std::string> &xpathsToRemove, HTMLWrapper &html, std::string &errorString)
{
#warning p3FeedReaderThread.cc TODO thunder2: fill_errorString;

	if (xpathsToUse.empty() && xpathsToRemove.empty()) {
		return RS_FEED_ERRORSTATE_OK;
	}

	XPathWrapper *xpath = html.createXPath();
	if (xpath == NULL) {
		// unable to create xpath object
		std::cerr << "p3FeedReaderThread::processXPath - unable to create xpath object" << std::endl;
		return RS_FEED_ERRORSTATE_PROCESS_XPATH_INTERNAL_ERROR;
	}

	RsFeedReaderErrorState result = RS_FEED_ERRORSTATE_OK;

	unsigned int xpathCount;
	unsigned int xpathIndex;
	std::list<std::string>::const_iterator xpathIt;

	if (!xpathsToUse.empty()) {
		HTMLWrapper htmlNew;
		if (htmlNew.createHTML()) {
			xmlNodePtr body = htmlNew.getBody();
			if (body) {
				/* process use list */
				for (xpathIt = xpathsToUse.begin(); xpathIt != xpathsToUse.end(); ++xpathIt) {
					if (xpath->compile(xpathIt->c_str())) {
						xpathCount = xpath->count();
						if (xpathCount) {
							for (xpathIndex = 0; xpathIndex < xpathCount; ++xpathIndex) {
								/* move the node from the source document into the new body */
								xmlNodePtr node = xpath->node(xpathIndex);
								xmlUnlinkNode(node);
								xmlAddChild(body, node);
							}
						} else {
							result = RS_FEED_ERRORSTATE_PROCESS_XPATH_NO_RESULT;
							errorString = *xpathIt;
							break;
						}
					} else {
						// unable to process xpath expression
#ifdef FEEDREADER_DEBUG
						std::cerr << "p3FeedReaderThread::processXPath - unable to process xpath expression" << std::endl;
#endif
						errorString = *xpathIt;
						result = RS_FEED_ERRORSTATE_PROCESS_XPATH_WRONG_EXPRESSION;
						/* stop at the first error (same as the remove list below), otherwise
						 * further nodes are moved out of html although the result is discarded
						 * and the reported error can be overwritten by a later expression */
						break;
					}
				}
			} else {
				result = RS_FEED_ERRORSTATE_PROCESS_HTML_ERROR;
			}
		} else {
			result = RS_FEED_ERRORSTATE_PROCESS_HTML_ERROR;
		}

		if (result == RS_FEED_ERRORSTATE_OK) {
			/* NOTE(review): xpath was created from html before this assignment,
			 * presumably XPathWrapper looks up the current document on compile - confirm */
			html = htmlNew;
		}
	}

	if (result == RS_FEED_ERRORSTATE_OK) {
		std::list<xmlNodePtr> nodesToDelete;

		/* process remove list */
		for (xpathIt = xpathsToRemove.begin(); xpathIt != xpathsToRemove.end(); ++xpathIt) {
			if (xpath->compile(xpathIt->c_str())) {
				xpathCount = xpath->count();
				if (xpathCount) {
					for (xpathIndex = 0; xpathIndex < xpathCount; ++xpathIndex) {
						xmlNodePtr node = xpath->node(xpathIndex);

						/* only unlink here, the nodes are freed after the loop */
						xmlUnlinkNode(node);
						nodesToDelete.push_back(node);
					}
				} else {
					result = RS_FEED_ERRORSTATE_PROCESS_XPATH_NO_RESULT;
					errorString = *xpathIt;
					break;
				}
			} else {
				// unable to process xpath expression
#ifdef FEEDREADER_DEBUG
				std::cerr << "p3FeedReaderThread::processXPath - unable to process xpath expression" << std::endl;
#endif
				errorString = *xpathIt;
				result = RS_FEED_ERRORSTATE_PROCESS_XPATH_WRONG_EXPRESSION;
				break;
			}
		}

		/* free all unlinked nodes, also when the loop was left with an error */
		std::list<xmlNodePtr>::iterator nodeIt;
		for (nodeIt = nodesToDelete.begin(); nodeIt != nodesToDelete.end(); ++nodeIt) {
			xmlFreeNode(*nodeIt);
		}
		nodesToDelete.clear();
	}

	delete(xpath);

	return result;
}
|
|
|
|
/**
 * Apply the "use" and "remove" xpath lists to a description given as html text.
 *
 * The text is parsed, processed by the HTMLWrapper overload of processXPath()
 * and written back to description when the processing succeeded.
 *
 * @param xpathsToUse    xpath expressions selecting the nodes to keep
 * @param xpathsToRemove xpath expressions selecting the nodes to drop
 * @param description    html text, overwritten with the processed html on success
 * @param errorString    receives the error text on failure
 * @return RS_FEED_ERRORSTATE_OK on success or when both lists are empty,
 *         RS_FEED_ERRORSTATE_PROCESS_HTML_ERROR when the text cannot be read or has no root element,
 *         RS_FEED_ERRORSTATE_PROCESS_INTERNAL_ERROR when the result cannot be saved,
 *         otherwise the state returned by the xpath processing
 */
RsFeedReaderErrorState p3FeedReaderThread::processXPath(const std::list<std::string> &xpathsToUse, const std::list<std::string> &xpathsToRemove, std::string &description, std::string &errorString)
{
	/* nothing to do without any expression */
	if (xpathsToUse.empty() && xpathsToRemove.empty()) {
		return RS_FEED_ERRORSTATE_OK;
	}

	/* parse description */
#warning p3FeedReaderThread.cc TODO thunder2: encoding
	HTMLWrapper document;
	if (!document.readHTML(description.c_str(), "")) {
		errorString = document.lastError();
#ifdef FEEDREADER_DEBUG
		std::cerr << "p3FeedReaderThread::processXPath - cannot read html" << std::endl;
		std::cerr << " Error: " << errorString << std::endl;
#endif
		return RS_FEED_ERRORSTATE_PROCESS_HTML_ERROR;
	}

	if (!document.getRootElement()) {
#ifdef FEEDREADER_DEBUG
		std::cerr << "p3FeedReaderThread::processXPath - no root element" << std::endl;
#endif
		errorString = "No root element found";
		return RS_FEED_ERRORSTATE_PROCESS_HTML_ERROR;
	}

	/* run the expressions on the parsed document */
	const RsFeedReaderErrorState state = processXPath(xpathsToUse, xpathsToRemove, document, errorString);
	if (state != RS_FEED_ERRORSTATE_OK) {
		return state;
	}

	/* write the processed document back */
	if (document.saveHTML(description)) {
		return RS_FEED_ERRORSTATE_OK;
	}

	errorString = document.lastError();
#ifdef FEEDREADER_DEBUG
	std::cerr << "p3FeedReaderThread::processXPath - cannot dump html" << std::endl;
	std::cerr << " Error: " << errorString << std::endl;
#endif
	return RS_FEED_ERRORSTATE_PROCESS_INTERNAL_ERROR;
}
|
|
|
|
/**
 * Transform an already parsed html document with an xslt stylesheet.
 *
 * The stylesheet text is read as xml and applied to html. The nodes of the
 * transformation result are copied into the body of a newly created html
 * document: when the result root is "html" and its first child is "body",
 * copying starts with the first child of that body, otherwise it starts with
 * the result root itself. When everything succeeded the new document is
 * assigned to html, on any error html is left unchanged.
 *
 * @param xslt        stylesheet as xml text
 * @param html        document to transform
 * @param errorString receives the error text of the stylesheet read or the transformation
 * @return RS_FEED_ERRORSTATE_OK on success, otherwise
 *         RS_FEED_ERRORSTATE_PROCESS_XSLT_FORMAT_ERROR (stylesheet cannot be read),
 *         RS_FEED_ERRORSTATE_PROCESS_XSLT_TRANSFORM_ERROR (transformation failed),
 *         RS_FEED_ERRORSTATE_PROCESS_XSLT_NO_RESULT (result has no root element),
 *         RS_FEED_ERRORSTATE_PROCESS_HTML_ERROR (new document could not be created) or
 *         RS_FEED_ERRORSTATE_PROCESS_INTERNAL_ERROR (a result node could not be copied or added)
 */
RsFeedReaderErrorState p3FeedReaderThread::processXslt(const std::string &xslt, HTMLWrapper &html, std::string &errorString)
{
	XMLWrapper style;
	if (!style.readXML(xslt.c_str())) {
		errorString = style.lastError();
#ifdef FEEDREADER_DEBUG
		std::cerr << "p3FeedReaderThread::processXslt - error loading style" << std::endl;
		std::cerr << " Error: " << errorString << std::endl;
#endif
		return RS_FEED_ERRORSTATE_PROCESS_XSLT_FORMAT_ERROR;
	}

	XMLWrapper xmlResult;
	if (!html.transform(style, xmlResult)) {
		errorString = html.lastError();
#ifdef FEEDREADER_DEBUG
		std::cerr << "p3FeedReaderThread::processXslt - error transform" << std::endl;
		std::cerr << " Error: " << errorString << std::endl;
#endif
		return RS_FEED_ERRORSTATE_PROCESS_XSLT_TRANSFORM_ERROR;
	}

	RsFeedReaderErrorState result = RS_FEED_ERRORSTATE_OK;

	xmlNodePtr root = xmlResult.getRootElement();
	if (root) {
		/* skip the <html><body> envelope of the result, the new document has its own */
		if (xmlResult.nodeName(root) == "html") {
			if (root->children && xmlResult.nodeName(root->children) == "body") {
				root = root->children->children;
			}
		}
		HTMLWrapper htmlNew;
		if (htmlNew.createHTML()) {
			xmlNodePtr body = htmlNew.getBody();
			if (body) {
				/* copy result nodes */
				xmlNodePtr node;
				for (node = root; node; node = node->next) {
					xmlNodePtr newNode = xmlCopyNode(node, 1);
					if (newNode) {
						if (!xmlAddChild(body, newNode)) {
							/* the copy was not taken over by the body, free it and report
							 * the error instead of silently accepting a truncated document */
							xmlFreeNode(newNode);
							result = RS_FEED_ERRORSTATE_PROCESS_INTERNAL_ERROR;
#ifdef FEEDREADER_DEBUG
							std::cerr << "p3FeedReaderThread::processXslt - node add error" << std::endl;
#endif
							break;
						}
					} else {
						result = RS_FEED_ERRORSTATE_PROCESS_INTERNAL_ERROR;
#ifdef FEEDREADER_DEBUG
						std::cerr << "p3FeedReaderThread::processXslt - node copy error" << std::endl;
#endif
						break;
					}
				}
			} else {
				result = RS_FEED_ERRORSTATE_PROCESS_HTML_ERROR;
			}
		} else {
			result = RS_FEED_ERRORSTATE_PROCESS_HTML_ERROR;
		}

		if (result == RS_FEED_ERRORSTATE_OK) {
			html = htmlNew;
		}
	} else {
#ifdef FEEDREADER_DEBUG
		std::cerr << "p3FeedReaderThread::processXslt - no result" << std::endl;
#endif
		result = RS_FEED_ERRORSTATE_PROCESS_XSLT_NO_RESULT;
	}

	return result;
}
|
|
|
|
/**
 * Transform a description given as html text with an xslt stylesheet.
 *
 * The text is parsed, processed by the HTMLWrapper overload of processXslt()
 * and written back to description when the processing succeeded.
 *
 * @param xslt        stylesheet as xml text, an empty stylesheet means nothing to do
 * @param description html text, overwritten with the transformed html on success
 * @param errorString receives the error text on failure
 * @return RS_FEED_ERRORSTATE_OK on success or when xslt is empty,
 *         RS_FEED_ERRORSTATE_PROCESS_HTML_ERROR when the text cannot be read or has no root element,
 *         RS_FEED_ERRORSTATE_PROCESS_INTERNAL_ERROR when the result cannot be saved,
 *         otherwise the state returned by the xslt processing
 */
RsFeedReaderErrorState p3FeedReaderThread::processXslt(const std::string &xslt, std::string &description, std::string &errorString)
{
	/* nothing to do without a stylesheet */
	if (xslt.empty()) {
		return RS_FEED_ERRORSTATE_OK;
	}

	/* parse description */
	// TODO: encoding
	HTMLWrapper document;
	if (!document.readHTML(description.c_str(), "")) {
		errorString = document.lastError();
#ifdef FEEDREADER_DEBUG
		std::cerr << "p3FeedReaderThread::processXslt - cannot read html" << std::endl;
		std::cerr << " Error: " << errorString << std::endl;
#endif
		return RS_FEED_ERRORSTATE_PROCESS_HTML_ERROR;
	}

	if (!document.getRootElement()) {
#ifdef FEEDREADER_DEBUG
		std::cerr << "p3FeedReaderThread::processXslt - no root element" << std::endl;
#endif
		errorString = "No root element found";
		return RS_FEED_ERRORSTATE_PROCESS_HTML_ERROR;
	}

	/* run the stylesheet on the parsed document */
	const RsFeedReaderErrorState state = processXslt(xslt, document, errorString);
	if (state != RS_FEED_ERRORSTATE_OK) {
		return state;
	}

	/* write the transformed document back */
	if (document.saveHTML(description)) {
		return RS_FEED_ERRORSTATE_OK;
	}

	errorString = document.lastError();
#ifdef FEEDREADER_DEBUG
	std::cerr << "p3FeedReaderThread::processXslt - cannot dump html" << std::endl;
	std::cerr << " Error: " << errorString << std::endl;
#endif
	return RS_FEED_ERRORSTATE_PROCESS_INTERNAL_ERROR;
}
|