/****************************************************************
 *  RetroShare GUI is distributed under the following license:
 *
 *  Copyright (C) 2012 by Thunder
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version 2
 *  of the License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor,
 *  Boston, MA  02110-1301, USA.
 ****************************************************************/

#include "p3FeedReaderThread.h"
#include "rsFeedReaderItems.h"
#include "util/rsstring.h"
#include "util/CURLWrapper.h"
#include "util/XMLWrapper.h"
#include "util/HTMLWrapper.h"
#include "util/XPathWrapper.h"

/* NOTE(review): the two system includes below were unreadable in the reviewed
 * copy of this file; they are reconstructed from the symbols used here
 * (BIO_* / BIO_f_base64 and usleep) - confirm against the repository. */
#include <openssl/bio.h>
#include <openssl/evp.h>
#include <unistd.h> // for usleep

#include <cstring>
#include <iostream>
#include <list>
#include <string>
#include <vector>

enum FeedFormat { FORMAT_RSS, FORMAT_RDF, FORMAT_ATOM };

/*********
 * #define FEEDREADER_DEBUG
 *********/
#define FEEDREADER_DEBUG

/* Stores the owning feed reader, the kind of work this thread performs
 * (DOWNLOAD or PROCESS) and the feed id that is handed to the feed reader
 * when asking for work. */
p3FeedReaderThread::p3FeedReaderThread(p3FeedReader *feedReader, Type type, const std::string &feedId)
	: RsThread(), mFeedReader(feedReader), mType(type), mFeedId(feedId)
{
}

p3FeedReaderThread::~p3FeedReaderThread()
{
}

/***************************************************************************/
/****************************** Thread *************************************/
/***************************************************************************/

/* Thread main loop.
 *
 * Once per second the thread asks the feed reader for a feed to work on:
 *  - DOWNLOAD: downloads the feed and reports success/error to the feed reader.
 *  - PROCESS:  parses the feed, lets the feed reader filter the messages,
 *              processes every description and hands the messages over.
 *
 * Messages still contained in the list after the hand-over are deleted here.
 */
void p3FeedReaderThread::run()
{
	while (isRunning()) {
#ifdef WIN32
		Sleep(1000);
#else
		usleep(1000000);
#endif
		/* every second */

		switch (mType) {
		case DOWNLOAD:
			{
				RsFeedReaderFeed feed;
				if (mFeedReader->getFeedToDownload(feed, mFeedId)) {
					std::string content;
					std::string icon;
					std::string errorString;

					RsFeedReaderErrorState result = download(feed, content, icon, errorString);
					if (result == RS_FEED_ERRORSTATE_OK) {
						mFeedReader->onDownloadSuccess(feed.feedId, content, icon);
					} else {
						mFeedReader->onDownloadError(feed.feedId, result, errorString);
					}
				}
			}
			break;
		case PROCESS:
			{
				RsFeedReaderFeed feed;
				if (mFeedReader->getFeedToProcess(feed, mFeedId)) {
					std::list<RsFeedReaderMsg*> msgs;
					std::string errorString;
					std::list<RsFeedReaderMsg*>::iterator it;

					RsFeedReaderErrorState result = process(feed, msgs, errorString);
					if (result == RS_FEED_ERRORSTATE_OK) {
						/* first, filter the messages */
						mFeedReader->onProcessSuccess_filterMsg(feed.feedId, msgs);
						if (isRunning()) {
							/* second, process the descriptions */
							for (it = msgs.begin(); it != msgs.end(); ) {
								RsFeedReaderMsg *mi = *it;
								result = processMsg(feed, mi, errorString);
								if (result != RS_FEED_ERRORSTATE_OK) {
									break;
								}

								if (feed.preview) {
									/* add every message */
									it = msgs.erase(it);

									std::list<RsFeedReaderMsg*> msgSingle;
									msgSingle.push_back(mi);
									mFeedReader->onProcessSuccess_addMsgs(feed.feedId, msgSingle, true);

									/* delete not accepted message */
									std::list<RsFeedReaderMsg*>::iterator it1;
									for (it1 = msgSingle.begin(); it1 != msgSingle.end(); ++it1) {
										delete (*it1);
									}
								} else {
									++it;
								}
							}

							if (result == RS_FEED_ERRORSTATE_OK) {
								/* third, add messages */
								mFeedReader->onProcessSuccess_addMsgs(feed.feedId, msgs, false);
							} else {
								mFeedReader->onProcessError(feed.feedId, result, errorString);
							}
						}
					} else {
						mFeedReader->onProcessError(feed.feedId, result, errorString);
					}

					/* delete not accepted messages */
					for (it = msgs.begin(); it != msgs.end(); ++it) {
						delete (*it);
					}
					msgs.clear();
				}
			}
			break;
		}
	}
}

/***************************************************************************/
/****************************** Download ***********************************/
/***************************************************************************/

/* Returns true when contentType starts with type (case-insensitive), so
 * parameters like "; charset=utf-8" after the media type are ignored. */
static bool isContentType(const std::string &contentType, const char *type)
{
	return (strncasecmp(contentType.c_str(), type, strlen(type)) == 0);
}

/* Encodes data as base64 (without newlines) into base64.
 *
 * Returns true only when encoding succeeded and produced output; empty
 * input therefore yields false. base64 is left untouched on failure.
 */
static bool toBase64(const std::vector<unsigned char> &data, std::string &base64)
{
	bool result = false;

	/* Set up a base64 encoding BIO that writes to a memory BIO */
	BIO *b64 = BIO_new(BIO_f_base64());
	if (b64) {
		BIO_set_flags(b64, BIO_FLAGS_BASE64_NO_NL);
		BIO *bmem = BIO_new(BIO_s_mem());
		if (bmem) {
			/* the close behaviour is set with BIO_set_close, BIO_CLOSE is not a BIO flag */
			BIO_set_close(bmem, BIO_CLOSE); // probably redundant
			b64 = BIO_push(b64, bmem);

			/* Send the data and flush the encoder */
			const int length = static_cast<int>(data.size());
			if (BIO_write(b64, data.data(), length) == length && BIO_flush(b64) > 0) {
				/* Collect the encoded data */
				char *temp = NULL;
				long count = BIO_get_mem_data(bmem, &temp);
				if (count > 0 && temp) {
					base64.assign(temp, count);
					result = true;
				}
			}
		}

		/* frees the whole chain (including bmem once it was pushed) */
		BIO_free_all(b64);
	}

	return result;
}

/* Returns link truncated after its last '/'; link is returned unchanged
 * when it contains no '/'. */
static std::string getBaseLink(std::string link)
{
	size_t found = link.rfind('/');
	if (found != std::string::npos) {
		link.erase(found + 1);
	}

	return link;
}

/* Returns the length of a leading "http://" or "https://" of link,
 * 0 when link starts with neither. */
static std::string::size_type schemeLength(const std::string &link)
{
	if (link.compare(0, 7, "http://") == 0) {
		return 7;
	}
	if (link.compare(0, 8, "https://") == 0) {
		return 8;
	}

	return 0;
}

/* Resolves link against baseLink.
 *
 *  - link starting with "http://" or "https://" is returned unchanged
 *  - baseLink without one of these schemes is prefixed with "http://"
 *  - empty link returns the (prefixed) base link
 *  - link starting with "//" takes over the scheme of the base link
 *  - link starting with "/" replaces the path of the base link
 *  - every other link is appended to the base link, separated by exactly
 *    one "/"
 */
static std::string calculateLink(const std::string &baseLink, const std::string &link)
{
	if (schemeLength(link) > 0) {
		/* absolute link */
		return link;
	}

	/* calculate link of base link */
	std::string resultLink = baseLink;

	/* link should begin with "http://" or "https://" */
	std::string::size_type hostStart = schemeLength(resultLink);
	if (hostStart == 0) {
		resultLink.insert(0, "http://");
		hostStart = 7;
	}

	if (link.empty()) {
		/* no link */
		return resultLink;
	}

	if (link.compare(0, 2, "//") == 0) {
		/* protocol-relative link, keep "http:" or "https:" of the base link */
		return resultLink.substr(0, hostStart - 2) + link;
	}

	if (link[0] == '/') {
		/* link begins with "/", cut the path of the base link */
		size_t found = resultLink.find('/', hostStart);
		if (found != std::string::npos) {
			resultLink.erase(found);
		}
	} else {
		/* check for "/" at the end (resultLink is never empty here) */
		if (resultLink[resultLink.size() - 1] != '/') {
			resultLink += "/";
		}
	}

	resultLink += link;

	return resultLink;
}

/* Downloads "/favicon.ico" of the host of url and stores it base64 encoded
 * in icon.
 *
 * icon is cleared first. Returns true only for a response code 200 with an
 * accepted content type and a non-empty body that could be encoded.
 */
static bool getFavicon(CURLWrapper &CURL, const std::string &url, std::string &icon)
{
	icon.clear();

	bool result = false;

	std::vector<unsigned char> vicon;
	CURLcode code = CURL.downloadBinary(calculateLink(url, "/favicon.ico"), vicon);
	if (code == CURLE_OK) {
		if (CURL.responseCode() == 200) {
			std::string contentType = CURL.contentType();
			if (isContentType(contentType, "image/x-icon") ||
				isContentType(contentType, "image/vnd.microsoft.icon") ||
				isContentType(contentType, "application/octet-stream") ||
				isContentType(contentType, "text/plain")) {
				if (!vicon.empty()) {
					/* TODO: check that the data really is an icon */
					result = toBase64(vicon, icon);
				}
			}
		}
	}

	return result;
}

/* Downloads the feed.
 *
 * feed    - feed to download (url and proxy settings are used)
 * content - receives the downloaded text
 * icon    - receives the base64 encoded favicon, empty when the download
 *           failed or no icon could be fetched
 * error   - receives additional error information (content type, response
 *           code or the curl error text)
 *
 * Returns RS_FEED_ERRORSTATE_OK when the feed was downloaded with response
 * code 200 and an accepted content type.
 */
RsFeedReaderErrorState p3FeedReaderThread::download(const RsFeedReaderFeed &feed, std::string &content, std::string &icon, std::string &error)
{
#ifdef FEEDREADER_DEBUG
	std::cerr << "p3FeedReaderThread::download - feed " << feed.feedId << " (" << feed.name << ")" << std::endl;
#endif

	content.clear();
	icon.clear();
	error.clear();

	RsFeedReaderErrorState result;

	std::string proxy = getProxyForFeed(feed);
	CURLWrapper CURL(proxy);
	CURLcode code = CURL.downloadText(feed.url, content);

	if (code == CURLE_OK) {
		long responseCode = CURL.responseCode();

		switch (responseCode) {
		case 200:
			{
				std::string contentType = CURL.contentType();

				if (isContentType(contentType, "text/xml") ||
					isContentType(contentType, "text/html") ||
					isContentType(contentType, "application/rss+xml") ||
					isContentType(contentType, "application/xml") ||
					isContentType(contentType, "application/xhtml+xml") ||
					isContentType(contentType, "application/atom+xml")) {
					/* ok */
					result = RS_FEED_ERRORSTATE_OK;
				} else {
					result = RS_FEED_ERRORSTATE_DOWNLOAD_UNKNOWN_CONTENT_TYPE;
					error = contentType;
				}
			}
			break;
		case 404:
			result = RS_FEED_ERRORSTATE_DOWNLOAD_NOT_FOUND;
			break;
		default:
			result = RS_FEED_ERRORSTATE_DOWNLOAD_UNKOWN_RESPONSE_CODE;
			rs_sprintf(error, "%ld", responseCode);
		}

		if (result == RS_FEED_ERRORSTATE_OK) {
			/* best effort, a missing icon is not an error; skipped for a
			 * failed download to avoid a useless second request */
			getFavicon(CURL, feed.url, icon);
		}
	} else {
		result = RS_FEED_ERRORSTATE_DOWNLOAD_ERROR;
		error = curl_easy_strerror(code);
	}

#ifdef FEEDREADER_DEBUG
	std::cerr << "p3FeedReaderThread::download - feed " << feed.feedId << " (" << feed.name << "), result " << result << ", error = " << error << std::endl;
#endif

	return result;
}

/***************************************************************************/
/****************************** Process ************************************/
/***************************************************************************/
static xmlNodePtr getNextItem(FeedFormat feedFormat, xmlNodePtr channel, xmlNodePtr item) { if (!channel) { return NULL; } if (!item) { switch (feedFormat) { case FORMAT_RSS: case FORMAT_ATOM: item = channel->children; break; case FORMAT_RDF: item = channel->next; break; default: return NULL; } } else { item = item->next; } for (; item; item = item->next) { if (item->type == XML_ELEMENT_NODE && xmlStrcasecmp(item->name, (feedFormat == FORMAT_ATOM) ? BAD_CAST"entry" : BAD_CAST"item") == 0) { break; } } return item; } static void splitString(std::string s, std::vector &v, const char d) { v.clear(); std::string::size_type p; while ((p = s.find_first_of(d)) != std::string::npos) { v.push_back(s.substr(0, p)); s.erase(0, p + 1); } if (!s.empty()) { v.push_back(s); } } static unsigned int ymdhms_to_seconds(int year, int mon, int day, int hour, int minute, int second) { if (sizeof(time_t) == 4) { if ((time_t)-1 < 0) { if (year >= 2038) { year = 2038; mon = 0; day = 1; hour = 0; minute = 0; second = 0; } } else { if (year >= 2115) { year = 2115; mon = 0; day = 1; hour = 0; minute = 0; second = 0; } } } unsigned int ret = (day - 32075) /* days */ + 1461L * (year + 4800L + (mon - 14) / 12) / 4 + 367 * (mon - 2 - (mon - 14) / 12 * 12) / 12 - 3 * ((year + 4900L + (mon - 14) / 12) / 100) / 4 - 2440588; ret = 24*ret + hour; /* hours */ ret = 60*ret + minute; /* minutes */ ret = 60*ret + second; /* seconds */ return ret; } static const char haystack[37]="janfebmaraprmayjunjulaugsepoctnovdec"; // we follow the recommendation of rfc2822 to consider all // obsolete time zones not listed here equivalent to "-0000" static const struct { const char tzName[4]; int tzOffset; } known_zones[] = { { "UT", 0 }, { "GMT", 0 }, { "EST", -300 }, { "EDT", -240 }, { "CST", -360 }, { "CDT", -300 }, { "MST", -420 }, { "MDT", -360 }, { "PST", -480 }, { "PDT", -420 }, { { 0,0,0,0 }, 0 } }; // copied from KRFCDate::parseDate static time_t parseRFC822Date(const std::string &pubDate) { if 
(pubDate.empty()) return 0; // This parse a date in the form: // Wednesday, 09-Nov-99 23:12:40 GMT // or // Sat, 01-Jan-2000 08:00:00 GMT // or // Sat, 01 Jan 2000 08:00:00 GMT // or // 01 Jan 99 22:00 +0100 (exceptions in rfc822/rfc2822) // // We ignore the weekday // time_t result = 0; int offset = 0; char *newPosStr; const char *dateString = pubDate.c_str(); int day = 0; char monthStr[4]; int month = -1; int year = 0; int hour = 0; int minute = 0; int second = 0; // Strip leading space while(*dateString && isspace(*dateString)) dateString++; // Strip weekday while(*dateString && !isdigit(*dateString) && !isspace(*dateString)) dateString++; // Strip trailing space while(*dateString && isspace(*dateString)) dateString++; if (!*dateString) return result; // Invalid date if (isalpha(*dateString)) { // ' Nov 5 1994 18:15:30 GMT' // Strip leading space while(*dateString && isspace(*dateString)) dateString++; for(int i=0; i < 3;i++) { if (!*dateString || (*dateString == '-') || isspace(*dateString)) return result; // Invalid date monthStr[i] = tolower(*dateString++); } monthStr[3] = '\0'; newPosStr = (char*)strstr(haystack, monthStr); if (!newPosStr) return result; // Invalid date month = (newPosStr-haystack)/3; // Jan=00, Feb=01, Mar=02, .. 
if ((month < 0) || (month > 11)) return result; // Invalid date while (*dateString && isalpha(*dateString)) dateString++; // Skip rest of month-name } // ' 09-Nov-99 23:12:40 GMT' // ' 5 1994 18:15:30 GMT' day = strtol(dateString, &newPosStr, 10); dateString = newPosStr; if ((day < 1) || (day > 31)) return result; // Invalid date; if (!*dateString) return result; // Invalid date while(*dateString && (isspace(*dateString) || (*dateString == '-'))) dateString++; if (month == -1) { for(int i=0; i < 3;i++) { if (!*dateString || (*dateString == '-') || isspace(*dateString)) return result; // Invalid date monthStr[i] = tolower(*dateString++); } monthStr[3] = '\0'; newPosStr = (char*)strstr(haystack, monthStr); if (!newPosStr) return result; // Invalid date month = (newPosStr-haystack)/3; // Jan=00, Feb=01, Mar=02, .. if ((month < 0) || (month > 11)) return result; // Invalid date while (*dateString && isalpha(*dateString)) dateString++; // Skip rest of month-name } // '-99 23:12:40 GMT' while(*dateString && (isspace(*dateString) || (*dateString == '-'))) dateString++; if (!*dateString || !isdigit(*dateString)) return result; // Invalid date // '99 23:12:40 GMT' year = strtol(dateString, &newPosStr, 10); dateString = newPosStr; // Y2K: Solve 2 digit years if ((year >= 0) && (year < 50)) year += 2000; if ((year >= 50) && (year < 100)) year += 1900; // Y2K if ((year < 1900) || (year > 2500)) return result; // Invalid date // Don't fail if the time is missing. 
if (*dateString) { // ' 23:12:40 GMT' if (!isspace(*dateString++)) return result; // Invalid date hour = strtol(dateString, &newPosStr, 10); dateString = newPosStr; if ((hour < 0) || (hour > 23)) return result; // Invalid date if (!*dateString) return result; // Invalid date // ':12:40 GMT' if (*dateString++ != ':') return result; // Invalid date minute = strtol(dateString, &newPosStr, 10); dateString = newPosStr; if ((minute < 0) || (minute > 59)) return result; // Invalid date if (!*dateString) return result; // Invalid date // ':40 GMT' if (*dateString != ':' && !isspace(*dateString)) return result; // Invalid date // seconds are optional in rfc822 + rfc2822 if (*dateString ==':') { dateString++; second = strtol(dateString, &newPosStr, 10); dateString = newPosStr; if ((second < 0) || (second > 59)) return result; // Invalid date } else { dateString++; } while(*dateString && isspace(*dateString)) dateString++; } // don't fail if the time zone is missing, some // broken mail-/news-clients omit the time zone if (*dateString) { if ((strncasecmp(dateString, "gmt", 3) == 0) || (strncasecmp(dateString, "utc", 3) == 0)) { dateString += 3; while(*dateString && isspace(*dateString)) dateString++; } if ((*dateString == '+') || (*dateString == '-')) { offset = strtol(dateString, &newPosStr, 10); if (abs(offset) < 30) { dateString = newPosStr; offset = offset * 100; if (*dateString && *(dateString+1)) { dateString++; int minutes = strtol(dateString, &newPosStr, 10); if (offset > 0) offset += minutes; else offset -= minutes; } } if ((offset < -9959) || (offset > 9959)) return result; // Invalid date int sgn = (offset < 0)? 
-1:1; offset = abs(offset); offset = ((offset / 100)*60 + (offset % 100))*sgn; } else { for (int i=0; known_zones[i].tzName != 0; i++) { if (0 == strncasecmp(dateString, known_zones[i].tzName, strlen(known_zones[i].tzName))) { offset = known_zones[i].tzOffset; break; } } } } result = ymdhms_to_seconds(year, month+1, day, hour, minute, second); // avoid negative time values if ((offset > 0) && (offset > result)) offset = 0; result -= offset*60; // If epoch 0 return epoch +1 which is Thu, 01-Jan-70 00:00:01 GMT // This is so that parse error and valid epoch 0 return values won't // be the same for sensitive applications... if (result < 1) result = 1; return result; } // copied and converted to std::string from KRFCDate::parseDateISO8601 static time_t parseISO8601Date(const std::string &pubDate) { if (pubDate.empty()) { return 0; } // These dates look like this: // YYYY-MM-DDTHH:MM:SS // But they may also have 0, 1 or 2 suffixes. // Suffix 1: .secfrac (fraction of second) // Suffix 2: Either 'Z' or +zone or -zone, where zone is HHMM unsigned int year = 0; unsigned int month = 0; unsigned int mday = 0; unsigned int hour = 0; unsigned int min = 0; unsigned int sec = 0; int offset = 0; std::string input = pubDate; // First find the 'T' separator, if any. int tPos = input.find('T'); // If there is no time, no month or no day specified, fill those missing // fields so that 'input' matches YYYY-MM-DDTHH:MM:SS if (-1 == tPos) { int dashes = 0; std::string::iterator it; for (it = input.begin(); it != input.end(); ++it) { if (*it == '-') { ++dashes; } } if (0 == dashes) { input += "-01-01"; } else if (1 == dashes) { input += "-01"; } tPos = input.length(); input += "T12:00:00"; } // Now parse the date part. 
std::string dateString = input.substr(0, tPos);//.stripWhiteSpace(); std::string timeString = input.substr(tPos + 1);//.stripWhiteSpace(); std::vector l; splitString(dateString, l, '-'); if (l.size() < 3) return 0; sscanf(l[0].c_str(), "%u", &year); sscanf(l[1].c_str(), "%u", &month); sscanf(l[2].c_str(), "%u", &mday); // Z suffix means UTC. if ('Z' == timeString[timeString.length() - 1]) { timeString.erase(timeString.length() - 1, 1); } // +zone or -zone suffix (offset from UTC). int plusPos = timeString.find_last_of('+'); if (-1 != plusPos) { std::string offsetString = timeString.substr(plusPos + 1); unsigned int offsetHour; unsigned int offsetMinute; sscanf(offsetString.substr(0, 1).c_str(), "%u", &offsetHour); sscanf(offsetString.substr(offsetString.length() - 2).c_str(), "%u", &offsetMinute); offset = offsetHour * 60 + offsetMinute; timeString = timeString.substr(0, plusPos); } else { int minusPos = timeString.find_last_of('-'); if (-1 != minusPos) { std::string offsetString = timeString.substr(minusPos + 1); unsigned int offsetHour; unsigned int offsetMinute; sscanf(offsetString.substr(0, 1).c_str(), "%u", &offsetHour); sscanf(offsetString.substr(offsetString.length() - 2).c_str(), "%u", &offsetMinute); timeString = timeString.substr(0, minusPos); } } // secfrac suffix. int dotPos = timeString.find_last_of('.'); if (-1 != dotPos) { timeString = timeString.substr(0, dotPos); } // Now parse the time part. splitString(timeString, l, ':'); if (l.size() < 3) return 0; sscanf(l[0].c_str(), "%u", &hour); sscanf(l[1].c_str(), "%u", &min); sscanf(l[2].c_str(), "%u", &sec); time_t result = ymdhms_to_seconds(year, month, mday, hour, min, sec); // avoid negative time values if ((offset > 0) && (offset > result)) offset = 0; result -= offset*60; // If epoch 0 return epoch +1 which is Thu, 01-Jan-70 00:00:01 GMT // This is so that parse error and valid epoch 0 return values won't // be the same for sensitive applications... 
if (result < 1) result = 1; return result; } RsFeedReaderErrorState p3FeedReaderThread::process(const RsFeedReaderFeed &feed, std::list &entries, std::string &error) { #ifdef FEEDREADER_DEBUG std::cerr << "p3FeedReaderThread::process - feed " << feed.feedId << " (" << feed.name << ")" << std::endl; #endif RsFeedReaderErrorState result = RS_FEED_ERRORSTATE_OK; XMLWrapper xml; if (xml.readXML(feed.content.c_str())) { xmlNodePtr root = xml.getRootElement(); if (root) { FeedFormat feedFormat; if (xmlStrcasecmp(root->name, BAD_CAST"rss") == 0) { feedFormat = FORMAT_RSS; } else if (xmlStrcasecmp (root->name, BAD_CAST"rdf") == 0) { feedFormat = FORMAT_RDF; } else if (xmlStrcasecmp (root->name, BAD_CAST"feed") == 0) { feedFormat = FORMAT_ATOM; } else { result = RS_FEED_ERRORSTATE_PROCESS_UNKNOWN_FORMAT; error = "Only RSS, RDF or ATOM supported"; } if (result == RS_FEED_ERRORSTATE_OK) { xmlNodePtr channel = NULL; switch (feedFormat) { case FORMAT_RSS: case FORMAT_RDF: channel = xml.findNode(root->children, "channel"); break; case FORMAT_ATOM: channel = root; break; } if (channel) { /* import header info */ if (feed.flag & RS_FEED_FLAG_INFO_FROM_FEED) { std::string title; if (xml.getChildText(channel, "title", title) && !title.empty()) { std::string::size_type p; while ((p = title.find_first_of("\r\n")) != std::string::npos) { title.erase(p, 1); } std::string description; xml.getChildText(channel, (feedFormat == FORMAT_ATOM) ? 
"subtitle" : "description", description); mFeedReader->setFeedInfo(feed.feedId, title, description); } } /* get item count */ xmlNodePtr node; for (node = NULL; (node = getNextItem(feedFormat, channel, node)) != NULL; ) { if (!isRunning()) { break; } std::string title; if (!xml.getChildText(node, "title", title) || title.empty()) { continue; } /* remove newlines */ std::string::size_type p; while ((p = title.find_first_of("\r\n")) != std::string::npos) { title.erase(p, 1); } RsFeedReaderMsg *item = new RsFeedReaderMsg(); item->msgId.clear(); // is calculated later item->feedId = feed.feedId; item->title = title; /* try feedburner:origLink */ if (!xml.getChildText(node, "origLink", item->link) || item->link.empty()) { xml.getChildText(node, "link", item->link); } // remove sid= std::string linkUpper; stringToUpperCase(item->link, linkUpper); std::string::size_type sidStart = linkUpper.find("SID="); if (sidStart != std::string::npos) { std::string::size_type sidEnd1 = linkUpper.find(";", sidStart); std::string::size_type sidEnd2 = linkUpper.find("#", sidStart); if (sidEnd1 == std::string::npos) { sidEnd1 = linkUpper.size(); } if (sidEnd2 == std::string::npos) { sidEnd2 = linkUpper.size(); } if (sidStart > 0 && linkUpper[sidStart - 1] == '&') { sidStart--; } std::string::size_type sidEnd = std::min(sidEnd1, sidEnd2); item->link.erase(sidStart, sidEnd - sidStart); } if (feedFormat == FORMAT_ATOM) { /* ... 
*/ xmlNodePtr author = xml.findNode(node->children, "author", false); if (author) { xml.getChildText(node, "name", item->author); } } else { xml.getChildText(node, "author", item->author); } switch (feedFormat) { case FORMAT_RSS: case FORMAT_RDF: xml.getChildText(node, "description", item->description); break; case FORMAT_ATOM: /* try content */ if (!xml.getChildText(node, "content", item->description)) { /* use summary */ xml.getChildText(node, "summary", item->description); } break; } std::string pubDate; if (xml.getChildText(node, "pubdate", pubDate)) { item->pubDate = parseRFC822Date(pubDate); } if (xml.getChildText(node, "date", pubDate)) { item->pubDate = parseISO8601Date (pubDate); } if (xml.getChildText(node, "updated", pubDate)) { // atom item->pubDate = parseISO8601Date (pubDate); } if (item->pubDate == 0) { /* use current time */ item->pubDate = time(NULL); } entries.push_back(item); } } else { result = RS_FEED_ERRORSTATE_PROCESS_UNKNOWN_FORMAT; error = "Channel not found"; } } } else { result = RS_FEED_ERRORSTATE_PROCESS_UNKNOWN_FORMAT; error = "Can't read document"; } } else { result = RS_FEED_ERRORSTATE_PROCESS_INTERNAL_ERROR; } #ifdef FEEDREADER_DEBUG std::cerr << "p3FeedReaderThread::process - feed " << feed.feedId << " (" << feed.name << "), result " << result << ", error = " << error << std::endl; #endif return result; } std::string p3FeedReaderThread::getProxyForFeed(const RsFeedReaderFeed &feed) { std::string proxy; if (feed.flag & RS_FEED_FLAG_STANDARD_PROXY) { std::string standardProxyAddress; uint16_t standardProxyPort; if (mFeedReader->getStandardProxy(standardProxyAddress, standardProxyPort)) { rs_sprintf(proxy, "%s:%u", standardProxyAddress.c_str(), standardProxyPort); } } else { if (!feed.proxyAddress.empty() && feed.proxyPort) { rs_sprintf(proxy, "%s:%u", feed.proxyAddress.c_str(), feed.proxyPort); } } return proxy; } RsFeedReaderErrorState p3FeedReaderThread::processMsg(const RsFeedReaderFeed &feed, RsFeedReaderMsg *msg, std::string 
&errorString) { long todo_fill_errorString; if (!msg) { return RS_FEED_ERRORSTATE_PROCESS_INTERNAL_ERROR; } RsFeedReaderErrorState result = RS_FEED_ERRORSTATE_OK; std::string proxy = getProxyForFeed(feed); std::string url; if (feed.flag & RS_FEED_FLAG_SAVE_COMPLETE_PAGE) { #ifdef FEEDREADER_DEBUG std::cerr << "p3FeedReaderThread::processHTML - feed " << feed.feedId << " (" << feed.name << ") download page " << msg->link << std::endl; #endif std::string content; CURLWrapper CURL(proxy); CURLcode code = CURL.downloadText(msg->link, content); if (code == CURLE_OK) { long responseCode = CURL.responseCode(); switch (responseCode) { case 200: { std::string contentType = CURL.contentType(); if (isContentType(CURL.contentType(), "text/html")) { /* ok */ msg->description = content; } else { result = RS_FEED_ERRORSTATE_DOWNLOAD_UNKNOWN_CONTENT_TYPE; errorString = contentType; } } break; case 404: result = RS_FEED_ERRORSTATE_DOWNLOAD_NOT_FOUND; break; default: result = RS_FEED_ERRORSTATE_DOWNLOAD_UNKOWN_RESPONSE_CODE; rs_sprintf(errorString, "%ld", responseCode); } } else { result = RS_FEED_ERRORSTATE_DOWNLOAD_ERROR; errorString = curl_easy_strerror(code); } if (result != RS_FEED_ERRORSTATE_OK) { #ifdef FEEDREADER_DEBUG std::cerr << "p3FeedReaderThread::processHTML - feed " << feed.feedId << " (" << feed.name << ") cannot download page, CURLCode = " << code << ", error = " << errorString << std::endl; #endif return result; } /* get effective url (moved location) */ std::string effectiveUrl = CURL.effectiveUrl(); url = getBaseLink(effectiveUrl.empty() ? 
msg->link : effectiveUrl); } /* check if string contains xml chars (very simple test) */ if (msg->description.find('<') == std::string::npos) { return result; } /* process description */ long todo; // encoding HTMLWrapper html; if (html.readHTML(msg->description.c_str(), url.c_str())) { xmlNodePtr root = html.getRootElement(); if (root) { std::list nodesToDelete; /* process all children */ std::list nodes; nodes.push_back(root); while (!nodes.empty()) { if (!isRunning()) { break; } xmlNodePtr node = nodes.front(); nodes.pop_front(); switch (node->type) { case XML_ELEMENT_NODE: if (xmlStrcasecmp(node->name, BAD_CAST"img") == 0) { /* process images */ if ((feed.flag & RS_FEED_FLAG_EMBED_IMAGES) == 0) { /* remove image */ xmlUnlinkNode(node); nodesToDelete.push_back(node); continue; } } else if (xmlStrcasecmp(node->name, BAD_CAST"script") == 0) { /* remove script */ xmlUnlinkNode(node); nodesToDelete.push_back(node); continue; } xmlNodePtr child; for (child = node->children; child; child = child->next) { nodes.push_back(child); } break; case XML_TEXT_NODE: { /* check for only space */ std::string content; if (html.getContent(node, content, false)) { std::string newContent = content; /* trim */ XMLWrapper::trimString(newContent); if (newContent.empty()) { xmlUnlinkNode(node); nodesToDelete.push_back(node); } else { if (content != newContent) { html.setContent(node, newContent.c_str()); } } } } break; case XML_COMMENT_NODE: // xmlUnlinkNode(node); // nodesToDelete.push_back(node); break; case XML_ATTRIBUTE_NODE: case XML_CDATA_SECTION_NODE: case XML_ENTITY_REF_NODE: case XML_ENTITY_NODE: case XML_PI_NODE: case XML_DOCUMENT_NODE: case XML_DOCUMENT_TYPE_NODE: case XML_DOCUMENT_FRAG_NODE: case XML_NOTATION_NODE: case XML_HTML_DOCUMENT_NODE: case XML_DTD_NODE: case XML_ELEMENT_DECL: case XML_ATTRIBUTE_DECL: case XML_ENTITY_DECL: case XML_NAMESPACE_DECL: case XML_XINCLUDE_START: case XML_XINCLUDE_END: #ifdef LIBXML_DOCB_ENABLED case XML_DOCB_DOCUMENT_NODE: #endif break; } } 
std::list::iterator nodeIt; for (nodeIt = nodesToDelete.begin(); nodeIt != nodesToDelete.end(); ++nodeIt) { xmlFreeNode(*nodeIt); } nodesToDelete.clear(); if (!feed.preview) { result = processXPath(feed.xpathsToUse.ids, feed.xpathsToRemove.ids, html, errorString); } if (result == RS_FEED_ERRORSTATE_OK) { unsigned int xpathCount; unsigned int xpathIndex; XPathWrapper *xpath = html.createXPath(); if (xpath) { /* process images */ if (xpath->compile("//img")) { xpathCount = xpath->count(); for (xpathIndex = 0; xpathIndex < xpathCount; ++xpathIndex) { xmlNodePtr node = xpath->node(xpathIndex); if (node->type == XML_ELEMENT_NODE) { bool removeImage = true; if (feed.flag & RS_FEED_FLAG_EMBED_IMAGES) { /* embed image */ std::string src = html.getAttr(node, "src"); if (!src.empty()) { /* download image */ #ifdef FEEDREADER_DEBUG std::cerr << "p3FeedReaderThread::processHTML - feed " << feed.feedId << " (" << feed.name << ") download image " << src << std::endl; #endif std::vector data; CURLWrapper CURL(proxy); CURLcode code = CURL.downloadBinary(calculateLink(url, src), data); if (code == CURLE_OK && CURL.responseCode() == 200) { std::string contentType = CURL.contentType(); if (isContentType(contentType, "image/")) { std::string base64; if (toBase64(data, base64)) { std::string imageBase64; rs_sprintf(imageBase64, "data:%s;base64,%s", contentType.c_str(), base64.c_str()); if (html.setAttr(node, "src", imageBase64.c_str())) { removeImage = false; } } } } } } if (removeImage) { /* remove image */ xmlUnlinkNode(node); nodesToDelete.push_back(node); continue; } } } } else { // unable to compile xpath expression result = RS_FEED_ERRORSTATE_PROCESS_XPATH_INTERNAL_ERROR; } delete(xpath); xpath = NULL; } else { // unable to create xpath object result = RS_FEED_ERRORSTATE_PROCESS_XPATH_INTERNAL_ERROR; std::cerr << "p3FeedReaderThread::process - feed " << feed.feedId << " (" << feed.name << "), unable to create xpath object" << std::endl; } } for (nodeIt = nodesToDelete.begin(); 
nodeIt != nodesToDelete.end(); ++nodeIt) { xmlFreeNode(*nodeIt); } nodesToDelete.clear(); if (result == RS_FEED_ERRORSTATE_OK) { if (isRunning()) { if (!html.saveHTML(msg->description)) { #ifdef FEEDREADER_DEBUG std::cerr << "p3FeedReaderThread::processHTML - feed " << feed.feedId << " (" << feed.name << ") cannot dump html" << std::endl; #endif result = RS_FEED_ERRORSTATE_PROCESS_INTERNAL_ERROR; } } } } else { #ifdef FEEDREADER_DEBUG std::cerr << "p3FeedReaderThread::processHTML - feed " << feed.feedId << " (" << feed.name << ") no root element" << std::endl; #endif result = RS_FEED_ERRORSTATE_PROCESS_HTML_ERROR; } } else { #ifdef FEEDREADER_DEBUG std::cerr << "p3FeedReaderThread::processHTML - feed " << feed.feedId << " (" << feed.name << ") cannot read html" << std::endl; #endif result = RS_FEED_ERRORSTATE_PROCESS_HTML_ERROR; } return result; } RsFeedReaderErrorState p3FeedReaderThread::processXPath(const std::list &xpathsToUse, const std::list &xpathsToRemove, HTMLWrapper &html, std::string &errorString) { long todo_fill_errorString; if (xpathsToUse.empty() && xpathsToRemove.empty()) { return RS_FEED_ERRORSTATE_OK; } XPathWrapper *xpath = html.createXPath(); if (xpath == NULL) { // unable to create xpath object std::cerr << "p3FeedReaderThread::processXPath - unable to create xpath object" << std::endl; return RS_FEED_ERRORSTATE_PROCESS_XPATH_INTERNAL_ERROR; } RsFeedReaderErrorState result = RS_FEED_ERRORSTATE_OK; unsigned int xpathCount; unsigned int xpathIndex; std::list::const_iterator xpathIt; if (!xpathsToUse.empty()) { HTMLWrapper htmlNew; if (htmlNew.createHTML()) { xmlNodePtr body = htmlNew.getBody(); if (body) { /* process use list */ for (xpathIt = xpathsToUse.begin(); xpathIt != xpathsToUse.end(); ++xpathIt) { if (xpath->compile(xpathIt->c_str())) { xpathCount = xpath->count(); if (xpathCount) { for (xpathIndex = 0; xpathIndex < xpathCount; ++xpathIndex) { xmlNodePtr node = xpath->node(xpathIndex); xmlUnlinkNode(node); xmlAddChild(body, node); } } 
else { result = RS_FEED_ERRORSTATE_PROCESS_XPATH_NO_RESULT; errorString = *xpathIt; break; } } else { // unable to process xpath expression #ifdef FEEDREADER_DEBUG std::cerr << "p3FeedReaderThread::processXPath - unable to process xpath expression" << std::endl; #endif errorString = *xpathIt; result = RS_FEED_ERRORSTATE_PROCESS_XPATH_WRONG_EXPRESSION; } } } else { result = RS_FEED_ERRORSTATE_PROCESS_HTML_ERROR; } } else { result = RS_FEED_ERRORSTATE_PROCESS_HTML_ERROR; } if (result == RS_FEED_ERRORSTATE_OK) { html = htmlNew; } } if (result == RS_FEED_ERRORSTATE_OK) { std::list nodesToDelete; /* process remove list */ for (xpathIt = xpathsToRemove.begin(); xpathIt != xpathsToRemove.end(); ++xpathIt) { if (xpath->compile(xpathIt->c_str())) { xpathCount = xpath->count(); if (xpathCount) { for (xpathIndex = 0; xpathIndex < xpathCount; ++xpathIndex) { xmlNodePtr node = xpath->node(xpathIndex); xmlUnlinkNode(node); nodesToDelete.push_back(node); } } else { result = RS_FEED_ERRORSTATE_PROCESS_XPATH_NO_RESULT; errorString = *xpathIt; break; } } else { // unable to process xpath expression #ifdef FEEDREADER_DEBUG std::cerr << "p3FeedReaderThread::processXPath - unable to process xpath expression" << std::endl; #endif errorString = *xpathIt; return RS_FEED_ERRORSTATE_PROCESS_XPATH_WRONG_EXPRESSION; } } std::list::iterator nodeIt; for (nodeIt = nodesToDelete.begin(); nodeIt != nodesToDelete.end(); ++nodeIt) { xmlFreeNode(*nodeIt); } nodesToDelete.clear(); } return result; } RsFeedReaderErrorState p3FeedReaderThread::processXPath(const std::list &xpathsToUse, const std::list &xpathsToRemove, std::string &description, std::string &errorString) { if (xpathsToUse.empty() && xpathsToRemove.empty()) { return RS_FEED_ERRORSTATE_OK; } RsFeedReaderErrorState result = RS_FEED_ERRORSTATE_OK; long todo_fill_errorString; /* process description */ long todo; // encoding HTMLWrapper html; if (html.readHTML(description.c_str(), "")) { xmlNodePtr root = html.getRootElement(); if (root) { 
result = processXPath(xpathsToUse, xpathsToRemove, html, errorString); if (result == RS_FEED_ERRORSTATE_OK) { if (!html.saveHTML(description)) { #ifdef FEEDREADER_DEBUG std::cerr << "p3FeedReaderThread::processXPath - cannot dump html" << std::endl; #endif result = RS_FEED_ERRORSTATE_PROCESS_INTERNAL_ERROR; } } } else { #ifdef FEEDREADER_DEBUG std::cerr << "p3FeedReaderThread::processXPath - no root element" << std::endl; #endif result = RS_FEED_ERRORSTATE_PROCESS_HTML_ERROR; } } else { #ifdef FEEDREADER_DEBUG std::cerr << "p3FeedReaderThread::processXPath - cannot read html" << std::endl; #endif result = RS_FEED_ERRORSTATE_PROCESS_HTML_ERROR; } return result; }