http_client: rewrite header parsing manually for speed

boost::regex is stupendously atrocious at parsing malformed data
This commit is contained in:
moneromooo-monero 2017-12-12 13:44:11 +00:00
parent ec724eb64a
commit bd1f6029a3
No known key found for this signature in database
GPG Key ID: 686F07454D6CEFC3

View File

@ -27,6 +27,7 @@
#pragma once
#include <ctype.h>
#include <boost/shared_ptr.hpp>
#include <boost/regex.hpp>
#include <boost/lexical_cast.hpp>
@ -752,87 +753,107 @@ namespace net_utils
return true;
}
//---------------------------------------------------------------------------
inline
bool parse_header(http_header_info& body_info, const std::string& m_cache_to_process)
{
MTRACE("http_stream_filter::parse_cached_header(*)");
STATIC_REGEXP_EXPR_1(rexp_mach_field,
"\n?((Connection)|(Referer)|(Content-Length)|(Content-Type)|(Transfer-Encoding)|(Content-Encoding)|(Host)|(Cookie)|(User-Agent)|(Origin)"
// 12 3 4 5 6 7 8 9 10 11
"|([\\w-]+?)) ?: ?((.*?)(\r?\n))[^\t ]",
//12 13 14 15
boost::regex::icase | boost::regex::normal);
boost::smatch result;
std::string::const_iterator it_current_bound = m_cache_to_process.begin();
std::string::const_iterator it_end_bound = m_cache_to_process.end();
//lookup all fields and fill well-known fields
while( boost::regex_search( it_current_bound, it_end_bound, result, rexp_mach_field, boost::match_default) && result[0].matched)
{
const size_t field_val = 14;
//const size_t field_etc_name = 11;
int i = 2; //start position = 2
if(result[i++].matched)//"Connection"
body_info.m_connection = result[field_val];
else if(result[i++].matched)//"Referrer"
body_info.m_referer = result[field_val];
else if(result[i++].matched)//"Content-Length"
body_info.m_content_length = result[field_val];
else if(result[i++].matched)//"Content-Type"
body_info.m_content_type = result[field_val];
else if(result[i++].matched)//"Transfer-Encoding"
body_info.m_transfer_encoding = result[field_val];
else if(result[i++].matched)//"Content-Encoding"
body_info.m_content_encoding = result[field_val];
else if(result[i++].matched)//"Host"
{ body_info.m_host = result[field_val];
string_tools::trim(body_info.m_host);
}
else if(result[i++].matched)//"Cookie"
body_info.m_cookie = result[field_val];
else if(result[i++].matched)//"User-Agent"
body_info.m_user_agent = result[field_val];
else if(result[i++].matched)//"Origin"
body_info.m_origin = result[field_val];
else if(result[i++].matched)//e.t.c (HAVE TO BE MATCHED!)
body_info.m_etc_fields.emplace_back(result[12], result[field_val]);
else
{CHECK_AND_ASSERT_MES(false, false, "http_stream_filter::parse_cached_header() not matched last entry in:"<<m_cache_to_process);}
it_current_bound = result[(int)result.size()-1]. first;
}
return true;
}
inline
bool analize_first_response_line()
inline bool parse_header(http_header_info& body_info, const std::string& m_cache_to_process)
{
MTRACE("http_stream_filter::parse_cached_header(*)");
//First line response, look like this: "HTTP/1.1 200 OK"
STATIC_REGEXP_EXPR_1(rexp_match_first_response_line, "^HTTP/(\\d+).(\\d+) ((\\d)\\d{2})( [^\n]*)?\r?\n", boost::regex::icase | boost::regex::normal);
// 1 2 34 5
//size_t match_len = 0;
boost::smatch result;
if(boost::regex_search( m_header_cache, result, rexp_match_first_response_line, boost::match_default) && result[0].matched)
const char *ptr = m_cache_to_process.c_str();
while (ptr[0] != '\r' || ptr[1] != '\n')
{
CHECK_AND_ASSERT_MES(result[1].matched&&result[2].matched, false, "http_stream_filter::handle_invoke_reply_line() assert failed...");
m_response_info.m_http_ver_hi = boost::lexical_cast<int>(result[1]);
m_response_info.m_http_ver_lo = boost::lexical_cast<int>(result[2]);
m_response_info.m_response_code = boost::lexical_cast<int>(result[3]);
m_header_cache.erase(to_nonsonst_iterator(m_header_cache, result[0].first), to_nonsonst_iterator(m_header_cache, result[0].second));
return true;
}else
{
LOG_ERROR("http_stream_filter::handle_invoke_reply_line(): Failed to match first response line:" << m_header_cache);
return false;
// optional \n
if (*ptr == '\n')
++ptr;
// an identifier composed of letters or -
const char *key_pos = ptr;
while (isalnum(*ptr) || *ptr == '_' || *ptr == '-')
++ptr;
const char *key_end = ptr;
// optional space (not in RFC, but in previous code)
if (*ptr == ' ')
++ptr;
CHECK_AND_ASSERT_MES(*ptr == ':', true, "http_stream_filter::parse_cached_header() invalid header in: " << m_cache_to_process);
++ptr;
// optional whitespace, but not newlines - line folding is obsolete, let's ignore it
while (isblank(*ptr))
++ptr;
const char *value_pos = ptr;
while (*ptr != '\r' && *ptr != '\n')
++ptr;
const char *value_end = ptr;
// optional trailing whitespace
while (value_end > value_pos && isblank(*(value_end-1)))
--value_end;
if (*ptr == '\r')
++ptr;
CHECK_AND_ASSERT_MES(*ptr == '\n', true, "http_stream_filter::parse_cached_header() invalid header in: " << m_cache_to_process);
++ptr;
const std::string key = std::string(key_pos, key_end - key_pos);
const std::string value = std::string(value_pos, value_end - value_pos);
if (!key.empty())
{
if (!string_tools::compare_no_case(key, "Connection"))
body_info.m_connection = value;
else if(!string_tools::compare_no_case(key, "Referrer"))
body_info.m_referer = value;
else if(!string_tools::compare_no_case(key, "Content-Length"))
body_info.m_content_length = value;
else if(!string_tools::compare_no_case(key, "Content-Type"))
body_info.m_content_type = value;
else if(!string_tools::compare_no_case(key, "Transfer-Encoding"))
body_info.m_transfer_encoding = value;
else if(!string_tools::compare_no_case(key, "Content-Encoding"))
body_info.m_content_encoding = value;
else if(!string_tools::compare_no_case(key, "Host"))
body_info.m_host = value;
else if(!string_tools::compare_no_case(key, "Cookie"))
body_info.m_cookie = value;
else if(!string_tools::compare_no_case(key, "User-Agent"))
body_info.m_user_agent = value;
else if(!string_tools::compare_no_case(key, "Origin"))
body_info.m_origin = value;
else
body_info.m_etc_fields.emplace_back(key, value);
}
}
return true;
}
//---------------------------------------------------------------------------
inline bool analize_first_response_line()
{
//First line response, look like this: "HTTP/1.1 200 OK"
const char *ptr = m_header_cache.c_str();
CHECK_AND_ASSERT_MES(!memcmp(ptr, "HTTP/", 5), false, "Invalid first response line: " + m_header_cache);
ptr += 5;
CHECK_AND_ASSERT_MES(isdigit(*ptr), false, "Invalid first response line: " + m_header_cache);
unsigned long ul;
char *end;
ul = strtoul(ptr, &end, 10);
CHECK_AND_ASSERT_MES(ul <= INT_MAX && *end =='.', false, "Invalid first response line: " + m_header_cache);
m_response_info.m_http_ver_hi = ul;
ptr = end + 1;
CHECK_AND_ASSERT_MES(isdigit(*ptr), false, "Invalid first response line: " + m_header_cache + ", ptr: " << ptr);
ul = strtoul(ptr, &end, 10);
CHECK_AND_ASSERT_MES(ul <= INT_MAX && isblank(*end), false, "Invalid first response line: " + m_header_cache + ", ptr: " << ptr);
m_response_info.m_http_ver_lo = ul;
ptr = end + 1;
while (isblank(*ptr))
++ptr;
CHECK_AND_ASSERT_MES(isdigit(*ptr), false, "Invalid first response line: " + m_header_cache);
ul = strtoul(ptr, &end, 10);
CHECK_AND_ASSERT_MES(ul >= 100 && ul <= 999 && isspace(*end), false, "Invalid first response line: " + m_header_cache);
m_response_info.m_response_code = ul;
ptr = end;
// ignore the optional text, till the end
while (*ptr != '\r' && *ptr != '\n')
++ptr;
if (*ptr == '\r')
++ptr;
CHECK_AND_ASSERT_MES(*ptr == '\n', false, "Invalid first response line: " << m_header_cache);
++ptr;
m_header_cache.erase(0, ptr - m_header_cache.c_str());
return true;
}
inline
bool set_reply_content_encoder()