added rapidjson-1.1.0 code hard-coded in the source directory to allow everyone to compile without the need to tweak too much. When v1.1.0 is mainstream (espcially on ubuntu) we can revert back to an external dependency

This commit is contained in:
csoler 2018-05-02 22:46:27 +02:00
parent 48c4b4c4f8
commit 8e111c2ee2
No known key found for this signature in database
GPG key ID: 7BCA522266C0804C
41 changed files with 14593 additions and 4 deletions

View file

@ -0,0 +1,701 @@
// Tencent is pleased to support the open source community by making RapidJSON available.
//
// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
//
// Licensed under the MIT License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// http://opensource.org/licenses/MIT
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#ifndef RAPIDJSON_INTERNAL_REGEX_H_
#define RAPIDJSON_INTERNAL_REGEX_H_
#include "../allocators.h"
#include "../stream.h"
#include "stack.h"
#ifdef __clang__
RAPIDJSON_DIAG_PUSH
RAPIDJSON_DIAG_OFF(padded)
RAPIDJSON_DIAG_OFF(switch-enum)
RAPIDJSON_DIAG_OFF(implicit-fallthrough)
#endif
#ifdef __GNUC__
RAPIDJSON_DIAG_PUSH
RAPIDJSON_DIAG_OFF(effc++)
#endif
#ifdef _MSC_VER
RAPIDJSON_DIAG_PUSH
RAPIDJSON_DIAG_OFF(4512) // assignment operator could not be generated
#endif
#ifndef RAPIDJSON_REGEX_VERBOSE
#define RAPIDJSON_REGEX_VERBOSE 0
#endif
RAPIDJSON_NAMESPACE_BEGIN
namespace internal {
///////////////////////////////////////////////////////////////////////////////
// GenericRegex
static const SizeType kRegexInvalidState = ~SizeType(0); //!< Represents an invalid index in GenericRegex::State::out, out1
static const SizeType kRegexInvalidRange = ~SizeType(0);
//! Regular expression engine with subset of ECMAscript grammar.
/*!
Supported regular expression syntax:
- \c ab Concatenation
- \c a|b Alternation
- \c a? Zero or one
- \c a* Zero or more
- \c a+ One or more
- \c a{3} Exactly 3 times
- \c a{3,} At least 3 times
- \c a{3,5} 3 to 5 times
- \c (ab) Grouping
- \c ^a At the beginning
- \c a$ At the end
- \c . Any character
- \c [abc] Character classes
- \c [a-c] Character class range
- \c [a-z0-9_] Character class combination
- \c [^abc] Negated character classes
- \c [^a-c] Negated character class range
- \c [\b] Backspace (U+0008)
- \c \\| \\\\ ... Escape characters
- \c \\f Form feed (U+000C)
- \c \\n Line feed (U+000A)
- \c \\r Carriage return (U+000D)
- \c \\t Tab (U+0009)
- \c \\v Vertical tab (U+000B)
\note This is a Thompson NFA engine, implemented with reference to
Cox, Russ. "Regular Expression Matching Can Be Simple And Fast (but is slow in Java, Perl, PHP, Python, Ruby,...).",
https://swtch.com/~rsc/regexp/regexp1.html
*/
template <typename Encoding, typename Allocator = CrtAllocator>
class GenericRegex {
public:
typedef typename Encoding::Ch Ch;
GenericRegex(const Ch* source, Allocator* allocator = 0) :
states_(allocator, 256), ranges_(allocator, 256), root_(kRegexInvalidState), stateCount_(), rangeCount_(),
stateSet_(), state0_(allocator, 0), state1_(allocator, 0), anchorBegin_(), anchorEnd_()
{
GenericStringStream<Encoding> ss(source);
DecodedStream<GenericStringStream<Encoding> > ds(ss);
Parse(ds);
}
~GenericRegex() {
Allocator::Free(stateSet_);
}
bool IsValid() const {
return root_ != kRegexInvalidState;
}
template <typename InputStream>
bool Match(InputStream& is) const {
return SearchWithAnchoring(is, true, true);
}
bool Match(const Ch* s) const {
GenericStringStream<Encoding> is(s);
return Match(is);
}
template <typename InputStream>
bool Search(InputStream& is) const {
return SearchWithAnchoring(is, anchorBegin_, anchorEnd_);
}
bool Search(const Ch* s) const {
GenericStringStream<Encoding> is(s);
return Search(is);
}
private:
enum Operator {
kZeroOrOne,
kZeroOrMore,
kOneOrMore,
kConcatenation,
kAlternation,
kLeftParenthesis
};
static const unsigned kAnyCharacterClass = 0xFFFFFFFF; //!< For '.'
static const unsigned kRangeCharacterClass = 0xFFFFFFFE;
static const unsigned kRangeNegationFlag = 0x80000000;
struct Range {
unsigned start; //
unsigned end;
SizeType next;
};
struct State {
SizeType out; //!< Equals to kInvalid for matching state
SizeType out1; //!< Equals to non-kInvalid for split
SizeType rangeStart;
unsigned codepoint;
};
struct Frag {
Frag(SizeType s, SizeType o, SizeType m) : start(s), out(o), minIndex(m) {}
SizeType start;
SizeType out; //!< link-list of all output states
SizeType minIndex;
};
template <typename SourceStream>
class DecodedStream {
public:
DecodedStream(SourceStream& ss) : ss_(ss), codepoint_() { Decode(); }
unsigned Peek() { return codepoint_; }
unsigned Take() {
unsigned c = codepoint_;
if (c) // No further decoding when '\0'
Decode();
return c;
}
private:
void Decode() {
if (!Encoding::Decode(ss_, &codepoint_))
codepoint_ = 0;
}
SourceStream& ss_;
unsigned codepoint_;
};
State& GetState(SizeType index) {
RAPIDJSON_ASSERT(index < stateCount_);
return states_.template Bottom<State>()[index];
}
const State& GetState(SizeType index) const {
RAPIDJSON_ASSERT(index < stateCount_);
return states_.template Bottom<State>()[index];
}
Range& GetRange(SizeType index) {
RAPIDJSON_ASSERT(index < rangeCount_);
return ranges_.template Bottom<Range>()[index];
}
const Range& GetRange(SizeType index) const {
RAPIDJSON_ASSERT(index < rangeCount_);
return ranges_.template Bottom<Range>()[index];
}
template <typename InputStream>
void Parse(DecodedStream<InputStream>& ds) {
Allocator allocator;
Stack<Allocator> operandStack(&allocator, 256); // Frag
Stack<Allocator> operatorStack(&allocator, 256); // Operator
Stack<Allocator> atomCountStack(&allocator, 256); // unsigned (Atom per parenthesis)
*atomCountStack.template Push<unsigned>() = 0;
unsigned codepoint;
while (ds.Peek() != 0) {
switch (codepoint = ds.Take()) {
case '^':
anchorBegin_ = true;
break;
case '$':
anchorEnd_ = true;
break;
case '|':
while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() < kAlternation)
if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
return;
*operatorStack.template Push<Operator>() = kAlternation;
*atomCountStack.template Top<unsigned>() = 0;
break;
case '(':
*operatorStack.template Push<Operator>() = kLeftParenthesis;
*atomCountStack.template Push<unsigned>() = 0;
break;
case ')':
while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() != kLeftParenthesis)
if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
return;
if (operatorStack.Empty())
return;
operatorStack.template Pop<Operator>(1);
atomCountStack.template Pop<unsigned>(1);
ImplicitConcatenation(atomCountStack, operatorStack);
break;
case '?':
if (!Eval(operandStack, kZeroOrOne))
return;
break;
case '*':
if (!Eval(operandStack, kZeroOrMore))
return;
break;
case '+':
if (!Eval(operandStack, kOneOrMore))
return;
break;
case '{':
{
unsigned n, m;
if (!ParseUnsigned(ds, &n))
return;
if (ds.Peek() == ',') {
ds.Take();
if (ds.Peek() == '}')
m = kInfinityQuantifier;
else if (!ParseUnsigned(ds, &m) || m < n)
return;
}
else
m = n;
if (!EvalQuantifier(operandStack, n, m) || ds.Peek() != '}')
return;
ds.Take();
}
break;
case '.':
PushOperand(operandStack, kAnyCharacterClass);
ImplicitConcatenation(atomCountStack, operatorStack);
break;
case '[':
{
SizeType range;
if (!ParseRange(ds, &range))
return;
SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, kRangeCharacterClass);
GetState(s).rangeStart = range;
*operandStack.template Push<Frag>() = Frag(s, s, s);
}
ImplicitConcatenation(atomCountStack, operatorStack);
break;
case '\\': // Escape character
if (!CharacterEscape(ds, &codepoint))
return; // Unsupported escape character
// fall through to default
default: // Pattern character
PushOperand(operandStack, codepoint);
ImplicitConcatenation(atomCountStack, operatorStack);
}
}
while (!operatorStack.Empty())
if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
return;
// Link the operand to matching state.
if (operandStack.GetSize() == sizeof(Frag)) {
Frag* e = operandStack.template Pop<Frag>(1);
Patch(e->out, NewState(kRegexInvalidState, kRegexInvalidState, 0));
root_ = e->start;
#if RAPIDJSON_REGEX_VERBOSE
printf("root: %d\n", root_);
for (SizeType i = 0; i < stateCount_ ; i++) {
State& s = GetState(i);
printf("[%2d] out: %2d out1: %2d c: '%c'\n", i, s.out, s.out1, (char)s.codepoint);
}
printf("\n");
#endif
}
// Preallocate buffer for SearchWithAnchoring()
RAPIDJSON_ASSERT(stateSet_ == 0);
if (stateCount_ > 0) {
stateSet_ = static_cast<unsigned*>(states_.GetAllocator().Malloc(GetStateSetSize()));
state0_.template Reserve<SizeType>(stateCount_);
state1_.template Reserve<SizeType>(stateCount_);
}
}
SizeType NewState(SizeType out, SizeType out1, unsigned codepoint) {
State* s = states_.template Push<State>();
s->out = out;
s->out1 = out1;
s->codepoint = codepoint;
s->rangeStart = kRegexInvalidRange;
return stateCount_++;
}
void PushOperand(Stack<Allocator>& operandStack, unsigned codepoint) {
SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint);
*operandStack.template Push<Frag>() = Frag(s, s, s);
}
void ImplicitConcatenation(Stack<Allocator>& atomCountStack, Stack<Allocator>& operatorStack) {
if (*atomCountStack.template Top<unsigned>())
*operatorStack.template Push<Operator>() = kConcatenation;
(*atomCountStack.template Top<unsigned>())++;
}
SizeType Append(SizeType l1, SizeType l2) {
SizeType old = l1;
while (GetState(l1).out != kRegexInvalidState)
l1 = GetState(l1).out;
GetState(l1).out = l2;
return old;
}
void Patch(SizeType l, SizeType s) {
for (SizeType next; l != kRegexInvalidState; l = next) {
next = GetState(l).out;
GetState(l).out = s;
}
}
bool Eval(Stack<Allocator>& operandStack, Operator op) {
switch (op) {
case kConcatenation:
RAPIDJSON_ASSERT(operandStack.GetSize() >= sizeof(Frag) * 2);
{
Frag e2 = *operandStack.template Pop<Frag>(1);
Frag e1 = *operandStack.template Pop<Frag>(1);
Patch(e1.out, e2.start);
*operandStack.template Push<Frag>() = Frag(e1.start, e2.out, Min(e1.minIndex, e2.minIndex));
}
return true;
case kAlternation:
if (operandStack.GetSize() >= sizeof(Frag) * 2) {
Frag e2 = *operandStack.template Pop<Frag>(1);
Frag e1 = *operandStack.template Pop<Frag>(1);
SizeType s = NewState(e1.start, e2.start, 0);
*operandStack.template Push<Frag>() = Frag(s, Append(e1.out, e2.out), Min(e1.minIndex, e2.minIndex));
return true;
}
return false;
case kZeroOrOne:
if (operandStack.GetSize() >= sizeof(Frag)) {
Frag e = *operandStack.template Pop<Frag>(1);
SizeType s = NewState(kRegexInvalidState, e.start, 0);
*operandStack.template Push<Frag>() = Frag(s, Append(e.out, s), e.minIndex);
return true;
}
return false;
case kZeroOrMore:
if (operandStack.GetSize() >= sizeof(Frag)) {
Frag e = *operandStack.template Pop<Frag>(1);
SizeType s = NewState(kRegexInvalidState, e.start, 0);
Patch(e.out, s);
*operandStack.template Push<Frag>() = Frag(s, s, e.minIndex);
return true;
}
return false;
default:
RAPIDJSON_ASSERT(op == kOneOrMore);
if (operandStack.GetSize() >= sizeof(Frag)) {
Frag e = *operandStack.template Pop<Frag>(1);
SizeType s = NewState(kRegexInvalidState, e.start, 0);
Patch(e.out, s);
*operandStack.template Push<Frag>() = Frag(e.start, s, e.minIndex);
return true;
}
return false;
}
}
bool EvalQuantifier(Stack<Allocator>& operandStack, unsigned n, unsigned m) {
RAPIDJSON_ASSERT(n <= m);
RAPIDJSON_ASSERT(operandStack.GetSize() >= sizeof(Frag));
if (n == 0) {
if (m == 0) // a{0} not support
return false;
else if (m == kInfinityQuantifier)
Eval(operandStack, kZeroOrMore); // a{0,} -> a*
else {
Eval(operandStack, kZeroOrOne); // a{0,5} -> a?
for (unsigned i = 0; i < m - 1; i++)
CloneTopOperand(operandStack); // a{0,5} -> a? a? a? a? a?
for (unsigned i = 0; i < m - 1; i++)
Eval(operandStack, kConcatenation); // a{0,5} -> a?a?a?a?a?
}
return true;
}
for (unsigned i = 0; i < n - 1; i++) // a{3} -> a a a
CloneTopOperand(operandStack);
if (m == kInfinityQuantifier)
Eval(operandStack, kOneOrMore); // a{3,} -> a a a+
else if (m > n) {
CloneTopOperand(operandStack); // a{3,5} -> a a a a
Eval(operandStack, kZeroOrOne); // a{3,5} -> a a a a?
for (unsigned i = n; i < m - 1; i++)
CloneTopOperand(operandStack); // a{3,5} -> a a a a? a?
for (unsigned i = n; i < m; i++)
Eval(operandStack, kConcatenation); // a{3,5} -> a a aa?a?
}
for (unsigned i = 0; i < n - 1; i++)
Eval(operandStack, kConcatenation); // a{3} -> aaa, a{3,} -> aaa+, a{3.5} -> aaaa?a?
return true;
}
static SizeType Min(SizeType a, SizeType b) { return a < b ? a : b; }
void CloneTopOperand(Stack<Allocator>& operandStack) {
const Frag src = *operandStack.template Top<Frag>(); // Copy constructor to prevent invalidation
SizeType count = stateCount_ - src.minIndex; // Assumes top operand contains states in [src->minIndex, stateCount_)
State* s = states_.template Push<State>(count);
memcpy(s, &GetState(src.minIndex), count * sizeof(State));
for (SizeType j = 0; j < count; j++) {
if (s[j].out != kRegexInvalidState)
s[j].out += count;
if (s[j].out1 != kRegexInvalidState)
s[j].out1 += count;
}
*operandStack.template Push<Frag>() = Frag(src.start + count, src.out + count, src.minIndex + count);
stateCount_ += count;
}
template <typename InputStream>
bool ParseUnsigned(DecodedStream<InputStream>& ds, unsigned* u) {
unsigned r = 0;
if (ds.Peek() < '0' || ds.Peek() > '9')
return false;
while (ds.Peek() >= '0' && ds.Peek() <= '9') {
if (r >= 429496729 && ds.Peek() > '5') // 2^32 - 1 = 4294967295
return false; // overflow
r = r * 10 + (ds.Take() - '0');
}
*u = r;
return true;
}
template <typename InputStream>
bool ParseRange(DecodedStream<InputStream>& ds, SizeType* range) {
bool isBegin = true;
bool negate = false;
int step = 0;
SizeType start = kRegexInvalidRange;
SizeType current = kRegexInvalidRange;
unsigned codepoint;
while ((codepoint = ds.Take()) != 0) {
if (isBegin) {
isBegin = false;
if (codepoint == '^') {
negate = true;
continue;
}
}
switch (codepoint) {
case ']':
if (start == kRegexInvalidRange)
return false; // Error: nothing inside []
if (step == 2) { // Add trailing '-'
SizeType r = NewRange('-');
RAPIDJSON_ASSERT(current != kRegexInvalidRange);
GetRange(current).next = r;
}
if (negate)
GetRange(start).start |= kRangeNegationFlag;
*range = start;
return true;
case '\\':
if (ds.Peek() == 'b') {
ds.Take();
codepoint = 0x0008; // Escape backspace character
}
else if (!CharacterEscape(ds, &codepoint))
return false;
// fall through to default
default:
switch (step) {
case 1:
if (codepoint == '-') {
step++;
break;
}
// fall through to step 0 for other characters
case 0:
{
SizeType r = NewRange(codepoint);
if (current != kRegexInvalidRange)
GetRange(current).next = r;
if (start == kRegexInvalidRange)
start = r;
current = r;
}
step = 1;
break;
default:
RAPIDJSON_ASSERT(step == 2);
GetRange(current).end = codepoint;
step = 0;
}
}
}
return false;
}
SizeType NewRange(unsigned codepoint) {
Range* r = ranges_.template Push<Range>();
r->start = r->end = codepoint;
r->next = kRegexInvalidRange;
return rangeCount_++;
}
template <typename InputStream>
bool CharacterEscape(DecodedStream<InputStream>& ds, unsigned* escapedCodepoint) {
unsigned codepoint;
switch (codepoint = ds.Take()) {
case '^':
case '$':
case '|':
case '(':
case ')':
case '?':
case '*':
case '+':
case '.':
case '[':
case ']':
case '{':
case '}':
case '\\':
*escapedCodepoint = codepoint; return true;
case 'f': *escapedCodepoint = 0x000C; return true;
case 'n': *escapedCodepoint = 0x000A; return true;
case 'r': *escapedCodepoint = 0x000D; return true;
case 't': *escapedCodepoint = 0x0009; return true;
case 'v': *escapedCodepoint = 0x000B; return true;
default:
return false; // Unsupported escape character
}
}
template <typename InputStream>
bool SearchWithAnchoring(InputStream& is, bool anchorBegin, bool anchorEnd) const {
RAPIDJSON_ASSERT(IsValid());
DecodedStream<InputStream> ds(is);
state0_.Clear();
Stack<Allocator> *current = &state0_, *next = &state1_;
const size_t stateSetSize = GetStateSetSize();
std::memset(stateSet_, 0, stateSetSize);
bool matched = AddState(*current, root_);
unsigned codepoint;
while (!current->Empty() && (codepoint = ds.Take()) != 0) {
std::memset(stateSet_, 0, stateSetSize);
next->Clear();
matched = false;
for (const SizeType* s = current->template Bottom<SizeType>(); s != current->template End<SizeType>(); ++s) {
const State& sr = GetState(*s);
if (sr.codepoint == codepoint ||
sr.codepoint == kAnyCharacterClass ||
(sr.codepoint == kRangeCharacterClass && MatchRange(sr.rangeStart, codepoint)))
{
matched = AddState(*next, sr.out) || matched;
if (!anchorEnd && matched)
return true;
}
if (!anchorBegin)
AddState(*next, root_);
}
internal::Swap(current, next);
}
return matched;
}
size_t GetStateSetSize() const {
return (stateCount_ + 31) / 32 * 4;
}
// Return whether the added states is a match state
bool AddState(Stack<Allocator>& l, SizeType index) const {
RAPIDJSON_ASSERT(index != kRegexInvalidState);
const State& s = GetState(index);
if (s.out1 != kRegexInvalidState) { // Split
bool matched = AddState(l, s.out);
return AddState(l, s.out1) || matched;
}
else if (!(stateSet_[index >> 5] & (1 << (index & 31)))) {
stateSet_[index >> 5] |= (1 << (index & 31));
*l.template PushUnsafe<SizeType>() = index;
}
return s.out == kRegexInvalidState; // by using PushUnsafe() above, we can ensure s is not validated due to reallocation.
}
bool MatchRange(SizeType rangeIndex, unsigned codepoint) const {
bool yes = (GetRange(rangeIndex).start & kRangeNegationFlag) == 0;
while (rangeIndex != kRegexInvalidRange) {
const Range& r = GetRange(rangeIndex);
if (codepoint >= (r.start & ~kRangeNegationFlag) && codepoint <= r.end)
return yes;
rangeIndex = r.next;
}
return !yes;
}
Stack<Allocator> states_;
Stack<Allocator> ranges_;
SizeType root_;
SizeType stateCount_;
SizeType rangeCount_;
static const unsigned kInfinityQuantifier = ~0u;
// For SearchWithAnchoring()
uint32_t* stateSet_; // allocated by states_.GetAllocator()
mutable Stack<Allocator> state0_;
mutable Stack<Allocator> state1_;
bool anchorBegin_;
bool anchorEnd_;
};
typedef GenericRegex<UTF8<> > Regex;
} // namespace internal
RAPIDJSON_NAMESPACE_END
#ifdef __clang__
RAPIDJSON_DIAG_POP
#endif
#ifdef _MSC_VER
RAPIDJSON_DIAG_POP
#endif
#endif // RAPIDJSON_INTERNAL_REGEX_H_