// // peglib.h // // Copyright (c) 2022 Yuji Hirose. All rights reserved. // MIT License // #pragma once /* * Configuration */ #ifndef CPPPEGLIB_HEURISTIC_ERROR_TOKEN_MAX_CHAR_COUNT #define CPPPEGLIB_HEURISTIC_ERROR_TOKEN_MAX_CHAR_COUNT 32 #endif #include #include #include #include #if __has_include() #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if !defined(__cplusplus) || __cplusplus < 201703L #error "Requires complete C++17 support" #endif namespace peg { /*----------------------------------------------------------------------------- * scope_exit *---------------------------------------------------------------------------*/ // This is based on // "http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2014/n4189". template struct scope_exit { explicit scope_exit(EF &&f) : exit_function(std::move(f)), execute_on_destruction{true} {} scope_exit(scope_exit &&rhs) : exit_function(std::move(rhs.exit_function)), execute_on_destruction{rhs.execute_on_destruction} { rhs.release(); } ~scope_exit() { if (execute_on_destruction) { this->exit_function(); } } void release() { this->execute_on_destruction = false; } private: scope_exit(const scope_exit &) = delete; void operator=(const scope_exit &) = delete; scope_exit &operator=(scope_exit &&) = delete; EF exit_function; bool execute_on_destruction; }; /*----------------------------------------------------------------------------- * UTF8 functions *---------------------------------------------------------------------------*/ inline size_t codepoint_length(const char *s8, size_t l) { if (l) { auto b = static_cast(s8[0]); if ((b & 0x80) == 0) { return 1; } else if ((b & 0xE0) == 0xC0 && l >= 2) { return 2; } else if ((b & 0xF0) == 0xE0 && l >= 3) { return 3; } else if ((b & 0xF8) == 0xF0 && l >= 4) { return 4; } } return 0; } inline size_t codepoint_count(const char *s8, size_t l) { size_t count = 0; for (size_t i = 0; i < l; i += codepoint_length(s8 + i, l - i)) { count++; } return count; } inline size_t encode_codepoint(char32_t cp, char *buff) { if (cp < 0x0080) { buff[0] = static_cast(cp & 0x7F); return 1; } else if (cp < 0x0800) { buff[0] = static_cast(0xC0 | ((cp >> 6) & 0x1F)); buff[1] = static_cast(0x80 | (cp & 0x3F)); return 2; } else if (cp < 0xD800) { buff[0] = static_cast(0xE0 | ((cp >> 12) & 0xF)); buff[1] = static_cast(0x80 | ((cp >> 6) & 0x3F)); buff[2] = static_cast(0x80 | (cp & 0x3F)); return 3; } else if (cp < 0xE000) { // D800 - DFFF is invalid... return 0; } else if (cp < 0x10000) { buff[0] = static_cast(0xE0 | ((cp >> 12) & 0xF)); buff[1] = static_cast(0x80 | ((cp >> 6) & 0x3F)); buff[2] = static_cast(0x80 | (cp & 0x3F)); return 3; } else if (cp < 0x110000) { buff[0] = static_cast(0xF0 | ((cp >> 18) & 0x7)); buff[1] = static_cast(0x80 | ((cp >> 12) & 0x3F)); buff[2] = static_cast(0x80 | ((cp >> 6) & 0x3F)); buff[3] = static_cast(0x80 | (cp & 0x3F)); return 4; } return 0; } inline std::string encode_codepoint(char32_t cp) { char buff[4]; auto l = encode_codepoint(cp, buff); return std::string(buff, l); } inline bool decode_codepoint(const char *s8, size_t l, size_t &bytes, char32_t &cp) { if (l) { auto b = static_cast(s8[0]); if ((b & 0x80) == 0) { bytes = 1; cp = b; return true; } else if ((b & 0xE0) == 0xC0) { if (l >= 2) { bytes = 2; cp = ((static_cast(s8[0] & 0x1F)) << 6) | (static_cast(s8[1] & 0x3F)); return true; } } else if ((b & 0xF0) == 0xE0) { if (l >= 3) { bytes = 3; cp = ((static_cast(s8[0] & 0x0F)) << 12) | ((static_cast(s8[1] & 0x3F)) << 6) | (static_cast(s8[2] & 0x3F)); return true; } } else if ((b & 0xF8) == 0xF0) { if (l >= 4) { bytes = 4; cp = ((static_cast(s8[0] & 0x07)) << 18) | ((static_cast(s8[1] & 0x3F)) << 12) | ((static_cast(s8[2] & 0x3F)) << 6) | (static_cast(s8[3] & 0x3F)); return true; } } } return false; } inline size_t decode_codepoint(const char *s8, size_t l, char32_t &cp) { size_t bytes; if (decode_codepoint(s8, l, bytes, cp)) { return bytes; } return 0; } inline char32_t decode_codepoint(const char *s8, size_t l) { char32_t cp = 0; decode_codepoint(s8, l, cp); return cp; } inline std::u32string decode(const char *s8, size_t l) { std::u32string out; size_t i = 0; while (i < l) { auto beg = i++; while (i < l && (s8[i] & 0xc0) == 0x80) { i++; } out += decode_codepoint(&s8[beg], (i - beg)); } return out; } template const char *u8(const T *s) { return reinterpret_cast(s); } /*----------------------------------------------------------------------------- * escape_characters *---------------------------------------------------------------------------*/ inline std::string escape_characters(const char *s, size_t n) { std::string str; for (size_t i = 0; i < n; i++) { auto c = s[i]; switch (c) { case '\f': str += "\\f"; break; case '\n': str += "\\n"; break; case '\r': str += "\\r"; break; case '\t': str += "\\t"; break; case '\v': str += "\\v"; break; default: str += c; break; } } return str; } inline std::string escape_characters(std::string_view sv) { return escape_characters(sv.data(), sv.size()); } /*----------------------------------------------------------------------------- * resolve_escape_sequence *---------------------------------------------------------------------------*/ inline bool is_hex(char c, int &v) { if ('0' <= c && c <= '9') { v = c - '0'; return true; } else if ('a' <= c && c <= 'f') { v = c - 'a' + 10; return true; } else if ('A' <= c && c <= 'F') { v = c - 'A' + 10; return true; } return false; } inline bool is_digit(char c, int &v) { if ('0' <= c && c <= '9') { v = c - '0'; return true; } return false; } inline std::pair parse_hex_number(const char *s, size_t n, size_t i) { int ret = 0; int val; while (i < n && is_hex(s[i], val)) { ret = static_cast(ret * 16 + val); i++; } return std::pair(ret, i); } inline std::pair parse_octal_number(const char *s, size_t n, size_t i) { int ret = 0; int val; while (i < n && is_digit(s[i], val)) { ret = static_cast(ret * 8 + val); i++; } return std::pair(ret, i); } inline std::string resolve_escape_sequence(const char *s, size_t n) { std::string r; r.reserve(n); size_t i = 0; while (i < n) { auto ch = s[i]; if (ch == '\\') { i++; if (i == n) { throw std::runtime_error("Invalid escape sequence..."); } switch (s[i]) { case 'f': r += '\f'; i++; break; case 'n': r += '\n'; i++; break; case 'r': r += '\r'; i++; break; case 't': r += '\t'; i++; break; case 'v': r += '\v'; i++; break; case '\'': r += '\''; i++; break; case '"': r += '"'; i++; break; case '[': r += '['; i++; break; case ']': r += ']'; i++; break; case '\\': r += '\\'; i++; break; case 'x': case 'u': { char32_t cp; std::tie(cp, i) = parse_hex_number(s, n, i + 1); r += encode_codepoint(cp); break; } default: { char32_t cp; std::tie(cp, i) = parse_octal_number(s, n, i); r += encode_codepoint(cp); break; } } } else { r += ch; i++; } } return r; } /*----------------------------------------------------------------------------- * token_to_number_ - This function should be removed eventually *---------------------------------------------------------------------------*/ template T token_to_number_(std::string_view sv) { T n = 0; #if __has_include() if constexpr (!std::is_floating_point::value) { std::from_chars(sv.data(), sv.data() + sv.size(), n); #else if constexpr (false) { #endif } else { auto s = std::string(sv); std::istringstream ss(s); ss >> n; } return n; } /*----------------------------------------------------------------------------- * Trie *---------------------------------------------------------------------------*/ class Trie { public: Trie(const std::vector &items, bool ignore_case) : ignore_case_(ignore_case) { size_t id = 0; for (const auto &item : items) { const auto &s = ignore_case ? to_lower(item) : item; for (size_t len = 1; len <= item.size(); len++) { auto last = len == item.size(); std::string_view sv(s.data(), len); auto it = dic_.find(sv); if (it == dic_.end()) { dic_.emplace(sv, Info{last, last, id}); } else if (last) { it->second.match = true; } else { it->second.done = false; } } id++; } } size_t match(const char *text, size_t text_len, size_t &id) const { std::string lower_text; if (ignore_case_) { lower_text = to_lower(text); text = lower_text.data(); } size_t match_len = 0; auto done = false; size_t len = 1; while (!done && len <= text_len) { std::string_view sv(text, len); auto it = dic_.find(sv); if (it == dic_.end()) { done = true; } else { if (it->second.match) { match_len = len; id = it->second.id; } if (it->second.done) { done = true; } } len += 1; } return match_len; } size_t size() const { return dic_.size(); } private: std::string to_lower(std::string s) const { for (char &c : s) { c = std::tolower(c); } return s; } struct Info { bool done; bool match; size_t id; }; // TODO: Use unordered_map when heterogeneous lookup is supported in C++20 // std::unordered_map dic_; std::map> dic_; bool ignore_case_; }; /*----------------------------------------------------------------------------- * PEG *---------------------------------------------------------------------------*/ /* * Line information utility function */ inline std::pair line_info(const char *start, const char *cur) { auto p = start; auto col_ptr = p; auto no = 1; while (p < cur) { if (*p == '\n') { no++; col_ptr = p + 1; } p++; } auto col = codepoint_count(col_ptr, p - col_ptr) + 1; return std::pair(no, col); } /* * String tag */ inline constexpr unsigned int str2tag_core(const char *s, size_t l, unsigned int h) { return (l == 0) ? h : str2tag_core(s + 1, l - 1, (h * 33) ^ static_cast(*s)); } inline constexpr unsigned int str2tag(std::string_view sv) { return str2tag_core(sv.data(), sv.size(), 0); } namespace udl { inline constexpr unsigned int operator"" _(const char *s, size_t l) { return str2tag_core(s, l, 0); } } // namespace udl /* * Semantic values */ class Context; struct SemanticValues : protected std::vector { SemanticValues() = default; SemanticValues(Context *c) : c_(c) {} // Input text const char *path = nullptr; const char *ss = nullptr; // Matched string std::string_view sv() const { return sv_; } // Definition name const std::string &name() const { return name_; } std::vector tags; // Line number and column at which the matched string is std::pair line_info() const; // Choice count size_t choice_count() const { return choice_count_; } // Choice number (0 based index) size_t choice() const { return choice_; } // Tokens std::vector tokens; std::string_view token(size_t id = 0) const { if (tokens.empty()) { return sv_; } assert(id < tokens.size()); return tokens[id]; } // Token conversion std::string token_to_string(size_t id = 0) const { return std::string(token(id)); } template T token_to_number() const { return token_to_number_(token()); } // Transform the semantic value vector to another vector template std::vector transform(size_t beg = 0, size_t end = static_cast(-1)) const { std::vector r; end = (std::min)(end, size()); for (size_t i = beg; i < end; i++) { r.emplace_back(std::any_cast((*this)[i])); } return r; } void append(SemanticValues &chvs) { sv_ = chvs.sv_; for (auto &v : chvs) { emplace_back(std::move(v)); } for (auto &tag : chvs.tags) { tags.emplace_back(std::move(tag)); } for (auto &tok : chvs.tokens) { tokens.emplace_back(std::move(tok)); } } using std::vector::iterator; using std::vector::const_iterator; using std::vector::size; using std::vector::empty; using std::vector::assign; using std::vector::begin; using std::vector::end; using std::vector::rbegin; using std::vector::rend; using std::vector::operator[]; using std::vector::at; using std::vector::resize; using std::vector::front; using std::vector::back; using std::vector::push_back; using std::vector::pop_back; using std::vector::insert; using std::vector::erase; using std::vector::clear; using std::vector::swap; using std::vector::emplace; using std::vector::emplace_back; private: friend class Context; friend class Dictionary; friend class Sequence; friend class PrioritizedChoice; friend class Repetition; friend class Holder; friend class PrecedenceClimbing; Context *c_ = nullptr; std::string_view sv_; size_t choice_count_ = 0; size_t choice_ = 0; std::string name_; }; /* * Semantic action */ template std::any call(F fn, Args &&...args) { using R = decltype(fn(std::forward(args)...)); if constexpr (std::is_void::value) { fn(std::forward(args)...); return std::any(); } else if constexpr (std::is_same::type, std::any>::value) { return fn(std::forward(args)...); } else { return std::any(fn(std::forward(args)...)); } } template struct argument_count : argument_count {}; template struct argument_count : std::integral_constant {}; template struct argument_count : std::integral_constant {}; template struct argument_count : std::integral_constant {}; class Action { public: Action() = default; Action(Action &&rhs) = default; template Action(F fn) : fn_(make_adaptor(fn)) {} template void operator=(F fn) { fn_ = make_adaptor(fn); } Action &operator=(const Action &rhs) = default; operator bool() const { return bool(fn_); } std::any operator()(SemanticValues &vs, std::any &dt) const { return fn_(vs, dt); } private: using Fty = std::function; template Fty make_adaptor(F fn) { if constexpr (argument_count::value == 1) { return [fn](auto &vs, auto & /*dt*/) { return call(fn, vs); }; } else { return [fn](auto &vs, auto &dt) { return call(fn, vs, dt); }; } } Fty fn_; }; /* * Parse result helper */ inline bool success(size_t len) { return len != static_cast(-1); } inline bool fail(size_t len) { return len == static_cast(-1); } /* * Log */ using Log = std::function; /* * ErrorInfo */ class Definition; struct ErrorInfo { const char *error_pos = nullptr; std::vector> expected_tokens; const char *message_pos = nullptr; std::string message; std::string label; const char *last_output_pos = nullptr; bool keep_previous_token = false; void clear() { error_pos = nullptr; expected_tokens.clear(); message_pos = nullptr; message.clear(); } void add(const char *error_literal, const Definition *error_rule) { for (const auto &[t, r] : expected_tokens) { if (t == error_literal && r == error_rule) { return; } } expected_tokens.emplace_back(error_literal, error_rule); } void output_log(const Log &log, const char *s, size_t n); private: int cast_char(char c) const { return static_cast(c); } std::string heuristic_error_token(const char *s, size_t n, const char *pos) const { auto len = n - std::distance(s, pos); if (len) { size_t i = 0; auto c = cast_char(pos[i++]); if (!std::ispunct(c) && !std::isspace(c)) { while (i < len && !std::ispunct(cast_char(pos[i])) && !std::isspace(cast_char(pos[i]))) { i++; } } size_t count = CPPPEGLIB_HEURISTIC_ERROR_TOKEN_MAX_CHAR_COUNT; size_t j = 0; while (count > 0 && j < i) { j += codepoint_length(&pos[j], i - j); count--; } return escape_characters(pos, j); } return std::string(); } std::string replace_all(std::string str, const std::string &from, const std::string &to) const { size_t pos = 0; while ((pos = str.find(from, pos)) != std::string::npos) { str.replace(pos, from.length(), to); pos += to.length(); } return str; } }; /* * Context */ class Ope; using TracerEnter = std::function; using TracerLeave = std::function; using TracerStartOrEnd = std::function; class Context { public: const char *path; const char *s; const size_t l; ErrorInfo error_info; bool recovered = false; std::vector> value_stack; size_t value_stack_size = 0; std::vector rule_stack; std::vector>> args_stack; size_t in_token_boundary_count = 0; std::shared_ptr whitespaceOpe; bool in_whitespace = false; std::shared_ptr wordOpe; std::vector> capture_scope_stack; size_t capture_scope_stack_size = 0; std::vector cut_stack; const size_t def_count; const bool enablePackratParsing; std::vector cache_registered; std::vector cache_success; std::map, std::tuple> cache_values; TracerEnter tracer_enter; TracerLeave tracer_leave; std::any trace_data; const bool verbose_trace; Log log; Context(const char *path, const char *s, size_t l, size_t def_count, std::shared_ptr whitespaceOpe, std::shared_ptr wordOpe, bool enablePackratParsing, TracerEnter tracer_enter, TracerLeave tracer_leave, std::any trace_data, bool verbose_trace, Log log) : path(path), s(s), l(l), whitespaceOpe(whitespaceOpe), wordOpe(wordOpe), def_count(def_count), enablePackratParsing(enablePackratParsing), cache_registered(enablePackratParsing ? def_count * (l + 1) : 0), cache_success(enablePackratParsing ? def_count * (l + 1) : 0), tracer_enter(tracer_enter), tracer_leave(tracer_leave), trace_data(trace_data), verbose_trace(verbose_trace), log(log) { push_args({}); push_capture_scope(); } ~Context() { pop_capture_scope(); assert(!value_stack_size); assert(!capture_scope_stack_size); assert(cut_stack.empty()); } Context(const Context &) = delete; Context(Context &&) = delete; Context operator=(const Context &) = delete; template void packrat(const char *a_s, size_t def_id, size_t &len, std::any &val, T fn) { if (!enablePackratParsing) { fn(val); return; } auto col = a_s - s; auto idx = def_count * static_cast(col) + def_id; if (cache_registered[idx]) { if (cache_success[idx]) { auto key = std::pair(col, def_id); std::tie(len, val) = cache_values[key]; return; } else { len = static_cast(-1); return; } } else { fn(val); cache_registered[idx] = true; cache_success[idx] = success(len); if (success(len)) { auto key = std::pair(col, def_id); cache_values[key] = std::pair(len, val); } return; } } SemanticValues &push() { push_capture_scope(); return push_semantic_values_scope(); } void pop() { pop_capture_scope(); pop_semantic_values_scope(); } // Semantic values SemanticValues &push_semantic_values_scope() { assert(value_stack_size <= value_stack.size()); if (value_stack_size == value_stack.size()) { value_stack.emplace_back(std::make_shared(this)); } else { auto &vs = *value_stack[value_stack_size]; if (!vs.empty()) { vs.clear(); if (!vs.tags.empty()) { vs.tags.clear(); } } vs.sv_ = std::string_view(); vs.choice_count_ = 0; vs.choice_ = 0; if (!vs.tokens.empty()) { vs.tokens.clear(); } } auto &vs = *value_stack[value_stack_size++]; vs.path = path; vs.ss = s; return vs; } void pop_semantic_values_scope() { value_stack_size--; } // Arguments void push_args(std::vector> &&args) { args_stack.emplace_back(args); } void pop_args() { args_stack.pop_back(); } const std::vector> &top_args() const { return args_stack[args_stack.size() - 1]; } // Capture scope void push_capture_scope() { assert(capture_scope_stack_size <= capture_scope_stack.size()); if (capture_scope_stack_size == capture_scope_stack.size()) { capture_scope_stack.emplace_back( std::map()); } else { auto &cs = capture_scope_stack[capture_scope_stack_size]; if (!cs.empty()) { cs.clear(); } } capture_scope_stack_size++; } void pop_capture_scope() { capture_scope_stack_size--; } void shift_capture_values() { assert(capture_scope_stack_size >= 2); auto curr = &capture_scope_stack[capture_scope_stack_size - 1]; auto prev = curr - 1; for (const auto &[k, v] : *curr) { (*prev)[k] = v; } } // Error void set_error_pos(const char *a_s, const char *literal = nullptr); // Trace void trace_enter(const Ope &ope, const char *a_s, size_t n, const SemanticValues &vs, std::any &dt); void trace_leave(const Ope &ope, const char *a_s, size_t n, const SemanticValues &vs, std::any &dt, size_t len); bool is_traceable(const Ope &ope) const; // Line info std::pair line_info(const char *cur) const { std::call_once(source_line_index_init_, [this]() { for (size_t pos = 0; pos < l; pos++) { if (s[pos] == '\n') { source_line_index.push_back(pos); } } source_line_index.push_back(l); }); auto pos = static_cast(std::distance(s, cur)); auto it = std::lower_bound( source_line_index.begin(), source_line_index.end(), pos, [](size_t element, size_t value) { return element < value; }); auto id = static_cast(std::distance(source_line_index.begin(), it)); auto off = pos - (id == 0 ? 0 : source_line_index[id - 1] + 1); return std::pair(id + 1, off + 1); } size_t next_trace_id = 0; std::vector trace_ids; bool ignore_trace_state = false; mutable std::once_flag source_line_index_init_; mutable std::vector source_line_index; }; /* * Parser operators */ class Ope { public: struct Visitor; virtual ~Ope() = default; size_t parse(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const; virtual size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const = 0; virtual void accept(Visitor &v) = 0; }; class Sequence : public Ope { public: template Sequence(const Args &...args) : opes_{static_cast>(args)...} {} Sequence(const std::vector> &opes) : opes_(opes) {} Sequence(std::vector> &&opes) : opes_(opes) {} size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const override { auto &chvs = c.push_semantic_values_scope(); auto se = scope_exit([&]() { c.pop_semantic_values_scope(); }); size_t i = 0; for (const auto &ope : opes_) { auto len = ope->parse(s + i, n - i, chvs, c, dt); if (fail(len)) { return len; } i += len; } vs.append(chvs); return i; } void accept(Visitor &v) override; std::vector> opes_; }; class PrioritizedChoice : public Ope { public: template PrioritizedChoice(bool for_label, const Args &...args) : opes_{static_cast>(args)...}, for_label_(for_label) {} PrioritizedChoice(const std::vector> &opes) : opes_(opes) {} PrioritizedChoice(std::vector> &&opes) : opes_(opes) {} size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const override { size_t len = static_cast(-1); if (!for_label_) { c.cut_stack.push_back(false); } auto se = scope_exit([&]() { if (!for_label_) { c.cut_stack.pop_back(); } }); size_t id = 0; for (const auto &ope : opes_) { if (!c.cut_stack.empty()) { c.cut_stack.back() = false; } auto &chvs = c.push(); c.error_info.keep_previous_token = id > 0; auto se = scope_exit([&]() { c.pop(); c.error_info.keep_previous_token = false; }); len = ope->parse(s, n, chvs, c, dt); if (success(len)) { vs.append(chvs); vs.choice_count_ = opes_.size(); vs.choice_ = id; c.shift_capture_values(); break; } else if (!c.cut_stack.empty() && c.cut_stack.back()) { break; } id++; } return len; } void accept(Visitor &v) override; size_t size() const { return opes_.size(); } std::vector> opes_; bool for_label_ = false; }; class Repetition : public Ope { public: Repetition(const std::shared_ptr &ope, size_t min, size_t max) : ope_(ope), min_(min), max_(max) {} size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const override { size_t count = 0; size_t i = 0; while (count < min_) { auto &chvs = c.push(); auto se = scope_exit([&]() { c.pop(); }); auto len = ope_->parse(s + i, n - i, chvs, c, dt); if (success(len)) { vs.append(chvs); c.shift_capture_values(); } else { return len; } i += len; count++; } while (count < max_) { auto &chvs = c.push(); auto se = scope_exit([&]() { c.pop(); }); auto len = ope_->parse(s + i, n - i, chvs, c, dt); if (success(len)) { vs.append(chvs); c.shift_capture_values(); } else { break; } i += len; count++; } return i; } void accept(Visitor &v) override; bool is_zom() const { return min_ == 0 && max_ == std::numeric_limits::max(); } static std::shared_ptr zom(const std::shared_ptr &ope) { return std::make_shared(ope, 0, std::numeric_limits::max()); } static std::shared_ptr oom(const std::shared_ptr &ope) { return std::make_shared(ope, 1, std::numeric_limits::max()); } static std::shared_ptr opt(const std::shared_ptr &ope) { return std::make_shared(ope, 0, 1); } std::shared_ptr ope_; size_t min_; size_t max_; }; class AndPredicate : public Ope { public: AndPredicate(const std::shared_ptr &ope) : ope_(ope) {} size_t parse_core(const char *s, size_t n, SemanticValues & /*vs*/, Context &c, std::any &dt) const override { auto &chvs = c.push(); auto se = scope_exit([&]() { c.pop(); }); auto len = ope_->parse(s, n, chvs, c, dt); if (success(len)) { return 0; } else { return len; } } void accept(Visitor &v) override; std::shared_ptr ope_; }; class NotPredicate : public Ope { public: NotPredicate(const std::shared_ptr &ope) : ope_(ope) {} size_t parse_core(const char *s, size_t n, SemanticValues & /*vs*/, Context &c, std::any &dt) const override { auto &chvs = c.push(); auto se = scope_exit([&]() { c.pop(); }); auto len = ope_->parse(s, n, chvs, c, dt); if (success(len)) { c.set_error_pos(s); return static_cast(-1); } else { return 0; } } void accept(Visitor &v) override; std::shared_ptr ope_; }; class Dictionary : public Ope, public std::enable_shared_from_this { public: Dictionary(const std::vector &v, bool ignore_case) : trie_(v, ignore_case) {} size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const override; void accept(Visitor &v) override; Trie trie_; }; class LiteralString : public Ope, public std::enable_shared_from_this { public: LiteralString(std::string &&s, bool ignore_case) : lit_(s), ignore_case_(ignore_case), is_word_(false) {} LiteralString(const std::string &s, bool ignore_case) : lit_(s), ignore_case_(ignore_case), is_word_(false) {} size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const override; void accept(Visitor &v) override; std::string lit_; bool ignore_case_; mutable std::once_flag init_is_word_; mutable bool is_word_; }; class CharacterClass : public Ope, public std::enable_shared_from_this { public: CharacterClass(const std::string &s, bool negated, bool ignore_case) : negated_(negated), ignore_case_(ignore_case) { auto chars = decode(s.data(), s.length()); auto i = 0u; while (i < chars.size()) { if (i + 2 < chars.size() && chars[i + 1] == '-') { auto cp1 = chars[i]; auto cp2 = chars[i + 2]; ranges_.emplace_back(std::pair(cp1, cp2)); i += 3; } else { auto cp = chars[i]; ranges_.emplace_back(std::pair(cp, cp)); i += 1; } } assert(!ranges_.empty()); } CharacterClass(const std::vector> &ranges, bool negated, bool ignore_case) : ranges_(ranges), negated_(negated), ignore_case_(ignore_case) { assert(!ranges_.empty()); } size_t parse_core(const char *s, size_t n, SemanticValues & /*vs*/, Context &c, std::any & /*dt*/) const override { if (n < 1) { c.set_error_pos(s); return static_cast(-1); } char32_t cp = 0; auto len = decode_codepoint(s, n, cp); for (const auto &range : ranges_) { if (in_range(range, cp)) { if (negated_) { c.set_error_pos(s); return static_cast(-1); } else { return len; } } } if (negated_) { return len; } else { c.set_error_pos(s); return static_cast(-1); } } void accept(Visitor &v) override; private: bool in_range(const std::pair &range, char32_t cp) const { if (ignore_case_) { auto cpl = std::tolower(cp); return std::tolower(range.first) <= cpl && cpl <= std::tolower(range.second); } else { return range.first <= cp && cp <= range.second; } } std::vector> ranges_; bool negated_; bool ignore_case_; }; class Character : public Ope, public std::enable_shared_from_this { public: Character(char ch) : ch_(ch) {} size_t parse_core(const char *s, size_t n, SemanticValues & /*vs*/, Context &c, std::any & /*dt*/) const override { if (n < 1 || s[0] != ch_) { c.set_error_pos(s); return static_cast(-1); } return 1; } void accept(Visitor &v) override; char ch_; }; class AnyCharacter : public Ope, public std::enable_shared_from_this { public: size_t parse_core(const char *s, size_t n, SemanticValues & /*vs*/, Context &c, std::any & /*dt*/) const override { auto len = codepoint_length(s, n); if (len < 1) { c.set_error_pos(s); return static_cast(-1); } return len; } void accept(Visitor &v) override; }; class CaptureScope : public Ope { public: CaptureScope(const std::shared_ptr &ope) : ope_(ope) {} size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const override { c.push_capture_scope(); auto se = scope_exit([&]() { c.pop_capture_scope(); }); return ope_->parse(s, n, vs, c, dt); } void accept(Visitor &v) override; std::shared_ptr ope_; }; class Capture : public Ope { public: using MatchAction = std::function; Capture(const std::shared_ptr &ope, MatchAction ma) : ope_(ope), match_action_(ma) {} size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const override { auto len = ope_->parse(s, n, vs, c, dt); if (success(len) && match_action_) { match_action_(s, len, c); } return len; } void accept(Visitor &v) override; std::shared_ptr ope_; MatchAction match_action_; }; class TokenBoundary : public Ope { public: TokenBoundary(const std::shared_ptr &ope) : ope_(ope) {} size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const override; void accept(Visitor &v) override; std::shared_ptr ope_; }; class Ignore : public Ope { public: Ignore(const std::shared_ptr &ope) : ope_(ope) {} size_t parse_core(const char *s, size_t n, SemanticValues & /*vs*/, Context &c, std::any &dt) const override { auto &chvs = c.push_semantic_values_scope(); auto se = scope_exit([&]() { c.pop_semantic_values_scope(); }); return ope_->parse(s, n, chvs, c, dt); } void accept(Visitor &v) override; std::shared_ptr ope_; }; using Parser = std::function; class User : public Ope { public: User(Parser fn) : fn_(fn) {} size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context & /*c*/, std::any &dt) const override { assert(fn_); return fn_(s, n, vs, dt); } void accept(Visitor &v) override; std::function fn_; }; class WeakHolder : public Ope { public: WeakHolder(const std::shared_ptr &ope) : weak_(ope) {} size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const override { auto ope = weak_.lock(); assert(ope); return ope->parse(s, n, vs, c, dt); } void accept(Visitor &v) override; std::weak_ptr weak_; }; class Holder : public Ope { public: Holder(Definition *outer) : outer_(outer) {} size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const override; void accept(Visitor &v) override; std::any reduce(SemanticValues &vs, std::any &dt) const; const std::string &name() const; const std::string &trace_name() const; std::shared_ptr ope_; Definition *outer_; mutable std::once_flag trace_name_init_; mutable std::string trace_name_; friend class Definition; }; using Grammar = std::unordered_map; class Reference : public Ope, public std::enable_shared_from_this { public: Reference(const Grammar &grammar, const std::string &name, const char *s, bool is_macro, const std::vector> &args) : grammar_(grammar), name_(name), s_(s), is_macro_(is_macro), args_(args), rule_(nullptr), iarg_(0) {} size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const override; void accept(Visitor &v) override; std::shared_ptr get_core_operator() const; const Grammar &grammar_; const std::string name_; const char *s_; const bool is_macro_; const std::vector> args_; Definition *rule_; size_t iarg_; }; class Whitespace : public Ope { public: Whitespace(const std::shared_ptr &ope) : ope_(ope) {} size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const override { if (c.in_whitespace) { return 0; } c.in_whitespace = true; auto se = scope_exit([&]() { c.in_whitespace = false; }); return ope_->parse(s, n, vs, c, dt); } void accept(Visitor &v) override; std::shared_ptr ope_; }; class BackReference : public Ope { public: BackReference(std::string &&name) : name_(name) {} BackReference(const std::string &name) : name_(name) {} size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const override; void accept(Visitor &v) override; std::string name_; }; class PrecedenceClimbing : public Ope { public: using BinOpeInfo = std::map>; PrecedenceClimbing(const std::shared_ptr &atom, const std::shared_ptr &binop, const BinOpeInfo &info, const Definition &rule) : atom_(atom), binop_(binop), info_(info), rule_(rule) {} size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const override { return parse_expression(s, n, vs, c, dt, 0); } void accept(Visitor &v) override; std::shared_ptr atom_; std::shared_ptr binop_; BinOpeInfo info_; const Definition &rule_; private: size_t parse_expression(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt, size_t min_prec) const; Definition &get_reference_for_binop(Context &c) const; }; class Recovery : public Ope { public: Recovery(const std::shared_ptr &ope) : ope_(ope) {} size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const override; void accept(Visitor &v) override; std::shared_ptr ope_; }; class Cut : public Ope, public std::enable_shared_from_this { public: size_t parse_core(const char * /*s*/, size_t /*n*/, SemanticValues & /*vs*/, Context &c, std::any & /*dt*/) const override { if (!c.cut_stack.empty()) { c.cut_stack.back() = true; } return 0; } void accept(Visitor &v) override; }; /* * Factories */ template std::shared_ptr seq(Args &&...args) { return std::make_shared(static_cast>(args)...); } template std::shared_ptr cho(Args &&...args) { return std::make_shared( false, static_cast>(args)...); } template std::shared_ptr cho4label_(Args &&...args) { return std::make_shared( true, static_cast>(args)...); } inline std::shared_ptr zom(const std::shared_ptr &ope) { return Repetition::zom(ope); } inline std::shared_ptr oom(const std::shared_ptr &ope) { return Repetition::oom(ope); } inline std::shared_ptr opt(const std::shared_ptr &ope) { return Repetition::opt(ope); } inline std::shared_ptr rep(const std::shared_ptr &ope, size_t min, size_t max) { return std::make_shared(ope, min, max); } inline std::shared_ptr apd(const std::shared_ptr &ope) { return std::make_shared(ope); } inline std::shared_ptr npd(const std::shared_ptr &ope) { return std::make_shared(ope); } inline std::shared_ptr dic(const std::vector &v, bool ignore_case) { return std::make_shared(v, ignore_case); } inline std::shared_ptr lit(std::string &&s) { return std::make_shared(s, false); } inline std::shared_ptr liti(std::string &&s) { return std::make_shared(s, true); } inline std::shared_ptr cls(const std::string &s) { return std::make_shared(s, false, false); } inline std::shared_ptr cls(const std::vector> &ranges, bool ignore_case = false) { return std::make_shared(ranges, false, ignore_case); } inline std::shared_ptr ncls(const std::string &s) { return std::make_shared(s, true, false); } inline std::shared_ptr ncls(const std::vector> &ranges, bool ignore_case = false) { return std::make_shared(ranges, true, ignore_case); } inline std::shared_ptr chr(char dt) { return std::make_shared(dt); } inline std::shared_ptr dot() { return std::make_shared(); } inline std::shared_ptr csc(const std::shared_ptr &ope) { return std::make_shared(ope); } inline std::shared_ptr cap(const std::shared_ptr &ope, Capture::MatchAction ma) { return std::make_shared(ope, ma); } inline std::shared_ptr tok(const std::shared_ptr &ope) { return std::make_shared(ope); } inline std::shared_ptr ign(const std::shared_ptr &ope) { return std::make_shared(ope); } inline std::shared_ptr usr(std::function fn) { return std::make_shared(fn); } inline std::shared_ptr ref(const Grammar &grammar, const std::string &name, const char *s, bool is_macro, const std::vector> &args) { return std::make_shared(grammar, name, s, is_macro, args); } inline std::shared_ptr wsp(const std::shared_ptr &ope) { return std::make_shared(std::make_shared(ope)); } inline std::shared_ptr bkr(std::string &&name) { return std::make_shared(name); } inline std::shared_ptr pre(const std::shared_ptr &atom, const std::shared_ptr &binop, const PrecedenceClimbing::BinOpeInfo &info, const Definition &rule) { return std::make_shared(atom, binop, info, rule); } inline std::shared_ptr rec(const std::shared_ptr &ope) { return std::make_shared(ope); } inline std::shared_ptr cut() { return std::make_shared(); } /* * Visitor */ struct Ope::Visitor { virtual ~Visitor() {} virtual void visit(Sequence &) {} virtual void visit(PrioritizedChoice &) {} virtual void visit(Repetition &) {} virtual void visit(AndPredicate &) {} virtual void visit(NotPredicate &) {} virtual void visit(Dictionary &) {} virtual void visit(LiteralString &) {} virtual void visit(CharacterClass &) {} virtual void visit(Character &) {} virtual void visit(AnyCharacter &) {} virtual void visit(CaptureScope &) {} virtual void visit(Capture &) {} virtual void visit(TokenBoundary &) {} virtual void visit(Ignore &) {} virtual void visit(User &) {} virtual void visit(WeakHolder &) {} virtual void visit(Holder &) {} virtual void visit(Reference &) {} virtual void visit(Whitespace &) {} virtual void visit(BackReference &) {} virtual void visit(PrecedenceClimbing &) {} virtual void visit(Recovery &) {} virtual void visit(Cut &) {} }; struct TraceOpeName : public Ope::Visitor { using Ope::Visitor::visit; void visit(Sequence &) override { name_ = "Sequence"; } void visit(PrioritizedChoice &) override { name_ = "PrioritizedChoice"; } void visit(Repetition &) override { name_ = "Repetition"; } void visit(AndPredicate &) override { name_ = "AndPredicate"; } void visit(NotPredicate &) override { name_ = "NotPredicate"; } void visit(Dictionary &) override { name_ = "Dictionary"; } void visit(LiteralString &) override { name_ = "LiteralString"; } void visit(CharacterClass &) override { name_ = "CharacterClass"; } void visit(Character &) override { name_ = "Character"; } void visit(AnyCharacter &) override { name_ = "AnyCharacter"; } void visit(CaptureScope &) override { name_ = "CaptureScope"; } void visit(Capture &) override { name_ = "Capture"; } void visit(TokenBoundary &) override { name_ = "TokenBoundary"; } void visit(Ignore &) override { name_ = "Ignore"; } void visit(User &) override { name_ = "User"; } void visit(WeakHolder &) override { name_ = "WeakHolder"; } void visit(Holder &ope) override { name_ = ope.trace_name().data(); } void visit(Reference &) override { name_ = "Reference"; } void visit(Whitespace &) override { name_ = "Whitespace"; } void visit(BackReference &) override { name_ = "BackReference"; } void visit(PrecedenceClimbing &) override { name_ = "PrecedenceClimbing"; } void visit(Recovery &) override { name_ = "Recovery"; } void visit(Cut &) override { name_ = "Cut"; } static std::string get(Ope &ope) { TraceOpeName vis; ope.accept(vis); return vis.name_; } private: const char *name_ = nullptr; }; struct AssignIDToDefinition : public Ope::Visitor { using Ope::Visitor::visit; void visit(Sequence &ope) override { for (auto op : ope.opes_) { op->accept(*this); } } void visit(PrioritizedChoice &ope) override { for (auto op : ope.opes_) { op->accept(*this); } } void visit(Repetition &ope) override { ope.ope_->accept(*this); } void visit(AndPredicate &ope) override { ope.ope_->accept(*this); } void visit(NotPredicate &ope) override { ope.ope_->accept(*this); } void visit(CaptureScope &ope) override { ope.ope_->accept(*this); } void visit(Capture &ope) override { ope.ope_->accept(*this); } void visit(TokenBoundary &ope) override { ope.ope_->accept(*this); } void visit(Ignore &ope) override { ope.ope_->accept(*this); } void visit(WeakHolder &ope) override { ope.weak_.lock()->accept(*this); } void visit(Holder &ope) override; void visit(Reference &ope) override; void visit(Whitespace &ope) override { ope.ope_->accept(*this); } void visit(PrecedenceClimbing &ope) override; void visit(Recovery &ope) override { ope.ope_->accept(*this); } std::unordered_map ids; }; struct IsLiteralToken : public Ope::Visitor { using Ope::Visitor::visit; void visit(PrioritizedChoice &ope) override { for (auto op : ope.opes_) { if (!IsLiteralToken::check(*op)) { return; } } result_ = true; } void visit(Dictionary &) override { result_ = true; } void visit(LiteralString &) override { result_ = true; } static bool check(Ope &ope) { IsLiteralToken vis; ope.accept(vis); return vis.result_; } private: bool result_ = false; }; struct TokenChecker : public Ope::Visitor { using Ope::Visitor::visit; void visit(Sequence &ope) override { for (auto op : ope.opes_) { op->accept(*this); } } void visit(PrioritizedChoice &ope) override { for (auto op : ope.opes_) { op->accept(*this); } } void visit(Repetition &ope) override { ope.ope_->accept(*this); } void visit(CaptureScope &ope) override { ope.ope_->accept(*this); } void visit(Capture &ope) override { ope.ope_->accept(*this); } void visit(TokenBoundary &) override { has_token_boundary_ = true; } void visit(Ignore &ope) override { ope.ope_->accept(*this); } void visit(WeakHolder &) override { has_rule_ = true; } void visit(Holder &ope) override { ope.ope_->accept(*this); } void visit(Reference &ope) override; void visit(Whitespace &ope) override { ope.ope_->accept(*this); } void visit(PrecedenceClimbing &ope) override { ope.atom_->accept(*this); } void visit(Recovery &ope) override { ope.ope_->accept(*this); } static bool is_token(Ope &ope) { if (IsLiteralToken::check(ope)) { return true; } TokenChecker vis; ope.accept(vis); return vis.has_token_boundary_ || !vis.has_rule_; } private: bool has_token_boundary_ = false; bool has_rule_ = false; }; struct FindLiteralToken : public Ope::Visitor { using Ope::Visitor::visit; void visit(LiteralString &ope) override { token_ = ope.lit_.data(); } void visit(TokenBoundary &ope) override { ope.ope_->accept(*this); } void visit(Ignore &ope) override { ope.ope_->accept(*this); } void visit(Reference &ope) override; void visit(Recovery &ope) override { ope.ope_->accept(*this); } static const char *token(Ope &ope) { FindLiteralToken vis; ope.accept(vis); return vis.token_; } private: const char *token_ = nullptr; }; struct DetectLeftRecursion : public Ope::Visitor { using Ope::Visitor::visit; DetectLeftRecursion(const std::string &name) : name_(name) {} void visit(Sequence &ope) override { for (auto op : ope.opes_) { op->accept(*this); if (done_) { break; } else if (error_s) { done_ = true; break; } } } void visit(PrioritizedChoice &ope) override { for (auto op : ope.opes_) { op->accept(*this); if (error_s) { done_ = true; break; } } } void visit(Repetition &ope) override { ope.ope_->accept(*this); done_ = ope.min_ > 0; } void visit(AndPredicate &ope) override { ope.ope_->accept(*this); done_ = false; } void visit(NotPredicate &ope) override { ope.ope_->accept(*this); done_ = false; } void visit(Dictionary &) override { done_ = true; } void visit(LiteralString &ope) override { done_ = !ope.lit_.empty(); } void visit(CharacterClass &) override { done_ = true; } void visit(Character &) override { done_ = true; } void visit(AnyCharacter &) override { done_ = true; } void visit(CaptureScope &ope) override { ope.ope_->accept(*this); } void visit(Capture &ope) override { ope.ope_->accept(*this); } void visit(TokenBoundary &ope) override { ope.ope_->accept(*this); } void visit(Ignore &ope) override { ope.ope_->accept(*this); } void visit(User &) override { done_ = true; } void visit(WeakHolder &ope) override { ope.weak_.lock()->accept(*this); } void visit(Holder &ope) override { ope.ope_->accept(*this); } void visit(Reference &ope) override; void visit(Whitespace &ope) override { ope.ope_->accept(*this); } void visit(BackReference &) override { done_ = true; } void visit(PrecedenceClimbing &ope) override { ope.atom_->accept(*this); } void visit(Recovery &ope) override { ope.ope_->accept(*this); } void visit(Cut &) override { done_ = true; } const char *error_s = nullptr; private: std::string name_; std::unordered_set refs_; bool done_ = false; }; struct HasEmptyElement : public Ope::Visitor { using Ope::Visitor::visit; HasEmptyElement(std::vector> &refs, std::unordered_map &has_error_cache) : refs_(refs), has_error_cache_(has_error_cache) {} void visit(Sequence &ope) override; void visit(PrioritizedChoice &ope) override { for (auto op : ope.opes_) { op->accept(*this); if (is_empty) { return; } } } void visit(Repetition &ope) override { if (ope.min_ == 0) { set_error(); } else { ope.ope_->accept(*this); } } void visit(AndPredicate &) override { set_error(); } void visit(NotPredicate &) override { set_error(); } void visit(LiteralString &ope) override { if (ope.lit_.empty()) { set_error(); } } void visit(CaptureScope &ope) override { ope.ope_->accept(*this); } void visit(Capture &ope) override { ope.ope_->accept(*this); } void visit(TokenBoundary &ope) override { ope.ope_->accept(*this); } void visit(Ignore &ope) override { ope.ope_->accept(*this); } void visit(WeakHolder &ope) override { ope.weak_.lock()->accept(*this); } void visit(Holder &ope) override { ope.ope_->accept(*this); } void visit(Reference &ope) override; void visit(Whitespace &ope) override { ope.ope_->accept(*this); } void visit(PrecedenceClimbing &ope) override { ope.atom_->accept(*this); } void visit(Recovery &ope) override { ope.ope_->accept(*this); } bool is_empty = false; const char *error_s = nullptr; std::string error_name; private: void set_error() { is_empty = true; tie(error_s, error_name) = refs_.back(); } std::vector> &refs_; std::unordered_map &has_error_cache_; }; struct DetectInfiniteLoop : public Ope::Visitor { using Ope::Visitor::visit; DetectInfiniteLoop(const char *s, const std::string &name, std::vector> &refs, std::unordered_map &has_error_cache) : refs_(refs), has_error_cache_(has_error_cache) { refs_.emplace_back(s, name); } DetectInfiniteLoop(std::vector> &refs, std::unordered_map &has_error_cache) : refs_(refs), has_error_cache_(has_error_cache) {} void visit(Sequence &ope) override { for (auto op : ope.opes_) { op->accept(*this); if (has_error) { return; } } } void visit(PrioritizedChoice &ope) override { for (auto op : ope.opes_) { op->accept(*this); if (has_error) { return; } } } void visit(Repetition &ope) override { if (ope.max_ == std::numeric_limits::max()) { HasEmptyElement vis(refs_, has_error_cache_); ope.ope_->accept(vis); if (vis.is_empty) { has_error = true; error_s = vis.error_s; error_name = vis.error_name; } } else { ope.ope_->accept(*this); } } void visit(AndPredicate &ope) override { ope.ope_->accept(*this); } void visit(NotPredicate &ope) override { ope.ope_->accept(*this); } void visit(CaptureScope &ope) override { ope.ope_->accept(*this); } void visit(Capture &ope) override { ope.ope_->accept(*this); } void visit(TokenBoundary &ope) override { ope.ope_->accept(*this); } void visit(Ignore &ope) override { ope.ope_->accept(*this); } void visit(WeakHolder &ope) override { ope.weak_.lock()->accept(*this); } void visit(Holder &ope) override { ope.ope_->accept(*this); } void visit(Reference &ope) override; void visit(Whitespace &ope) override { ope.ope_->accept(*this); } void visit(PrecedenceClimbing &ope) override { ope.atom_->accept(*this); } void visit(Recovery &ope) override { ope.ope_->accept(*this); } bool has_error = false; const char *error_s = nullptr; std::string error_name; private: std::vector> &refs_; std::unordered_map &has_error_cache_; }; struct ReferenceChecker : public Ope::Visitor { using Ope::Visitor::visit; ReferenceChecker(const Grammar &grammar, const std::vector ¶ms) : grammar_(grammar), params_(params) {} void visit(Sequence &ope) override { for (auto op : ope.opes_) { op->accept(*this); } } void visit(PrioritizedChoice &ope) override { for (auto op : ope.opes_) { op->accept(*this); } } void visit(Repetition &ope) override { ope.ope_->accept(*this); } void visit(AndPredicate &ope) override { ope.ope_->accept(*this); } void visit(NotPredicate &ope) override { ope.ope_->accept(*this); } void visit(CaptureScope &ope) override { ope.ope_->accept(*this); } void visit(Capture &ope) override { ope.ope_->accept(*this); } void visit(TokenBoundary &ope) override { ope.ope_->accept(*this); } void visit(Ignore &ope) override { ope.ope_->accept(*this); } void visit(WeakHolder &ope) override { ope.weak_.lock()->accept(*this); } void visit(Holder &ope) override { ope.ope_->accept(*this); } void visit(Reference &ope) override; void visit(Whitespace &ope) override { ope.ope_->accept(*this); } void visit(PrecedenceClimbing &ope) override { ope.atom_->accept(*this); } void visit(Recovery &ope) override { ope.ope_->accept(*this); } std::unordered_map error_s; std::unordered_map error_message; std::unordered_set referenced; private: const Grammar &grammar_; const std::vector ¶ms_; }; struct LinkReferences : public Ope::Visitor { using Ope::Visitor::visit; LinkReferences(Grammar &grammar, const std::vector ¶ms) : grammar_(grammar), params_(params) {} void visit(Sequence &ope) override { for (auto op : ope.opes_) { op->accept(*this); } } void visit(PrioritizedChoice &ope) override { for (auto op : ope.opes_) { op->accept(*this); } } void visit(Repetition &ope) override { ope.ope_->accept(*this); } void visit(AndPredicate &ope) override { ope.ope_->accept(*this); } void visit(NotPredicate &ope) override { ope.ope_->accept(*this); } void visit(CaptureScope &ope) override { ope.ope_->accept(*this); } void visit(Capture &ope) override { ope.ope_->accept(*this); } void visit(TokenBoundary &ope) override { ope.ope_->accept(*this); } void visit(Ignore &ope) override { ope.ope_->accept(*this); } void visit(WeakHolder &ope) override { ope.weak_.lock()->accept(*this); } void visit(Holder &ope) override { ope.ope_->accept(*this); } void visit(Reference &ope) override; void visit(Whitespace &ope) override { ope.ope_->accept(*this); } void visit(PrecedenceClimbing &ope) override { ope.atom_->accept(*this); } void visit(Recovery &ope) override { ope.ope_->accept(*this); } private: Grammar &grammar_; const std::vector ¶ms_; }; struct FindReference : public Ope::Visitor { using Ope::Visitor::visit; FindReference(const std::vector> &args, const std::vector ¶ms) : args_(args), params_(params) {} void visit(Sequence &ope) override { std::vector> opes; for (auto o : ope.opes_) { o->accept(*this); opes.push_back(found_ope); } found_ope = std::make_shared(opes); } void visit(PrioritizedChoice &ope) override { std::vector> opes; for (auto o : ope.opes_) { o->accept(*this); opes.push_back(found_ope); } found_ope = std::make_shared(opes); } void visit(Repetition &ope) override { ope.ope_->accept(*this); found_ope = rep(found_ope, ope.min_, ope.max_); } void visit(AndPredicate &ope) override { ope.ope_->accept(*this); found_ope = apd(found_ope); } void visit(NotPredicate &ope) override { ope.ope_->accept(*this); found_ope = npd(found_ope); } void visit(Dictionary &ope) override { found_ope = ope.shared_from_this(); } void visit(LiteralString &ope) override { found_ope = ope.shared_from_this(); } void visit(CharacterClass &ope) override { found_ope = ope.shared_from_this(); } void visit(Character &ope) override { found_ope = ope.shared_from_this(); } void visit(AnyCharacter &ope) override { found_ope = ope.shared_from_this(); } void visit(CaptureScope &ope) override { ope.ope_->accept(*this); found_ope = csc(found_ope); } void visit(Capture &ope) override { ope.ope_->accept(*this); found_ope = cap(found_ope, ope.match_action_); } void visit(TokenBoundary &ope) override { ope.ope_->accept(*this); found_ope = tok(found_ope); } void visit(Ignore &ope) override { ope.ope_->accept(*this); found_ope = ign(found_ope); } void visit(WeakHolder &ope) override { ope.weak_.lock()->accept(*this); } void visit(Holder &ope) override { ope.ope_->accept(*this); } void visit(Reference &ope) override; void visit(Whitespace &ope) override { ope.ope_->accept(*this); found_ope = wsp(found_ope); } void visit(PrecedenceClimbing &ope) override { ope.atom_->accept(*this); found_ope = csc(found_ope); } void visit(Recovery &ope) override { ope.ope_->accept(*this); found_ope = rec(found_ope); } void visit(Cut &ope) override { found_ope = ope.shared_from_this(); } std::shared_ptr found_ope; private: const std::vector> &args_; const std::vector ¶ms_; }; /* * Keywords */ static const char *WHITESPACE_DEFINITION_NAME = "%whitespace"; static const char *WORD_DEFINITION_NAME = "%word"; static const char *RECOVER_DEFINITION_NAME = "%recover"; /* * Definition */ class Definition { public: struct Result { bool ret; bool recovered; size_t len; ErrorInfo error_info; }; Definition() : holder_(std::make_shared(this)) {} Definition(const Definition &rhs) : name(rhs.name), holder_(rhs.holder_) { holder_->outer_ = this; } Definition(const std::shared_ptr &ope) : holder_(std::make_shared(this)) { *this <= ope; } operator std::shared_ptr() { return std::make_shared(holder_); } Definition &operator<=(const std::shared_ptr &ope) { holder_->ope_ = ope; return *this; } Result parse(const char *s, size_t n, const char *path = nullptr, Log log = nullptr) const { SemanticValues vs; std::any dt; return parse_core(s, n, vs, dt, path, log); } Result parse(const char *s, const char *path = nullptr, Log log = nullptr) const { auto n = strlen(s); return parse(s, n, path, log); } Result parse(const char *s, size_t n, std::any &dt, const char *path = nullptr, Log log = nullptr) const { SemanticValues vs; return parse_core(s, n, vs, dt, path, log); } Result parse(const char *s, std::any &dt, const char *path = nullptr, Log log = nullptr) const { auto n = strlen(s); return parse(s, n, dt, path, log); } template Result parse_and_get_value(const char *s, size_t n, T &val, const char *path = nullptr, Log log = nullptr) const { SemanticValues vs; std::any dt; auto r = parse_core(s, n, vs, dt, path, log); if (r.ret && !vs.empty() && vs.front().has_value()) { val = std::any_cast(vs[0]); } return r; } template Result parse_and_get_value(const char *s, T &val, const char *path = nullptr, Log log = nullptr) const { auto n = strlen(s); return parse_and_get_value(s, n, val, path, log); } template Result parse_and_get_value(const char *s, size_t n, std::any &dt, T &val, const char *path = nullptr, Log log = nullptr) const { SemanticValues vs; auto r = parse_core(s, n, vs, dt, path, log); if (r.ret && !vs.empty() && vs.front().has_value()) { val = std::any_cast(vs[0]); } return r; } template Result parse_and_get_value(const char *s, std::any &dt, T &val, const char *path = nullptr, Log log = nullptr) const { auto n = strlen(s); return parse_and_get_value(s, n, dt, val, path, log); } #if defined(__cpp_lib_char8_t) Result parse(const char8_t *s, size_t n, const char *path = nullptr, Log log = nullptr) const { return parse(reinterpret_cast(s), n, path, log); } Result parse(const char8_t *s, const char *path = nullptr, Log log = nullptr) const { return parse(reinterpret_cast(s), path, log); } Result parse(const char8_t *s, size_t n, std::any &dt, const char *path = nullptr, Log log = nullptr) const { return parse(reinterpret_cast(s), n, dt, path, log); } Result parse(const char8_t *s, std::any &dt, const char *path = nullptr, Log log = nullptr) const { return parse(reinterpret_cast(s), dt, path, log); } template Result parse_and_get_value(const char8_t *s, size_t n, T &val, const char *path = nullptr, Log log = nullptr) const { return parse_and_get_value(reinterpret_cast(s), n, val, *path, log); } template Result parse_and_get_value(const char8_t *s, T &val, const char *path = nullptr, Log log = nullptr) const { return parse_and_get_value(reinterpret_cast(s), val, *path, log); } template Result parse_and_get_value(const char8_t *s, size_t n, std::any &dt, T &val, const char *path = nullptr, Log log = nullptr) const { return parse_and_get_value(reinterpret_cast(s), n, dt, val, *path, log); } template Result parse_and_get_value(const char8_t *s, std::any &dt, T &val, const char *path = nullptr, Log log = nullptr) const { return parse_and_get_value(reinterpret_cast(s), dt, val, *path, log); } #endif void operator=(Action a) { action = a; } template Definition &operator,(T fn) { operator=(fn); return *this; } Definition &operator~() { ignoreSemanticValue = true; return *this; } void accept(Ope::Visitor &v) { holder_->accept(v); } std::shared_ptr get_core_operator() const { return holder_->ope_; } bool is_token() const { std::call_once(is_token_init_, [this]() { is_token_ = TokenChecker::is_token(*get_core_operator()); }); return is_token_; } std::string name; const char *s_ = nullptr; std::pair line_ = {1, 1}; std::function predicate; size_t id = 0; Action action; std::function enter; std::function leave; bool ignoreSemanticValue = false; std::shared_ptr whitespaceOpe; std::shared_ptr wordOpe; bool enablePackratParsing = false; bool is_macro = false; std::vector params; bool disable_action = false; TracerEnter tracer_enter; TracerLeave tracer_leave; bool verbose_trace = false; TracerStartOrEnd tracer_start; TracerStartOrEnd tracer_end; std::string error_message; bool no_ast_opt = false; bool eoi_check = true; private: friend class Reference; friend class ParserGenerator; Definition &operator=(const Definition &rhs); Definition &operator=(Definition &&rhs); void initialize_definition_ids() const { std::call_once(definition_ids_init_, [&]() { AssignIDToDefinition vis; holder_->accept(vis); if (whitespaceOpe) { whitespaceOpe->accept(vis); } if (wordOpe) { wordOpe->accept(vis); } definition_ids_.swap(vis.ids); }); } Result parse_core(const char *s, size_t n, SemanticValues &vs, std::any &dt, const char *path, Log log) const { initialize_definition_ids(); std::shared_ptr ope = holder_; std::any trace_data; if (tracer_start) { tracer_start(trace_data); } auto se = scope_exit([&]() { if (tracer_end) { tracer_end(trace_data); } }); Context c(path, s, n, definition_ids_.size(), whitespaceOpe, wordOpe, enablePackratParsing, tracer_enter, tracer_leave, trace_data, verbose_trace, log); size_t i = 0; if (whitespaceOpe) { auto save_ignore_trace_state = c.ignore_trace_state; c.ignore_trace_state = !c.verbose_trace; auto se = scope_exit([&]() { c.ignore_trace_state = save_ignore_trace_state; }); auto len = whitespaceOpe->parse(s, n, vs, c, dt); if (fail(len)) { return Result{false, c.recovered, i, c.error_info}; } i = len; } auto len = ope->parse(s + i, n - i, vs, c, dt); auto ret = success(len); if (ret) { i += len; if (eoi_check) { if (i < n) { if (c.error_info.error_pos - c.s < s + i - c.s) { c.error_info.message_pos = s + i; c.error_info.message = "expected end of input"; } ret = false; } } } return Result{ret, c.recovered, i, c.error_info}; } std::shared_ptr holder_; mutable std::once_flag is_token_init_; mutable bool is_token_ = false; mutable std::once_flag assign_id_to_definition_init_; mutable std::once_flag definition_ids_init_; mutable std::unordered_map definition_ids_; }; /* * Implementations */ inline size_t parse_literal(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt, const std::string &lit, std::once_flag &init_is_word, bool &is_word, bool ignore_case) { size_t i = 0; for (; i < lit.size(); i++) { if (i >= n || (ignore_case ? (std::tolower(s[i]) != std::tolower(lit[i])) : (s[i] != lit[i]))) { c.set_error_pos(s, lit.data()); return static_cast(-1); } } // Word check if (c.wordOpe) { auto save_ignore_trace_state = c.ignore_trace_state; c.ignore_trace_state = !c.verbose_trace; auto se = scope_exit([&]() { c.ignore_trace_state = save_ignore_trace_state; }); std::call_once(init_is_word, [&]() { SemanticValues dummy_vs; Context dummy_c(nullptr, c.s, c.l, 0, nullptr, nullptr, false, nullptr, nullptr, nullptr, false, nullptr); std::any dummy_dt; auto len = c.wordOpe->parse(lit.data(), lit.size(), dummy_vs, dummy_c, dummy_dt); is_word = success(len); }); if (is_word) { SemanticValues dummy_vs; Context dummy_c(nullptr, c.s, c.l, 0, nullptr, nullptr, false, nullptr, nullptr, nullptr, false, nullptr); std::any dummy_dt; NotPredicate ope(c.wordOpe); auto len = ope.parse(s + i, n - i, dummy_vs, dummy_c, dummy_dt); if (fail(len)) { c.set_error_pos(s, lit.data()); return len; } i += len; } } // Skip whitespace if (!c.in_token_boundary_count && c.whitespaceOpe) { auto save_ignore_trace_state = c.ignore_trace_state; c.ignore_trace_state = !c.verbose_trace; auto se = scope_exit([&]() { c.ignore_trace_state = save_ignore_trace_state; }); auto len = c.whitespaceOpe->parse(s + i, n - i, vs, c, dt); if (fail(len)) { return len; } i += len; } return i; } inline std::pair SemanticValues::line_info() const { assert(c_); return c_->line_info(sv_.data()); } inline void ErrorInfo::output_log(const Log &log, const char *s, size_t n) { if (message_pos) { if (message_pos > last_output_pos) { last_output_pos = message_pos; auto line = line_info(s, message_pos); std::string msg; if (auto unexpected_token = heuristic_error_token(s, n, message_pos); !unexpected_token.empty()) { msg = replace_all(message, "%t", unexpected_token); auto unexpected_char = unexpected_token.substr( 0, codepoint_length(unexpected_token.data(), unexpected_token.size())); msg = replace_all(msg, "%c", unexpected_char); } else { msg = message; } log(line.first, line.second, msg, label); } } else if (error_pos) { if (error_pos > last_output_pos) { last_output_pos = error_pos; auto line = line_info(s, error_pos); std::string msg; if (expected_tokens.empty()) { msg = "syntax error."; } else { msg = "syntax error"; // unexpected token if (auto unexpected_token = heuristic_error_token(s, n, error_pos); !unexpected_token.empty()) { msg += ", unexpected '"; msg += unexpected_token; msg += "'"; } auto first_item = true; size_t i = 0; while (i < expected_tokens.size()) { auto [error_literal, error_rule] = expected_tokens[i]; // Skip rules start with '_' if (!(error_rule && error_rule->name[0] == '_')) { msg += (first_item ? ", expecting " : ", "); if (error_literal) { msg += "'"; msg += error_literal; msg += "'"; } else { msg += "<" + error_rule->name + ">"; if (label.empty()) { label = error_rule->name; } } first_item = false; } i++; } msg += "."; } log(line.first, line.second, msg, label); } } } inline void Context::set_error_pos(const char *a_s, const char *literal) { if (log) { if (error_info.error_pos <= a_s) { if (error_info.error_pos < a_s || !error_info.keep_previous_token) { error_info.error_pos = a_s; error_info.expected_tokens.clear(); } const char *error_literal = nullptr; const Definition *error_rule = nullptr; if (literal) { error_literal = literal; } else if (!rule_stack.empty()) { auto rule = rule_stack.back(); auto ope = rule->get_core_operator(); if (auto token = FindLiteralToken::token(*ope); token && token[0] != '\0') { error_literal = token; } } for (auto r : rule_stack) { error_rule = r; if (r->is_token()) { break; } } if (error_literal || error_rule) { error_info.add(error_literal, error_rule); } } } } inline void Context::trace_enter(const Ope &ope, const char *a_s, size_t n, const SemanticValues &vs, std::any &dt) { trace_ids.push_back(next_trace_id++); tracer_enter(ope, a_s, n, vs, *this, dt, trace_data); } inline void Context::trace_leave(const Ope &ope, const char *a_s, size_t n, const SemanticValues &vs, std::any &dt, size_t len) { tracer_leave(ope, a_s, n, vs, *this, dt, len, trace_data); trace_ids.pop_back(); } inline bool Context::is_traceable(const Ope &ope) const { if (tracer_enter && tracer_leave) { if (ignore_trace_state) { return false; } return !dynamic_cast(&ope); } return false; } inline size_t Ope::parse(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const { if (c.is_traceable(*this)) { c.trace_enter(*this, s, n, vs, dt); auto len = parse_core(s, n, vs, c, dt); c.trace_leave(*this, s, n, vs, dt, len); return len; } return parse_core(s, n, vs, c, dt); } inline size_t Dictionary::parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const { size_t id; auto i = trie_.match(s, n, id); if (i == 0) { c.set_error_pos(s); return static_cast(-1); } vs.choice_count_ = trie_.size(); vs.choice_ = id; // Word check if (c.wordOpe) { auto save_ignore_trace_state = c.ignore_trace_state; c.ignore_trace_state = !c.verbose_trace; auto se = scope_exit([&]() { c.ignore_trace_state = save_ignore_trace_state; }); { SemanticValues dummy_vs; Context dummy_c(nullptr, c.s, c.l, 0, nullptr, nullptr, false, nullptr, nullptr, nullptr, false, nullptr); std::any dummy_dt; NotPredicate ope(c.wordOpe); auto len = ope.parse(s + i, n - i, dummy_vs, dummy_c, dummy_dt); if (fail(len)) { c.set_error_pos(s); return len; } i += len; } } // Skip whitespace if (!c.in_token_boundary_count && c.whitespaceOpe) { auto save_ignore_trace_state = c.ignore_trace_state; c.ignore_trace_state = !c.verbose_trace; auto se = scope_exit([&]() { c.ignore_trace_state = save_ignore_trace_state; }); auto len = c.whitespaceOpe->parse(s + i, n - i, vs, c, dt); if (fail(len)) { return len; } i += len; } return i; } inline size_t LiteralString::parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const { return parse_literal(s, n, vs, c, dt, lit_, init_is_word_, is_word_, ignore_case_); } inline size_t TokenBoundary::parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const { auto save_ignore_trace_state = c.ignore_trace_state; c.ignore_trace_state = !c.verbose_trace; auto se = scope_exit([&]() { c.ignore_trace_state = save_ignore_trace_state; }); size_t len; { c.in_token_boundary_count++; auto se = scope_exit([&]() { c.in_token_boundary_count--; }); len = ope_->parse(s, n, vs, c, dt); } if (success(len)) { vs.tokens.emplace_back(std::string_view(s, len)); if (!c.in_token_boundary_count) { if (c.whitespaceOpe) { auto l = c.whitespaceOpe->parse(s + len, n - len, vs, c, dt); if (fail(l)) { return l; } len += l; } } } return len; } inline size_t Holder::parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const { if (!ope_) { throw std::logic_error("Uninitialized definition ope was used..."); } // Macro reference if (outer_->is_macro) { c.rule_stack.push_back(outer_); auto len = ope_->parse(s, n, vs, c, dt); c.rule_stack.pop_back(); return len; } size_t len; std::any val; c.packrat(s, outer_->id, len, val, [&](std::any &a_val) { if (outer_->enter) { outer_->enter(c, s, n, dt); } auto &chvs = c.push_semantic_values_scope(); auto se = scope_exit([&]() { c.pop_semantic_values_scope(); if (outer_->leave) { outer_->leave(c, s, n, len, a_val, dt); } }); c.rule_stack.push_back(outer_); len = ope_->parse(s, n, chvs, c, dt); c.rule_stack.pop_back(); // Invoke action if (success(len)) { chvs.sv_ = std::string_view(s, len); chvs.name_ = outer_->name; auto ope_ptr = ope_.get(); { auto tok_ptr = dynamic_cast(ope_ptr); if (tok_ptr) { ope_ptr = tok_ptr->ope_.get(); } } if (!dynamic_cast(ope_ptr) && !dynamic_cast(ope_ptr)) { chvs.choice_count_ = 0; chvs.choice_ = 0; } std::string msg; if (outer_->predicate && !outer_->predicate(chvs, dt, msg)) { if (c.log && !msg.empty() && c.error_info.message_pos < s) { c.error_info.message_pos = s; c.error_info.message = msg; c.error_info.label = outer_->name; } len = static_cast(-1); } if (success(len)) { if (!c.recovered) { a_val = reduce(chvs, dt); } } else { if (c.log && !msg.empty() && c.error_info.message_pos < s) { c.error_info.message_pos = s; c.error_info.message = msg; c.error_info.label = outer_->name; } } } else { if (c.log && !outer_->error_message.empty() && c.error_info.message_pos < s) { c.error_info.message_pos = s; c.error_info.message = outer_->error_message; c.error_info.label = outer_->name; } } }); if (success(len)) { if (!outer_->ignoreSemanticValue) { vs.emplace_back(std::move(val)); vs.tags.emplace_back(str2tag(outer_->name)); } } return len; } inline std::any Holder::reduce(SemanticValues &vs, std::any &dt) const { if (outer_->action && !outer_->disable_action) { return outer_->action(vs, dt); } else if (vs.empty()) { return std::any(); } else { return std::move(vs.front()); } } inline const std::string &Holder::name() const { return outer_->name; } inline const std::string &Holder::trace_name() const { std::call_once(trace_name_init_, [this]() { trace_name_ = "[" + outer_->name + "]"; }); return trace_name_; } inline size_t Reference::parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const { auto save_ignore_trace_state = c.ignore_trace_state; if (rule_ && rule_->ignoreSemanticValue) { c.ignore_trace_state = !c.verbose_trace; } auto se = scope_exit([&]() { c.ignore_trace_state = save_ignore_trace_state; }); if (rule_) { // Reference rule if (rule_->is_macro) { // Macro FindReference vis(c.top_args(), c.rule_stack.back()->params); // Collect arguments std::vector> args; for (auto arg : args_) { arg->accept(vis); args.emplace_back(std::move(vis.found_ope)); } c.push_args(std::move(args)); auto se = scope_exit([&]() { c.pop_args(); }); auto ope = get_core_operator(); return ope->parse(s, n, vs, c, dt); } else { // Definition c.push_args(std::vector>()); auto se = scope_exit([&]() { c.pop_args(); }); auto ope = get_core_operator(); return ope->parse(s, n, vs, c, dt); } } else { // Reference parameter in macro const auto &args = c.top_args(); return args[iarg_]->parse(s, n, vs, c, dt); } } inline std::shared_ptr Reference::get_core_operator() const { return rule_->holder_; } inline size_t BackReference::parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const { auto size = static_cast(c.capture_scope_stack_size); for (auto i = size - 1; i >= 0; i--) { auto index = static_cast(i); const auto &cs = c.capture_scope_stack[index]; if (cs.find(name_) != cs.end()) { const auto &lit = cs.at(name_); std::once_flag init_is_word; auto is_word = false; return parse_literal(s, n, vs, c, dt, lit, init_is_word, is_word, false); } } c.error_info.message_pos = s; c.error_info.message = "undefined back reference '$" + name_ + "'..."; return static_cast(-1); } inline Definition & PrecedenceClimbing::get_reference_for_binop(Context &c) const { if (rule_.is_macro) { // Reference parameter in macro const auto &args = c.top_args(); auto iarg = dynamic_cast(*binop_).iarg_; auto arg = args[iarg]; return *dynamic_cast(*arg).rule_; } return *dynamic_cast(*binop_).rule_; } inline size_t PrecedenceClimbing::parse_expression(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt, size_t min_prec) const { auto len = atom_->parse(s, n, vs, c, dt); if (fail(len)) { return len; } std::string tok; auto &rule = get_reference_for_binop(c); auto action = std::move(rule.action); rule.action = [&](SemanticValues &vs2, std::any &dt2) { tok = vs2.token(); if (action) { return action(vs2, dt2); } else if (!vs2.empty()) { return vs2[0]; } return std::any(); }; auto action_se = scope_exit([&]() { rule.action = std::move(action); }); auto i = len; while (i < n) { std::vector save_values(vs.begin(), vs.end()); auto save_tokens = vs.tokens; auto chvs = c.push_semantic_values_scope(); auto chlen = binop_->parse(s + i, n - i, chvs, c, dt); c.pop_semantic_values_scope(); if (fail(chlen)) { break; } auto it = info_.find(tok); if (it == info_.end()) { break; } auto level = std::get<0>(it->second); auto assoc = std::get<1>(it->second); if (level < min_prec) { break; } vs.emplace_back(std::move(chvs[0])); i += chlen; auto next_min_prec = level; if (assoc == 'L') { next_min_prec = level + 1; } chvs = c.push_semantic_values_scope(); chlen = parse_expression(s + i, n - i, chvs, c, dt, next_min_prec); c.pop_semantic_values_scope(); if (fail(chlen)) { vs.assign(save_values.begin(), save_values.end()); vs.tokens = save_tokens; i = chlen; break; } vs.emplace_back(std::move(chvs[0])); i += chlen; std::any val; if (rule_.action) { vs.sv_ = std::string_view(s, i); val = rule_.action(vs, dt); } else if (!vs.empty()) { val = vs[0]; } vs.clear(); vs.emplace_back(std::move(val)); } return i; } inline size_t Recovery::parse_core(const char *s, size_t n, SemanticValues & /*vs*/, Context &c, std::any & /*dt*/) const { const auto &rule = dynamic_cast(*ope_); // Custom error message if (c.log) { auto label = dynamic_cast(rule.args_[0].get()); if (label && !label->rule_->error_message.empty()) { c.error_info.message_pos = s; c.error_info.message = label->rule_->error_message; c.error_info.label = label->rule_->name; } } // Recovery auto len = static_cast(-1); { auto save_log = c.log; c.log = nullptr; auto se = scope_exit([&]() { c.log = save_log; }); SemanticValues dummy_vs; std::any dummy_dt; len = rule.parse(s, n, dummy_vs, c, dummy_dt); } if (success(len)) { c.recovered = true; if (c.log) { c.error_info.output_log(c.log, c.s, c.l); c.error_info.clear(); } } // Cut if (!c.cut_stack.empty()) { c.cut_stack.back() = true; if (c.cut_stack.size() == 1) { // TODO: Remove unneeded entries in packrat memoise table } } return len; } inline void Sequence::accept(Visitor &v) { v.visit(*this); } inline void PrioritizedChoice::accept(Visitor &v) { v.visit(*this); } inline void Repetition::accept(Visitor &v) { v.visit(*this); } inline void AndPredicate::accept(Visitor &v) { v.visit(*this); } inline void NotPredicate::accept(Visitor &v) { v.visit(*this); } inline void Dictionary::accept(Visitor &v) { v.visit(*this); } inline void LiteralString::accept(Visitor &v) { v.visit(*this); } inline void CharacterClass::accept(Visitor &v) { v.visit(*this); } inline void Character::accept(Visitor &v) { v.visit(*this); } inline void AnyCharacter::accept(Visitor &v) { v.visit(*this); } inline void CaptureScope::accept(Visitor &v) { v.visit(*this); } inline void Capture::accept(Visitor &v) { v.visit(*this); } inline void TokenBoundary::accept(Visitor &v) { v.visit(*this); } inline void Ignore::accept(Visitor &v) { v.visit(*this); } inline void User::accept(Visitor &v) { v.visit(*this); } inline void WeakHolder::accept(Visitor &v) { v.visit(*this); } inline void Holder::accept(Visitor &v) { v.visit(*this); } inline void Reference::accept(Visitor &v) { v.visit(*this); } inline void Whitespace::accept(Visitor &v) { v.visit(*this); } inline void BackReference::accept(Visitor &v) { v.visit(*this); } inline void PrecedenceClimbing::accept(Visitor &v) { v.visit(*this); } inline void Recovery::accept(Visitor &v) { v.visit(*this); } inline void Cut::accept(Visitor &v) { v.visit(*this); } inline void AssignIDToDefinition::visit(Holder &ope) { auto p = static_cast(ope.outer_); if (ids.count(p)) { return; } auto id = ids.size(); ids[p] = id; ope.outer_->id = id; ope.ope_->accept(*this); } inline void AssignIDToDefinition::visit(Reference &ope) { if (ope.rule_) { for (auto arg : ope.args_) { arg->accept(*this); } ope.rule_->accept(*this); } } inline void AssignIDToDefinition::visit(PrecedenceClimbing &ope) { ope.atom_->accept(*this); ope.binop_->accept(*this); } inline void TokenChecker::visit(Reference &ope) { if (ope.is_macro_) { for (auto arg : ope.args_) { arg->accept(*this); } } else { has_rule_ = true; } } inline void FindLiteralToken::visit(Reference &ope) { if (ope.is_macro_) { ope.rule_->accept(*this); for (auto arg : ope.args_) { arg->accept(*this); } } } inline void DetectLeftRecursion::visit(Reference &ope) { if (ope.name_ == name_) { error_s = ope.s_; } else if (!refs_.count(ope.name_)) { refs_.insert(ope.name_); if (ope.rule_) { ope.rule_->accept(*this); if (done_ == false) { return; } } } done_ = true; } inline void HasEmptyElement::visit(Sequence &ope) { auto save_is_empty = false; const char *save_error_s = nullptr; std::string save_error_name; auto it = ope.opes_.begin(); while (it != ope.opes_.end()) { (*it)->accept(*this); if (!is_empty) { ++it; while (it != ope.opes_.end()) { DetectInfiniteLoop vis(refs_, has_error_cache_); (*it)->accept(vis); if (vis.has_error) { is_empty = true; error_s = vis.error_s; error_name = vis.error_name; } ++it; } return; } save_is_empty = is_empty; save_error_s = error_s; save_error_name = error_name; is_empty = false; error_name.clear(); ++it; } is_empty = save_is_empty; error_s = save_error_s; error_name = save_error_name; } inline void HasEmptyElement::visit(Reference &ope) { auto it = std::find_if(refs_.begin(), refs_.end(), [&](const std::pair &ref) { return ope.name_ == ref.second; }); if (it != refs_.end()) { return; } if (ope.rule_) { refs_.emplace_back(ope.s_, ope.name_); ope.rule_->accept(*this); refs_.pop_back(); } } inline void DetectInfiniteLoop::visit(Reference &ope) { auto it = std::find_if(refs_.begin(), refs_.end(), [&](const std::pair &ref) { return ope.name_ == ref.second; }); if (it != refs_.end()) { return; } if (ope.rule_) { auto it = has_error_cache_.find(ope.name_); if (it != has_error_cache_.end()) { has_error = it->second; } else { refs_.emplace_back(ope.s_, ope.name_); ope.rule_->accept(*this); refs_.pop_back(); has_error_cache_[ope.name_] = has_error; } } if (ope.is_macro_) { for (auto arg : ope.args_) { arg->accept(*this); } } } inline void ReferenceChecker::visit(Reference &ope) { auto it = std::find(params_.begin(), params_.end(), ope.name_); if (it != params_.end()) { return; } if (!grammar_.count(ope.name_)) { error_s[ope.name_] = ope.s_; error_message[ope.name_] = "'" + ope.name_ + "' is not defined."; } else { if (!referenced.count(ope.name_)) { referenced.insert(ope.name_); } const auto &rule = grammar_.at(ope.name_); if (rule.is_macro) { if (!ope.is_macro_ || ope.args_.size() != rule.params.size()) { error_s[ope.name_] = ope.s_; error_message[ope.name_] = "incorrect number of arguments."; } } else if (ope.is_macro_) { error_s[ope.name_] = ope.s_; error_message[ope.name_] = "'" + ope.name_ + "' is not macro."; } for (auto arg : ope.args_) { arg->accept(*this); } } } inline void LinkReferences::visit(Reference &ope) { // Check if the reference is a macro parameter auto found_param = false; for (size_t i = 0; i < params_.size(); i++) { const auto ¶m = params_[i]; if (param == ope.name_) { ope.iarg_ = i; found_param = true; break; } } // Check if the reference is a definition rule if (!found_param && grammar_.count(ope.name_)) { auto &rule = grammar_.at(ope.name_); ope.rule_ = &rule; } for (auto arg : ope.args_) { arg->accept(*this); } } inline void FindReference::visit(Reference &ope) { for (size_t i = 0; i < args_.size(); i++) { const auto &name = params_[i]; if (name == ope.name_) { found_ope = args_[i]; return; } } found_ope = ope.shared_from_this(); } /*----------------------------------------------------------------------------- * PEG parser generator *---------------------------------------------------------------------------*/ using Rules = std::unordered_map>; class ParserGenerator { public: static std::shared_ptr parse(const char *s, size_t n, const Rules &rules, std::string &start, bool &enablePackratParsing, Log log) { return get_instance().perform_core(s, n, rules, start, enablePackratParsing, log); } static std::shared_ptr parse(const char *s, size_t n, std::string &start, bool &enablePackratParsing, Log log) { Rules dummy; return parse(s, n, dummy, start, enablePackratParsing, log); } // For debugging purpose static Grammar &grammar() { return get_instance().g; } private: static ParserGenerator &get_instance() { static ParserGenerator instance; return instance; } ParserGenerator() { make_grammar(); setup_actions(); } struct Instruction { std::string type; std::any data; std::string_view sv; }; struct Data { std::shared_ptr grammar; std::string start; const char *start_pos = nullptr; std::vector> duplicates_of_definition; std::vector> duplicates_of_instruction; std::map> instructions; std::vector> undefined_back_references; std::vector> captures_stack{{}}; std::set captures_in_current_definition; bool enablePackratParsing = true; Data() : grammar(std::make_shared()) {} }; void make_grammar() { // Setup PEG syntax parser g["Grammar"] <= seq(g["Spacing"], oom(g["Definition"]), g["EndOfFile"]); g["Definition"] <= cho(seq(g["Ignore"], g["IdentCont"], g["Parameters"], g["LEFTARROW"], g["Expression"], opt(g["Instruction"])), seq(g["Ignore"], g["Identifier"], g["LEFTARROW"], g["Expression"], opt(g["Instruction"]))); g["Expression"] <= seq(g["Sequence"], zom(seq(g["SLASH"], g["Sequence"]))); g["Sequence"] <= zom(cho(g["CUT"], g["Prefix"])); g["Prefix"] <= seq(opt(cho(g["AND"], g["NOT"])), g["SuffixWithLabel"]); g["SuffixWithLabel"] <= seq(g["Suffix"], opt(seq(g["LABEL"], g["Identifier"]))); g["Suffix"] <= seq(g["Primary"], opt(g["Loop"])); g["Loop"] <= cho(g["QUESTION"], g["STAR"], g["PLUS"], g["Repetition"]); g["Primary"] <= cho(seq(g["Ignore"], g["IdentCont"], g["Arguments"], npd(g["LEFTARROW"])), seq(g["Ignore"], g["Identifier"], npd(seq(opt(g["Parameters"]), g["LEFTARROW"]))), seq(g["OPEN"], g["Expression"], g["CLOSE"]), seq(g["BeginTok"], g["Expression"], g["EndTok"]), g["CapScope"], seq(g["BeginCap"], g["Expression"], g["EndCap"]), g["BackRef"], g["DictionaryI"], g["LiteralI"], g["Dictionary"], g["Literal"], g["NegatedClassI"], g["NegatedClass"], g["ClassI"], g["Class"], g["DOT"]); g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]); g["IdentCont"] <= tok(seq(g["IdentStart"], zom(g["IdentRest"]))); const static std::vector> range = { {0x0080, 0xFFFF}}; g["IdentStart"] <= seq(npd(lit(u8(u8"↑"))), npd(lit(u8(u8"⇑"))), cho(cls("a-zA-Z_%"), cls(range))); g["IdentRest"] <= cho(g["IdentStart"], cls("0-9")); g["Dictionary"] <= seq(g["LiteralD"], oom(seq(g["PIPE"], g["LiteralD"]))); g["DictionaryI"] <= seq(g["LiteralID"], oom(seq(g["PIPE"], g["LiteralID"]))); auto lit_ope = cho(seq(cls("'"), tok(zom(seq(npd(cls("'")), g["Char"]))), cls("'"), g["Spacing"]), seq(cls("\""), tok(zom(seq(npd(cls("\"")), g["Char"]))), cls("\""), g["Spacing"])); g["Literal"] <= lit_ope; g["LiteralD"] <= lit_ope; auto lit_case_ignore_ope = cho(seq(cls("'"), tok(zom(seq(npd(cls("'")), g["Char"]))), lit("'i"), g["Spacing"]), seq(cls("\""), tok(zom(seq(npd(cls("\"")), g["Char"]))), lit("\"i"), g["Spacing"])); g["LiteralI"] <= lit_case_ignore_ope; g["LiteralID"] <= lit_case_ignore_ope; // NOTE: The original Brian Ford's paper uses 'zom' instead of 'oom'. g["Class"] <= seq(chr('['), npd(chr('^')), tok(oom(seq(npd(chr(']')), g["Range"]))), chr(']'), g["Spacing"]); g["ClassI"] <= seq(chr('['), npd(chr('^')), tok(oom(seq(npd(chr(']')), g["Range"]))), lit("]i"), g["Spacing"]); g["NegatedClass"] <= seq(lit("[^"), tok(oom(seq(npd(chr(']')), g["Range"]))), chr(']'), g["Spacing"]); g["NegatedClassI"] <= seq(lit("[^"), tok(oom(seq(npd(chr(']')), g["Range"]))), lit("]i"), g["Spacing"]); // NOTE: This is different from The original Brian Ford's paper, and this // modification allows us to specify `[+-]` as a valid char class. g["Range"] <= cho(seq(g["Char"], chr('-'), npd(chr(']')), g["Char"]), g["Char"]); g["Char"] <= cho(seq(chr('\\'), cls("fnrtv'\"[]\\^")), seq(chr('\\'), cls("0-3"), cls("0-7"), cls("0-7")), seq(chr('\\'), cls("0-7"), opt(cls("0-7"))), seq(lit("\\x"), cls("0-9a-fA-F"), opt(cls("0-9a-fA-F"))), seq(lit("\\u"), cho(seq(cho(seq(chr('0'), cls("0-9a-fA-F")), lit("10")), rep(cls("0-9a-fA-F"), 4, 4)), rep(cls("0-9a-fA-F"), 4, 5))), seq(npd(chr('\\')), dot())); g["Repetition"] <= seq(g["BeginBracket"], g["RepetitionRange"], g["EndBracket"]); g["RepetitionRange"] <= cho(seq(g["Number"], g["COMMA"], g["Number"]), seq(g["Number"], g["COMMA"]), g["Number"], seq(g["COMMA"], g["Number"])); g["Number"] <= seq(oom(cls("0-9")), g["Spacing"]); g["CapScope"] <= seq(g["BeginCapScope"], g["Expression"], g["EndCapScope"]); g["LEFTARROW"] <= seq(cho(lit("<-"), lit(u8(u8"←"))), g["Spacing"]); ~g["SLASH"] <= seq(chr('/'), g["Spacing"]); ~g["PIPE"] <= seq(chr('|'), g["Spacing"]); g["AND"] <= seq(chr('&'), g["Spacing"]); g["NOT"] <= seq(chr('!'), g["Spacing"]); g["QUESTION"] <= seq(chr('?'), g["Spacing"]); g["STAR"] <= seq(chr('*'), g["Spacing"]); g["PLUS"] <= seq(chr('+'), g["Spacing"]); ~g["OPEN"] <= seq(chr('('), g["Spacing"]); ~g["CLOSE"] <= seq(chr(')'), g["Spacing"]); g["DOT"] <= seq(chr('.'), g["Spacing"]); g["CUT"] <= seq(lit(u8(u8"↑")), g["Spacing"]); ~g["LABEL"] <= seq(cho(chr('^'), lit(u8(u8"⇑"))), g["Spacing"]); ~g["Spacing"] <= zom(cho(g["Space"], g["Comment"])); g["Comment"] <= seq(chr('#'), zom(seq(npd(g["EndOfLine"]), dot())), g["EndOfLine"]); g["Space"] <= cho(chr(' '), chr('\t'), g["EndOfLine"]); g["EndOfLine"] <= cho(lit("\r\n"), chr('\n'), chr('\r')); g["EndOfFile"] <= npd(dot()); ~g["BeginTok"] <= seq(chr('<'), g["Spacing"]); ~g["EndTok"] <= seq(chr('>'), g["Spacing"]); ~g["BeginCapScope"] <= seq(chr('$'), chr('('), g["Spacing"]); ~g["EndCapScope"] <= seq(chr(')'), g["Spacing"]); g["BeginCap"] <= seq(chr('$'), tok(g["IdentCont"]), chr('<'), g["Spacing"]); ~g["EndCap"] <= seq(chr('>'), g["Spacing"]); g["BackRef"] <= seq(chr('$'), tok(g["IdentCont"]), g["Spacing"]); g["IGNORE"] <= chr('~'); g["Ignore"] <= opt(g["IGNORE"]); g["Parameters"] <= seq(g["OPEN"], g["Identifier"], zom(seq(g["COMMA"], g["Identifier"])), g["CLOSE"]); g["Arguments"] <= seq(g["OPEN"], g["Expression"], zom(seq(g["COMMA"], g["Expression"])), g["CLOSE"]); ~g["COMMA"] <= seq(chr(','), g["Spacing"]); // Instruction grammars g["Instruction"] <= seq(g["BeginBracket"], opt(seq(g["InstructionItem"], zom(seq(g["InstructionItemSeparator"], g["InstructionItem"])))), g["EndBracket"]); g["InstructionItem"] <= cho(g["PrecedenceClimbing"], g["ErrorMessage"], g["NoAstOpt"]); ~g["InstructionItemSeparator"] <= seq(chr(';'), g["Spacing"]); ~g["SpacesZom"] <= zom(g["Space"]); ~g["SpacesOom"] <= oom(g["Space"]); ~g["BeginBracket"] <= seq(chr('{'), g["Spacing"]); ~g["EndBracket"] <= seq(chr('}'), g["Spacing"]); // PrecedenceClimbing instruction g["PrecedenceClimbing"] <= seq(lit("precedence"), g["SpacesOom"], g["PrecedenceInfo"], zom(seq(g["SpacesOom"], g["PrecedenceInfo"])), g["SpacesZom"]); g["PrecedenceInfo"] <= seq(g["PrecedenceAssoc"], oom(seq(ign(g["SpacesOom"]), g["PrecedenceOpe"]))); g["PrecedenceOpe"] <= cho(seq(cls("'"), tok(zom(seq(npd(cho(g["Space"], cls("'"))), g["Char"]))), cls("'")), seq(cls("\""), tok(zom(seq(npd(cho(g["Space"], cls("\""))), g["Char"]))), cls("\"")), tok(oom(seq(npd(cho(g["PrecedenceAssoc"], g["Space"], chr('}'))), dot())))); g["PrecedenceAssoc"] <= cls("LR"); // Error message instruction g["ErrorMessage"] <= seq(lit("error_message"), g["SpacesOom"], g["LiteralD"], g["SpacesZom"]); // No Ast node optimization instruction g["NoAstOpt"] <= seq(lit("no_ast_opt"), g["SpacesZom"]); // Set definition names for (auto &x : g) { x.second.name = x.first; } } void setup_actions() { g["Definition"] = [&](const SemanticValues &vs, std::any &dt) { auto &data = *std::any_cast(dt); auto is_macro = vs.choice() == 0; auto ignore = std::any_cast(vs[0]); auto name = std::any_cast(vs[1]); std::vector params; std::shared_ptr ope; auto has_instructions = false; if (is_macro) { params = std::any_cast>(vs[2]); ope = std::any_cast>(vs[4]); if (vs.size() == 6) { has_instructions = true; } } else { ope = std::any_cast>(vs[3]); if (vs.size() == 5) { has_instructions = true; } } if (has_instructions) { auto index = is_macro ? 5 : 4; std::unordered_set types; for (const auto &instruction : std::any_cast>(vs[index])) { const auto &type = instruction.type; if (types.find(type) == types.end()) { data.instructions[name].push_back(instruction); types.insert(instruction.type); if (type == "declare_symbol" || type == "check_symbol") { if (!TokenChecker::is_token(*ope)) { ope = tok(ope); } } } else { data.duplicates_of_instruction.emplace_back(type, instruction.sv.data()); } } } auto &grammar = *data.grammar; if (!grammar.count(name)) { auto &rule = grammar[name]; rule <= ope; rule.name = name; rule.s_ = vs.sv().data(); rule.line_ = line_info(vs.ss, rule.s_); rule.ignoreSemanticValue = ignore; rule.is_macro = is_macro; rule.params = params; if (data.start.empty()) { data.start = rule.name; data.start_pos = rule.s_; } } else { data.duplicates_of_definition.emplace_back(name, vs.sv().data()); } }; g["Definition"].enter = [](const Context & /*c*/, const char * /*s*/, size_t /*n*/, std::any &dt) { auto &data = *std::any_cast(dt); data.captures_in_current_definition.clear(); }; g["Expression"] = [&](const SemanticValues &vs) { if (vs.size() == 1) { return std::any_cast>(vs[0]); } else { std::vector> opes; for (auto i = 0u; i < vs.size(); i++) { opes.emplace_back(std::any_cast>(vs[i])); } const std::shared_ptr ope = std::make_shared(opes); return ope; } }; g["Sequence"] = [&](const SemanticValues &vs) { if (vs.empty()) { return npd(lit("")); } else if (vs.size() == 1) { return std::any_cast>(vs[0]); } else { std::vector> opes; for (const auto &x : vs) { opes.emplace_back(std::any_cast>(x)); } const std::shared_ptr ope = std::make_shared(opes); return ope; } }; g["Prefix"] = [&](const SemanticValues &vs) { std::shared_ptr ope; if (vs.size() == 1) { ope = std::any_cast>(vs[0]); } else { assert(vs.size() == 2); auto tok = std::any_cast(vs[0]); ope = std::any_cast>(vs[1]); if (tok == '&') { ope = apd(ope); } else { // '!' ope = npd(ope); } } return ope; }; g["SuffixWithLabel"] = [&](const SemanticValues &vs, std::any &dt) { auto ope = std::any_cast>(vs[0]); if (vs.size() == 1) { return ope; } else { assert(vs.size() == 2); auto &data = *std::any_cast(dt); const auto &ident = std::any_cast(vs[1]); auto label = ref(*data.grammar, ident, vs.sv().data(), false, {}); auto recovery = rec(ref(*data.grammar, RECOVER_DEFINITION_NAME, vs.sv().data(), true, {label})); return cho4label_(ope, recovery); } }; struct Loop { enum class Type { opt = 0, zom, oom, rep }; Type type; std::pair range; }; g["Suffix"] = [&](const SemanticValues &vs) { auto ope = std::any_cast>(vs[0]); if (vs.size() == 1) { return ope; } else { assert(vs.size() == 2); auto loop = std::any_cast(vs[1]); switch (loop.type) { case Loop::Type::opt: return opt(ope); case Loop::Type::zom: return zom(ope); case Loop::Type::oom: return oom(ope); default: // Regex-like repetition return rep(ope, loop.range.first, loop.range.second); } } }; g["Loop"] = [&](const SemanticValues &vs) { switch (vs.choice()) { case 0: // Option return Loop{Loop::Type::opt, std::pair()}; case 1: // Zero or More return Loop{Loop::Type::zom, std::pair()}; case 2: // One or More return Loop{Loop::Type::oom, std::pair()}; default: // Regex-like repetition return Loop{Loop::Type::rep, std::any_cast>(vs[0])}; } }; g["Primary"] = [&](const SemanticValues &vs, std::any &dt) { auto &data = *std::any_cast(dt); switch (vs.choice()) { case 0: // Macro Reference case 1: { // Reference auto is_macro = vs.choice() == 0; auto ignore = std::any_cast(vs[0]); const auto &ident = std::any_cast(vs[1]); std::vector> args; if (is_macro) { args = std::any_cast>>(vs[2]); } auto ope = ref(*data.grammar, ident, vs.sv().data(), is_macro, args); if (ident == RECOVER_DEFINITION_NAME) { ope = rec(ope); } if (ignore) { return ign(ope); } else { return ope; } } case 2: { // (Expression) return std::any_cast>(vs[0]); } case 3: { // TokenBoundary return tok(std::any_cast>(vs[0])); } case 4: { // CaptureScope return csc(std::any_cast>(vs[0])); } case 5: { // Capture const auto &name = std::any_cast(vs[0]); auto ope = std::any_cast>(vs[1]); data.captures_stack.back().insert(name); data.captures_in_current_definition.insert(name); return cap(ope, [name](const char *a_s, size_t a_n, Context &c) { auto &cs = c.capture_scope_stack[c.capture_scope_stack_size - 1]; cs[name] = std::string(a_s, a_n); }); } default: { return std::any_cast>(vs[0]); } } }; g["IdentCont"] = [](const SemanticValues &vs) { return std::string(vs.sv().data(), vs.sv().length()); }; g["Dictionary"] = [](const SemanticValues &vs) { auto items = vs.transform(); return dic(items, false); }; g["DictionaryI"] = [](const SemanticValues &vs) { auto items = vs.transform(); return dic(items, true); }; g["Literal"] = [](const SemanticValues &vs) { const auto &tok = vs.tokens.front(); return lit(resolve_escape_sequence(tok.data(), tok.size())); }; g["LiteralI"] = [](const SemanticValues &vs) { const auto &tok = vs.tokens.front(); return liti(resolve_escape_sequence(tok.data(), tok.size())); }; g["LiteralD"] = [](const SemanticValues &vs) { auto &tok = vs.tokens.front(); return resolve_escape_sequence(tok.data(), tok.size()); }; g["LiteralID"] = [](const SemanticValues &vs) { auto &tok = vs.tokens.front(); return resolve_escape_sequence(tok.data(), tok.size()); }; g["Class"] = [](const SemanticValues &vs) { auto ranges = vs.transform>(); return cls(ranges); }; g["ClassI"] = [](const SemanticValues &vs) { auto ranges = vs.transform>(); return cls(ranges, true); }; g["NegatedClass"] = [](const SemanticValues &vs) { auto ranges = vs.transform>(); return ncls(ranges); }; g["NegatedClassI"] = [](const SemanticValues &vs) { auto ranges = vs.transform>(); return ncls(ranges, true); }; g["Range"] = [](const SemanticValues &vs) { switch (vs.choice()) { case 0: { auto s1 = std::any_cast(vs[0]); auto s2 = std::any_cast(vs[1]); auto cp1 = decode_codepoint(s1.data(), s1.length()); auto cp2 = decode_codepoint(s2.data(), s2.length()); return std::pair(cp1, cp2); } case 1: { auto s = std::any_cast(vs[0]); auto cp = decode_codepoint(s.data(), s.length()); return std::pair(cp, cp); } } return std::pair(0, 0); }; g["Char"] = [](const SemanticValues &vs) { return resolve_escape_sequence(vs.sv().data(), vs.sv().length()); }; g["RepetitionRange"] = [&](const SemanticValues &vs) { switch (vs.choice()) { case 0: { // Number COMMA Number auto min = std::any_cast(vs[0]); auto max = std::any_cast(vs[1]); return std::pair(min, max); } case 1: // Number COMMA return std::pair(std::any_cast(vs[0]), std::numeric_limits::max()); case 2: { // Number auto n = std::any_cast(vs[0]); return std::pair(n, n); } default: // COMMA Number return std::pair(std::numeric_limits::min(), std::any_cast(vs[0])); } }; g["Number"] = [&](const SemanticValues &vs) { return vs.token_to_number(); }; g["CapScope"].enter = [](const Context & /*c*/, const char * /*s*/, size_t /*n*/, std::any &dt) { auto &data = *std::any_cast(dt); data.captures_stack.emplace_back(); }; g["CapScope"].leave = [](const Context & /*c*/, const char * /*s*/, size_t /*n*/, size_t /*matchlen*/, std::any & /*value*/, std::any &dt) { auto &data = *std::any_cast(dt); data.captures_stack.pop_back(); }; g["AND"] = [](const SemanticValues &vs) { return *vs.sv().data(); }; g["NOT"] = [](const SemanticValues &vs) { return *vs.sv().data(); }; g["QUESTION"] = [](const SemanticValues &vs) { return *vs.sv().data(); }; g["STAR"] = [](const SemanticValues &vs) { return *vs.sv().data(); }; g["PLUS"] = [](const SemanticValues &vs) { return *vs.sv().data(); }; g["DOT"] = [](const SemanticValues & /*vs*/) { return dot(); }; g["CUT"] = [](const SemanticValues & /*vs*/) { return cut(); }; g["BeginCap"] = [](const SemanticValues &vs) { return vs.token(); }; g["BackRef"] = [&](const SemanticValues &vs, std::any &dt) { auto &data = *std::any_cast(dt); // Undefined back reference check { auto found = false; auto it = data.captures_stack.rbegin(); while (it != data.captures_stack.rend()) { if (it->find(vs.token()) != it->end()) { found = true; break; } ++it; } if (!found) { auto ptr = vs.token().data() - 1; // include '$' symbol data.undefined_back_references.emplace_back(vs.token(), ptr); } } // NOTE: Disable packrat parsing if a back reference is not defined in // captures in the current definition rule. if (data.captures_in_current_definition.find(vs.token()) == data.captures_in_current_definition.end()) { data.enablePackratParsing = false; } return bkr(vs.token_to_string()); }; g["Ignore"] = [](const SemanticValues &vs) { return vs.size() > 0; }; g["Parameters"] = [](const SemanticValues &vs) { return vs.transform(); }; g["Arguments"] = [](const SemanticValues &vs) { return vs.transform>(); }; g["PrecedenceClimbing"] = [](const SemanticValues &vs) { PrecedenceClimbing::BinOpeInfo binOpeInfo; size_t level = 1; for (auto v : vs) { auto tokens = std::any_cast>(v); auto assoc = tokens[0][0]; for (size_t i = 1; i < tokens.size(); i++) { binOpeInfo[tokens[i]] = std::pair(level, assoc); } level++; } Instruction instruction; instruction.type = "precedence"; instruction.data = binOpeInfo; instruction.sv = vs.sv(); return instruction; }; g["PrecedenceInfo"] = [](const SemanticValues &vs) { return vs.transform(); }; g["PrecedenceOpe"] = [](const SemanticValues &vs) { return vs.token(); }; g["PrecedenceAssoc"] = [](const SemanticValues &vs) { return vs.token(); }; g["ErrorMessage"] = [](const SemanticValues &vs) { Instruction instruction; instruction.type = "error_message"; instruction.data = std::any_cast(vs[0]); instruction.sv = vs.sv(); return instruction; }; g["NoAstOpt"] = [](const SemanticValues &vs) { Instruction instruction; instruction.type = "no_ast_opt"; instruction.sv = vs.sv(); return instruction; }; g["Instruction"] = [](const SemanticValues &vs) { return vs.transform(); }; } bool apply_precedence_instruction(Definition &rule, const PrecedenceClimbing::BinOpeInfo &info, const char *s, Log log) { try { auto &seq = dynamic_cast(*rule.get_core_operator()); auto atom = seq.opes_[0]; auto &rep = dynamic_cast(*seq.opes_[1]); auto &seq1 = dynamic_cast(*rep.ope_); auto binop = seq1.opes_[0]; auto atom1 = seq1.opes_[1]; auto atom_name = dynamic_cast(*atom).name_; auto binop_name = dynamic_cast(*binop).name_; auto atom1_name = dynamic_cast(*atom1).name_; if (!rep.is_zom() || atom_name != atom1_name || atom_name == binop_name) { if (log) { auto line = line_info(s, rule.s_); log(line.first, line.second, "'precedence' instruction cannot be applied to '" + rule.name + "'.", ""); } return false; } rule.holder_->ope_ = pre(atom, binop, info, rule); rule.disable_action = true; } catch (...) { if (log) { auto line = line_info(s, rule.s_); log(line.first, line.second, "'precedence' instruction cannot be applied to '" + rule.name + "'.", ""); } return false; } return true; } std::shared_ptr perform_core(const char *s, size_t n, const Rules &rules, std::string &start, bool &enablePackratParsing, Log log) { Data data; auto &grammar = *data.grammar; // Built-in macros { // `%recover` { auto &rule = grammar[RECOVER_DEFINITION_NAME]; rule <= ref(grammar, "x", "", false, {}); rule.name = RECOVER_DEFINITION_NAME; rule.s_ = "[native]"; rule.ignoreSemanticValue = true; rule.is_macro = true; rule.params = {"x"}; } } std::any dt = &data; auto r = g["Grammar"].parse(s, n, dt, nullptr, log); if (!r.ret) { if (log) { if (r.error_info.message_pos) { auto line = line_info(s, r.error_info.message_pos); log(line.first, line.second, r.error_info.message, r.error_info.label); } else { auto line = line_info(s, r.error_info.error_pos); log(line.first, line.second, "syntax error", r.error_info.label); } } return nullptr; } // User provided rules for (auto [user_name, user_rule] : rules) { auto name = user_name; auto ignore = false; if (!name.empty() && name[0] == '~') { ignore = true; name.erase(0, 1); } if (!name.empty()) { auto &rule = grammar[name]; rule <= user_rule; rule.name = name; rule.ignoreSemanticValue = ignore; } } // Check duplicated definitions auto ret = true; if (!data.duplicates_of_definition.empty()) { for (const auto &[name, ptr] : data.duplicates_of_definition) { if (log) { auto line = line_info(s, ptr); log(line.first, line.second, "The definition '" + name + "' is already defined.", ""); } } ret = false; } // Check duplicated instructions if (!data.duplicates_of_instruction.empty()) { for (const auto &[type, ptr] : data.duplicates_of_instruction) { if (log) { auto line = line_info(s, ptr); log(line.first, line.second, "The instruction '" + type + "' is already defined.", ""); } } ret = false; } // Check undefined back references if (!data.undefined_back_references.empty()) { for (const auto &[name, ptr] : data.undefined_back_references) { if (log) { auto line = line_info(s, ptr); log(line.first, line.second, "The back reference '" + name + "' is undefined.", ""); } } ret = false; } // Set root definition auto &start_rule = grammar[data.start]; // Check if the start rule has ignore operator { if (start_rule.ignoreSemanticValue) { if (log) { auto line = line_info(s, start_rule.s_); log(line.first, line.second, "Ignore operator cannot be applied to '" + start_rule.name + "'.", ""); } ret = false; } } if (!ret) { return nullptr; } // Check missing definitions auto referenced = std::unordered_set{ WHITESPACE_DEFINITION_NAME, WORD_DEFINITION_NAME, RECOVER_DEFINITION_NAME, start_rule.name, }; for (auto &[_, rule] : grammar) { ReferenceChecker vis(grammar, rule.params); rule.accept(vis); referenced.insert(vis.referenced.begin(), vis.referenced.end()); for (const auto &[name, ptr] : vis.error_s) { if (log) { auto line = line_info(s, ptr); log(line.first, line.second, vis.error_message[name], ""); } ret = false; } } for (auto &[name, rule] : grammar) { if (!referenced.count(name)) { if (log) { auto line = line_info(s, rule.s_); auto msg = "'" + name + "' is not referenced."; log(line.first, line.second, msg, ""); } } } if (!ret) { return nullptr; } // Link references for (auto &x : grammar) { auto &rule = x.second; LinkReferences vis(grammar, rule.params); rule.accept(vis); } // Check left recursion ret = true; for (auto &[name, rule] : grammar) { DetectLeftRecursion vis(name); rule.accept(vis); if (vis.error_s) { if (log) { auto line = line_info(s, vis.error_s); log(line.first, line.second, "'" + name + "' is left recursive.", ""); } ret = false; } } if (!ret) { return nullptr; } // Check infinite loop if (detect_infiniteLoop(data, start_rule, log, s)) { return nullptr; } // Automatic whitespace skipping if (grammar.count(WHITESPACE_DEFINITION_NAME)) { for (auto &x : grammar) { auto &rule = x.second; auto ope = rule.get_core_operator(); if (IsLiteralToken::check(*ope)) { rule <= tok(ope); } } auto &rule = grammar[WHITESPACE_DEFINITION_NAME]; start_rule.whitespaceOpe = wsp(rule.get_core_operator()); if (detect_infiniteLoop(data, rule, log, s)) { return nullptr; } } // Word expression if (grammar.count(WORD_DEFINITION_NAME)) { auto &rule = grammar[WORD_DEFINITION_NAME]; start_rule.wordOpe = rule.get_core_operator(); if (detect_infiniteLoop(data, rule, log, s)) { return nullptr; } } // Apply instructions for (const auto &[name, instructions] : data.instructions) { auto &rule = grammar[name]; for (const auto &instruction : instructions) { if (instruction.type == "precedence") { const auto &info = std::any_cast(instruction.data); if (!apply_precedence_instruction(rule, info, s, log)) { return nullptr; } } else if (instruction.type == "error_message") { rule.error_message = std::any_cast(instruction.data); } else if (instruction.type == "no_ast_opt") { rule.no_ast_opt = true; } } } // Set root definition start = data.start; enablePackratParsing = data.enablePackratParsing; return data.grammar; } bool detect_infiniteLoop(const Data &data, Definition &rule, const Log &log, const char *s) const { std::vector> refs; std::unordered_map has_error_cache; DetectInfiniteLoop vis(data.start_pos, rule.name, refs, has_error_cache); rule.accept(vis); if (vis.has_error) { if (log) { auto line = line_info(s, vis.error_s); log(line.first, line.second, "infinite loop is detected in '" + vis.error_name + "'.", ""); } return true; } return false; } Grammar g; }; /*----------------------------------------------------------------------------- * AST *---------------------------------------------------------------------------*/ template struct AstBase : public Annotation { AstBase(const char *path, size_t line, size_t column, const char *name, const std::vector> &nodes, size_t position = 0, size_t length = 0, size_t choice_count = 0, size_t choice = 0) : path(path ? path : ""), line(line), column(column), name(name), position(position), length(length), choice_count(choice_count), choice(choice), original_name(name), original_choice_count(choice_count), original_choice(choice), tag(str2tag(name)), original_tag(tag), is_token(false), nodes(nodes) {} AstBase(const char *path, size_t line, size_t column, const char *name, const std::string_view &token, size_t position = 0, size_t length = 0, size_t choice_count = 0, size_t choice = 0) : path(path ? path : ""), line(line), column(column), name(name), position(position), length(length), choice_count(choice_count), choice(choice), original_name(name), original_choice_count(choice_count), original_choice(choice), tag(str2tag(name)), original_tag(tag), is_token(true), token(token) {} AstBase(const AstBase &ast, const char *original_name, size_t position = 0, size_t length = 0, size_t original_choice_count = 0, size_t original_choice = 0) : path(ast.path), line(ast.line), column(ast.column), name(ast.name), position(position), length(length), choice_count(ast.choice_count), choice(ast.choice), original_name(original_name), original_choice_count(original_choice_count), original_choice(original_choice), tag(ast.tag), original_tag(str2tag(original_name)), is_token(ast.is_token), token(ast.token), nodes(ast.nodes), parent(ast.parent) {} const std::string path; const size_t line = 1; const size_t column = 1; const std::string name; size_t position; size_t length; const size_t choice_count; const size_t choice; const std::string original_name; const size_t original_choice_count; const size_t original_choice; const unsigned int tag; const unsigned int original_tag; const bool is_token; const std::string_view token; std::vector>> nodes; std::weak_ptr> parent; std::string token_to_string() const { assert(is_token); return std::string(token); } template T token_to_number() const { return token_to_number_(token); } }; template void ast_to_s_core(const std::shared_ptr &ptr, std::string &s, int level, std::function fn) { const auto &ast = *ptr; for (auto i = 0; i < level; i++) { s += " "; } auto name = ast.original_name; if (ast.original_choice_count > 0) { name += "/" + std::to_string(ast.original_choice); } if (ast.name != ast.original_name) { name += "[" + ast.name + "]"; } if (ast.is_token) { s += "- " + name + " ("; s += ast.token; s += ")\n"; } else { s += "+ " + name + "\n"; } if (fn) { s += fn(ast, level + 1); } for (auto node : ast.nodes) { ast_to_s_core(node, s, level + 1, fn); } } template std::string ast_to_s(const std::shared_ptr &ptr, std::function fn = nullptr) { std::string s; ast_to_s_core(ptr, s, 0, fn); return s; } struct AstOptimizer { AstOptimizer(bool mode, const std::vector &rules = {}) : mode_(mode), rules_(rules) {} template std::shared_ptr optimize(std::shared_ptr original, std::shared_ptr parent = nullptr) { auto found = std::find(rules_.begin(), rules_.end(), original->name) != rules_.end(); auto opt = mode_ ? !found : found; if (opt && original->nodes.size() == 1) { auto child = optimize(original->nodes[0], parent); auto ast = std::make_shared(*child, original->name.data(), original->choice_count, original->position, original->length, original->choice); for (auto node : ast->nodes) { node->parent = ast; } return ast; } auto ast = std::make_shared(*original); ast->parent = parent; ast->nodes.clear(); for (auto node : original->nodes) { auto child = optimize(node, ast); ast->nodes.push_back(child); } return ast; } private: const bool mode_; const std::vector rules_; }; struct EmptyType {}; using Ast = AstBase; template void add_ast_action(Definition &rule) { rule.action = [&](const SemanticValues &vs) { auto line = vs.line_info(); if (rule.is_token()) { return std::make_shared( vs.path, line.first, line.second, rule.name.data(), vs.token(), std::distance(vs.ss, vs.sv().data()), vs.sv().length(), vs.choice_count(), vs.choice()); } auto ast = std::make_shared(vs.path, line.first, line.second, rule.name.data(), vs.transform>(), std::distance(vs.ss, vs.sv().data()), vs.sv().length(), vs.choice_count(), vs.choice()); for (auto node : ast->nodes) { node->parent = ast; } return ast; }; } #define PEG_EXPAND(...) __VA_ARGS__ #define PEG_CONCAT(a, b) a##b #define PEG_CONCAT2(a, b) PEG_CONCAT(a, b) #define PEG_PICK( \ a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, \ a17, a18, a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, \ a32, a33, a34, a35, a36, a37, a38, a39, a40, a41, a42, a43, a44, a45, a46, \ a47, a48, a49, a50, a51, a52, a53, a54, a55, a56, a57, a58, a59, a60, a61, \ a62, a63, a64, a65, a66, a67, a68, a69, a70, a71, a72, a73, a74, a75, a76, \ a77, a78, a79, a80, a81, a82, a83, a84, a85, a86, a87, a88, a89, a90, a91, \ a92, a93, a94, a95, a96, a97, a98, a99, a100, ...) \ a100 #define PEG_COUNT(...) \ PEG_EXPAND(PEG_PICK( \ __VA_ARGS__, 100, 99, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 87, \ 86, 85, 84, 83, 82, 81, 80, 79, 78, 77, 76, 75, 74, 73, 72, 71, 70, 69, \ 68, 67, 66, 65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, \ 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, \ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, \ 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)) #define PEG_DEF_1(r) \ peg::Definition r; \ r.name = #r; \ peg::add_ast_action(r); #define PEG_DEF_2(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_1(__VA_ARGS__)) #define PEG_DEF_3(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_2(__VA_ARGS__)) #define PEG_DEF_4(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_3(__VA_ARGS__)) #define PEG_DEF_5(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_4(__VA_ARGS__)) #define PEG_DEF_6(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_5(__VA_ARGS__)) #define PEG_DEF_7(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_6(__VA_ARGS__)) #define PEG_DEF_8(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_7(__VA_ARGS__)) #define PEG_DEF_9(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_8(__VA_ARGS__)) #define PEG_DEF_10(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_9(__VA_ARGS__)) #define PEG_DEF_11(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_10(__VA_ARGS__)) #define PEG_DEF_12(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_11(__VA_ARGS__)) #define PEG_DEF_13(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_12(__VA_ARGS__)) #define PEG_DEF_14(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_13(__VA_ARGS__)) #define PEG_DEF_15(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_14(__VA_ARGS__)) #define PEG_DEF_16(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_15(__VA_ARGS__)) #define PEG_DEF_17(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_16(__VA_ARGS__)) #define PEG_DEF_18(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_17(__VA_ARGS__)) #define PEG_DEF_19(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_18(__VA_ARGS__)) #define PEG_DEF_20(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_19(__VA_ARGS__)) #define PEG_DEF_21(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_20(__VA_ARGS__)) #define PEG_DEF_22(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_21(__VA_ARGS__)) #define PEG_DEF_23(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_22(__VA_ARGS__)) #define PEG_DEF_24(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_23(__VA_ARGS__)) #define PEG_DEF_25(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_24(__VA_ARGS__)) #define PEG_DEF_26(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_25(__VA_ARGS__)) #define PEG_DEF_27(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_26(__VA_ARGS__)) #define PEG_DEF_28(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_27(__VA_ARGS__)) #define PEG_DEF_29(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_28(__VA_ARGS__)) #define PEG_DEF_30(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_29(__VA_ARGS__)) #define PEG_DEF_31(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_30(__VA_ARGS__)) #define PEG_DEF_32(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_31(__VA_ARGS__)) #define PEG_DEF_33(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_32(__VA_ARGS__)) #define PEG_DEF_34(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_33(__VA_ARGS__)) #define PEG_DEF_35(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_34(__VA_ARGS__)) #define PEG_DEF_36(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_35(__VA_ARGS__)) #define PEG_DEF_37(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_36(__VA_ARGS__)) #define PEG_DEF_38(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_37(__VA_ARGS__)) #define PEG_DEF_39(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_38(__VA_ARGS__)) #define PEG_DEF_40(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_39(__VA_ARGS__)) #define PEG_DEF_41(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_40(__VA_ARGS__)) #define PEG_DEF_42(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_41(__VA_ARGS__)) #define PEG_DEF_43(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_42(__VA_ARGS__)) #define PEG_DEF_44(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_43(__VA_ARGS__)) #define PEG_DEF_45(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_44(__VA_ARGS__)) #define PEG_DEF_46(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_45(__VA_ARGS__)) #define PEG_DEF_47(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_46(__VA_ARGS__)) #define PEG_DEF_48(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_47(__VA_ARGS__)) #define PEG_DEF_49(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_48(__VA_ARGS__)) #define PEG_DEF_50(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_49(__VA_ARGS__)) #define PEG_DEF_51(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_50(__VA_ARGS__)) #define PEG_DEF_52(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_51(__VA_ARGS__)) #define PEG_DEF_53(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_52(__VA_ARGS__)) #define PEG_DEF_54(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_53(__VA_ARGS__)) #define PEG_DEF_55(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_54(__VA_ARGS__)) #define PEG_DEF_56(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_55(__VA_ARGS__)) #define PEG_DEF_57(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_56(__VA_ARGS__)) #define PEG_DEF_58(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_57(__VA_ARGS__)) #define PEG_DEF_59(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_58(__VA_ARGS__)) #define PEG_DEF_60(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_59(__VA_ARGS__)) #define PEG_DEF_61(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_60(__VA_ARGS__)) #define PEG_DEF_62(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_61(__VA_ARGS__)) #define PEG_DEF_63(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_62(__VA_ARGS__)) #define PEG_DEF_64(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_63(__VA_ARGS__)) #define PEG_DEF_65(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_64(__VA_ARGS__)) #define PEG_DEF_66(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_65(__VA_ARGS__)) #define PEG_DEF_67(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_66(__VA_ARGS__)) #define PEG_DEF_68(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_67(__VA_ARGS__)) #define PEG_DEF_69(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_68(__VA_ARGS__)) #define PEG_DEF_70(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_69(__VA_ARGS__)) #define PEG_DEF_71(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_70(__VA_ARGS__)) #define PEG_DEF_72(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_71(__VA_ARGS__)) #define PEG_DEF_73(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_72(__VA_ARGS__)) #define PEG_DEF_74(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_73(__VA_ARGS__)) #define PEG_DEF_75(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_74(__VA_ARGS__)) #define PEG_DEF_76(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_75(__VA_ARGS__)) #define PEG_DEF_77(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_76(__VA_ARGS__)) #define PEG_DEF_78(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_77(__VA_ARGS__)) #define PEG_DEF_79(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_78(__VA_ARGS__)) #define PEG_DEF_80(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_79(__VA_ARGS__)) #define PEG_DEF_81(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_80(__VA_ARGS__)) #define PEG_DEF_82(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_81(__VA_ARGS__)) #define PEG_DEF_83(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_82(__VA_ARGS__)) #define PEG_DEF_84(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_83(__VA_ARGS__)) #define PEG_DEF_85(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_84(__VA_ARGS__)) #define PEG_DEF_86(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_85(__VA_ARGS__)) #define PEG_DEF_87(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_86(__VA_ARGS__)) #define PEG_DEF_88(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_87(__VA_ARGS__)) #define PEG_DEF_89(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_88(__VA_ARGS__)) #define PEG_DEF_90(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_89(__VA_ARGS__)) #define PEG_DEF_91(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_90(__VA_ARGS__)) #define PEG_DEF_92(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_91(__VA_ARGS__)) #define PEG_DEF_93(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_92(__VA_ARGS__)) #define PEG_DEF_94(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_93(__VA_ARGS__)) #define PEG_DEF_95(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_94(__VA_ARGS__)) #define PEG_DEF_96(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_95(__VA_ARGS__)) #define PEG_DEF_97(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_96(__VA_ARGS__)) #define PEG_DEF_98(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_97(__VA_ARGS__)) #define PEG_DEF_99(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_98(__VA_ARGS__)) #define PEG_DEF_100(r1, ...) PEG_EXPAND(PEG_DEF_1(r1) PEG_DEF_99(__VA_ARGS__)) #define AST_DEFINITIONS(...) \ PEG_EXPAND(PEG_CONCAT2(PEG_DEF_, PEG_COUNT(__VA_ARGS__))(__VA_ARGS__)) /*----------------------------------------------------------------------------- * parser *---------------------------------------------------------------------------*/ class parser { public: parser() = default; parser(const char *s, size_t n, const Rules &rules) { load_grammar(s, n, rules); } parser(const char *s, size_t n) : parser(s, n, Rules()) {} parser(std::string_view sv, const Rules &rules) : parser(sv.data(), sv.size(), rules) {} parser(std::string_view sv) : parser(sv.data(), sv.size(), Rules()) {} #if defined(__cpp_lib_char8_t) parser(std::u8string_view sv, const Rules &rules) : parser(reinterpret_cast(sv.data()), sv.size(), rules) {} parser(std::u8string_view sv) : parser(reinterpret_cast(sv.data()), sv.size(), Rules()) {} #endif operator bool() { return grammar_ != nullptr; } bool load_grammar(const char *s, size_t n, const Rules &rules) { grammar_ = ParserGenerator::parse(s, n, rules, start_, enablePackratParsing_, log_); return grammar_ != nullptr; } bool load_grammar(const char *s, size_t n) { return load_grammar(s, n, Rules()); } bool load_grammar(std::string_view sv, const Rules &rules) { return load_grammar(sv.data(), sv.size(), rules); } bool load_grammar(std::string_view sv) { return load_grammar(sv.data(), sv.size()); } bool parse_n(const char *s, size_t n, const char *path = nullptr) const { if (grammar_ != nullptr) { const auto &rule = (*grammar_)[start_]; auto result = rule.parse(s, n, path, log_); return post_process(s, n, result); } return false; } bool parse_n(const char *s, size_t n, std::any &dt, const char *path = nullptr) const { if (grammar_ != nullptr) { const auto &rule = (*grammar_)[start_]; auto result = rule.parse(s, n, dt, path, log_); return post_process(s, n, result); } return false; } template bool parse_n(const char *s, size_t n, T &val, const char *path = nullptr) const { if (grammar_ != nullptr) { const auto &rule = (*grammar_)[start_]; auto result = rule.parse_and_get_value(s, n, val, path, log_); return post_process(s, n, result); } return false; } template bool parse_n(const char *s, size_t n, std::any &dt, T &val, const char *path = nullptr) const { if (grammar_ != nullptr) { const auto &rule = (*grammar_)[start_]; auto result = rule.parse_and_get_value(s, n, dt, val, path, log_); return post_process(s, n, result); } return false; } bool parse(std::string_view sv, const char *path = nullptr) const { return parse_n(sv.data(), sv.size(), path); } bool parse(std::string_view sv, std::any &dt, const char *path = nullptr) const { return parse_n(sv.data(), sv.size(), dt, path); } template bool parse(std::string_view sv, T &val, const char *path = nullptr) const { return parse_n(sv.data(), sv.size(), val, path); } template bool parse(std::string_view sv, std::any &dt, T &val, const char *path = nullptr) const { return parse_n(sv.data(), sv.size(), dt, val, path); } #if defined(__cpp_lib_char8_t) bool parse(std::u8string_view sv, const char *path = nullptr) const { return parse_n(reinterpret_cast(sv.data()), sv.size(), path); } bool parse(std::u8string_view sv, std::any &dt, const char *path = nullptr) const { return parse_n(reinterpret_cast(sv.data()), sv.size(), dt, path); } template bool parse(std::u8string_view sv, T &val, const char *path = nullptr) const { return parse_n(reinterpret_cast(sv.data()), sv.size(), val, path); } template bool parse(std::u8string_view sv, std::any &dt, T &val, const char *path = nullptr) const { return parse_n(reinterpret_cast(sv.data()), sv.size(), dt, val, path); } #endif Definition &operator[](const char *s) { return (*grammar_)[s]; } const Definition &operator[](const char *s) const { return (*grammar_)[s]; } const Grammar &get_grammar() const { return *grammar_; } void disable_eoi_check() { if (grammar_ != nullptr) { auto &rule = (*grammar_)[start_]; rule.eoi_check = false; } } void enable_packrat_parsing() { if (grammar_ != nullptr) { auto &rule = (*grammar_)[start_]; rule.enablePackratParsing = enablePackratParsing_ && true; } } void enable_trace(TracerEnter tracer_enter, TracerLeave tracer_leave) { if (grammar_ != nullptr) { auto &rule = (*grammar_)[start_]; rule.tracer_enter = tracer_enter; rule.tracer_leave = tracer_leave; } } void enable_trace(TracerEnter tracer_enter, TracerLeave tracer_leave, TracerStartOrEnd tracer_start, TracerStartOrEnd tracer_end) { if (grammar_ != nullptr) { auto &rule = (*grammar_)[start_]; rule.tracer_enter = tracer_enter; rule.tracer_leave = tracer_leave; rule.tracer_start = tracer_start; rule.tracer_end = tracer_end; } } void set_verbose_trace(bool verbose_trace) { if (grammar_ != nullptr) { auto &rule = (*grammar_)[start_]; rule.verbose_trace = verbose_trace; } } template parser &enable_ast() { for (auto &[_, rule] : *grammar_) { if (!rule.action) { add_ast_action(rule); } } return *this; } template std::shared_ptr optimize_ast(std::shared_ptr ast, bool opt_mode = true) const { return AstOptimizer(opt_mode, get_no_ast_opt_rules()).optimize(ast); } void set_logger(Log log) { log_ = log; } void set_logger( std::function log) { log_ = [log](size_t line, size_t col, const std::string &msg, const std::string & /*rule*/) { log(line, col, msg); }; } private: bool post_process(const char *s, size_t n, Definition::Result &r) const { if (log_ && !r.ret) { r.error_info.output_log(log_, s, n); } return r.ret && !r.recovered; } std::vector get_no_ast_opt_rules() const { std::vector rules; for (auto &[name, rule] : *grammar_) { if (rule.no_ast_opt) { rules.push_back(name); } } return rules; } std::shared_ptr grammar_; std::string start_; bool enablePackratParsing_ = false; Log log_; }; /*----------------------------------------------------------------------------- * enable_tracing *---------------------------------------------------------------------------*/ inline void enable_tracing(parser &parser, std::ostream &os) { parser.enable_trace( [&](auto &ope, auto s, auto, auto &, auto &c, auto &, auto &trace_data) { auto prev_pos = std::any_cast(trace_data); auto pos = static_cast(s - c.s); auto backtrack = (pos < prev_pos ? "*" : ""); std::string indent; auto level = c.trace_ids.size() - 1; while (level--) { indent += "│"; } std::string name; { name = peg::TraceOpeName::get(const_cast(ope)); auto lit = dynamic_cast(&ope); if (lit) { name += " '" + peg::escape_characters(lit->lit_) + "'"; } } os << "E " << pos + 1 << backtrack << "\t" << indent << "┌" << name << " #" << c.trace_ids.back() << std::endl; trace_data = static_cast(pos); }, [&](auto &ope, auto s, auto, auto &sv, auto &c, auto &, auto len, auto &) { auto pos = static_cast(s - c.s); if (len != static_cast(-1)) { pos += len; } std::string indent; auto level = c.trace_ids.size() - 1; while (level--) { indent += "│"; } auto ret = len != static_cast(-1) ? "└o " : "└x "; auto name = peg::TraceOpeName::get(const_cast(ope)); std::stringstream choice; if (sv.choice_count() > 0) { choice << " " << sv.choice() << "/" << sv.choice_count(); } std::string token; if (!sv.tokens.empty()) { token += ", token '"; token += sv.tokens[0]; token += "'"; } std::string matched; if (peg::success(len) && peg::TokenChecker::is_token(const_cast(ope))) { matched = ", match '" + peg::escape_characters(s, len) + "'"; } os << "L " << pos + 1 << "\t" << indent << ret << name << " #" << c.trace_ids.back() << choice.str() << token << matched << std::endl; }, [&](auto &trace_data) { trace_data = static_cast(0); }, [&](auto &) {}); } /*----------------------------------------------------------------------------- * enable_profiling *---------------------------------------------------------------------------*/ inline void enable_profiling(parser &parser, std::ostream &os) { struct Stats { struct Item { std::string name; size_t success; size_t fail; }; std::vector items; std::map index; size_t total = 0; std::chrono::steady_clock::time_point start; }; parser.enable_trace( [&](auto &ope, auto, auto, auto &, auto &, auto &, std::any &trace_data) { if (auto holder = dynamic_cast(&ope)) { auto &stats = *std::any_cast(trace_data); auto &name = holder->name(); if (stats.index.find(name) == stats.index.end()) { stats.index[name] = stats.index.size(); stats.items.push_back({name, 0, 0}); } stats.total++; } }, [&](auto &ope, auto, auto, auto &, auto &, auto &, auto len, std::any &trace_data) { if (auto holder = dynamic_cast(&ope)) { auto &stats = *std::any_cast(trace_data); auto &name = holder->name(); auto index = stats.index[name]; auto &stat = stats.items[index]; if (len != static_cast(-1)) { stat.success++; } else { stat.fail++; } if (index == 0) { auto end = std::chrono::steady_clock::now(); auto nano = std::chrono::duration_cast( end - stats.start) .count(); auto sec = nano / 1000000.0; os << "duration: " << sec << "s (" << nano << "µs)" << std::endl << std::endl; char buff[BUFSIZ]; size_t total_success = 0; size_t total_fail = 0; for (auto &[name, success, fail] : stats.items) { total_success += success; total_fail += fail; } os << " id total % success fail " "definition" << std::endl; auto grand_total = total_success + total_fail; snprintf(buff, BUFSIZ, "%4s %10zu %5s %10zu %10zu %s", "", grand_total, "", total_success, total_fail, "Total counters"); os << buff << std::endl; snprintf(buff, BUFSIZ, "%4s %10s %5s %10.2f %10.2f %s", "", "", "", total_success * 100.0 / grand_total, total_fail * 100.0 / grand_total, "% success/fail"); os << buff << std::endl << std::endl; ; size_t id = 0; for (auto &[name, success, fail] : stats.items) { auto total = success + fail; auto ratio = total * 100.0 / stats.total; snprintf(buff, BUFSIZ, "%4zu %10zu %5.2f %10zu %10zu %s", id, total, ratio, success, fail, name.c_str()); os << buff << std::endl; id++; } } } }, [&](auto &trace_data) { auto stats = new Stats{}; stats->start = std::chrono::steady_clock::now(); trace_data = stats; }, [&](auto &trace_data) { auto stats = std::any_cast(trace_data); delete stats; }); } } // namespace peg