// pest grammar for a line-oriented command file.
// The entry rule is `file` (at the bottom); it accepts empty lines, `--` meta
// comments, `#` comments and the commands from / add / modify / remove.
// No implicit WHITESPACE or COMMENT rule is defined in this file, so every
// piece of whitespace is matched explicitly via `ws` / `arg_ws`.

// BEGIN of GENERIC RULES

// insignificant whitespace, not repeated
ws = _{ " " | "\t" }

// `-- key=value` meta comments; only the keys listed in meta_attr_key are accepted
meta_attr_key     = ${ "name" | "version" | "overlay-set" }
meta_attr_value   = ${ string | char+ }
meta_key_pair     = @{ meta_attr_key ~ arg_ws? ~ "=" ~ arg_ws? ~ meta_attr_value }
meta_comment      = @{ "--" ~ ws* ~ (!NEWLINE ~ meta_key_pair)* }
meta_comment_line = _{ ws* ~ meta_comment ~ NEWLINE? }

// `#` comment running to the end of the line
comment      = @{ "#" ~ (!NEWLINE ~ ANY)* }
comment_line = _{ ws* ~ comment ~ NEWLINE? }

// a line holding nothing but optional blanks; the NEWLINE is mandatory
empty_line = @{ ws* ~ NEWLINE }

/// Double quoted string
double_quoted_string = _{ "\"" ~ inner ~ "\"" }
inner = @{
  (!("\"" | "\\" | "\u{0000}" | "\u{001F}") ~ ANY)* ~ (escape ~ inner)?
}
// a backslash, optionally followed by a recognised escape character
escape = @{
  "\\" ~ ("b" | "t" | "n" | "f" | "r" | "\"" | "\\" | "'" | unicode | NEWLINE)?
}
unicode = @{ "u" ~ ASCII_HEX_DIGIT{4} | "U" ~ ASCII_HEX_DIGIT{8} }

// single quoted string; same structure as the double quoted variant
single_quoted_string = _{ "'" ~ single_quoted_inner ~ "'" }
single_quoted_inner = @{
  (!("'" | "\\" | "\u{0000}" | "\u{001F}") ~ ANY)* ~ (escape ~ single_quoted_inner)?
}

string = { single_quoted_string | double_quoted_string }

// a line continuation, allowing an instruction to continue onto a new line
line_continuation = _{ "\\" ~ ws* ~ NEWLINE }

// whitespace that may appear between instruction arguments
// this allows instructions to expand past a newline if escaped
arg_ws = _{ (ws | line_continuation ~ (comment_line | empty_line)*)+ }

// like arg_ws, but where whitespace is optional
arg_ws_maybe = _{ (ws | line_continuation ~ (comment_line | empty_line)*)* }

// continues consuming input beyond a newline, if the newline is preceded by an
// escape (\)
// these tokens need to be preserved in the final tree so they can be handled
// appropriately; pest's ignore rules aren't sufficient for our needs
any_content = @{ ( !NEWLINE ~ !line_continuation ~ ANY )+ }
any_breakable = ${
  (
    // can be any comment string (no line continuation required)
    comment_line ~ any_breakable?
  ) | (
    // ... OR some piece of content, requiring a continuation EXCEPT on the
    // final line
    any_content ~ (line_continuation ~ any_breakable)?
  )
}

// consumes any character until the end of the line
any_eol = _{ (!NEWLINE ~ ANY)* }

// consumes all characters until the next whitespace
until_whitespace = _{ (!(NEWLINE | EOI | arg_ws) ~ ANY)+ }

// consumes identifier characters until the next whitespace
identifier_whitespace = _{ (!ws ~ (ASCII_ALPHANUMERIC | "_" | "-"))+ }

// consumes until whitespace or = (for key in key=value pairs)
any_equals = _{ (!(NEWLINE | ws | "=") ~ ANY)+ }

// END of GENERIC RULES

// one command, optionally terminated by a newline; keywords are case-insensitive
commands = _{ ( from | add | modify | remove ) ~ NEWLINE? }

from = { ^"from" ~ ws* ~ from_said }
add  = { ^"add" ~ arg_ws* ~ oca_object }
// `arg_ws*` lets the keyword be separated from its argument the same way `add`
// and `remove` allow. `char` contains no blank, so without it only input such
// as `modifyfoo` could match; that form is still accepted.
modify = { ^"modify" ~ arg_ws* ~ char+ }
remove = { ^"remove" ~ arg_ws* ~ remove_oca_object }

// alternation over pest's built-in Unicode script rules
SCRIPTS = {
    ADLAM | AHOM | ANATOLIAN_HIEROGLYPHS | ARABIC | ARMENIAN | AVESTAN
  | BALINESE | BAMUM | BASSA_VAH | BATAK | BENGALI | BHAIKSUKI | BOPOMOFO
  | BRAHMI | BRAILLE | BUGINESE | BUHID
  | CANADIAN_ABORIGINAL | CARIAN | CAUCASIAN_ALBANIAN | CHAKMA | CHAM
  | CHEROKEE | CHORASMIAN | COPTIC | CUNEIFORM | CYPRIOT | CYPRO_MINOAN
  | CYRILLIC
  | DESERET | DEVANAGARI | DIVES_AKURU | DOGRA | DUPLOYAN
  | EGYPTIAN_HIEROGLYPHS | ELBASAN | ELYMAIC | ETHIOPIC
  | GEORGIAN | GLAGOLITIC | GOTHIC | GRANTHA | GREEK | GUJARATI
  | GUNJALA_GONDI | GURMUKHI
  | HAN | HANGUL | HANIFI_ROHINGYA | HANUNOO | HATRAN | HEBREW | HIRAGANA
  | IMPERIAL_ARAMAIC | INHERITED | INSCRIPTIONAL_PAHLAVI
  | INSCRIPTIONAL_PARTHIAN
  | JAVANESE
  | KAITHI | KANNADA | KATAKANA | KAWI | KAYAH_LI | KHAROSHTHI
  | KHITAN_SMALL_SCRIPT | KHMER | KHOJKI | KHUDAWADI
  | LAO | LATIN | LEPCHA | LIMBU | LINEAR_A | LINEAR_B | LISU | LYCIAN
  | LYDIAN
  | MAHAJANI | MAKASAR | MALAYALAM | MANDAIC | MANICHAEAN | MARCHEN
  | MASARAM_GONDI | MEDEFAIDRIN | MEETEI_MAYEK | MENDE_KIKAKUI
  | MEROITIC_CURSIVE | MEROITIC_HIEROGLYPHS | MIAO | MODI | MONGOLIAN | MRO
  | MULTANI | MYANMAR
  | NABATAEAN | NAG_MUNDARI | NANDINAGARI | NEW_TAI_LUE | NEWA | NKO | NUSHU
  | NYIAKENG_PUACHUE_HMONG
  | OGHAM | OL_CHIKI | OLD_HUNGARIAN | OLD_ITALIC | OLD_NORTH_ARABIAN
  | OLD_PERMIC | OLD_PERSIAN | OLD_SOGDIAN | OLD_SOUTH_ARABIAN | OLD_TURKIC
  | OLD_UYGHUR | ORIYA | OSAGE | OSMANYA
  | PAHAWH_HMONG | PALMYRENE | PAU_CIN_HAU | PHAGS_PA | PHOENICIAN
  | PSALTER_PAHLAVI
  | REJANG | RUNIC
  | SAMARITAN | SAURASHTRA | SHARADA | SHAVIAN | SIDDHAM | SIGNWRITING
  | SINHALA | SOGDIAN | SORA_SOMPENG | SOYOMBO | SUNDANESE | SYLOTI_NAGRI
  | SYRIAC
  | TAGALOG | TAGBANWA | TAI_LE | TAI_THAM | TAI_VIET | TAKRI | TAMIL
  | TANGSA | TANGUT | TELUGU | THAANA | THAI | TIBETAN | TIFINAGH | TIRHUTA
  | TOTO
  | UGARITIC
  | VAI | VITHKUQI
  | WANCHO | WARANG_CITI
  | YEZIDI | YI
  | ZANABAZAR_SQUARE
}

// one character of an unquoted token: letters, numbers, a few punctuation
// marks, or any code point matched by SCRIPTS
char = _{ LETTER | NUMBER | "." | "-" | "_" | "/" | ":" | SCRIPTS }

from_said = { ws* ~ char+ }

oca_object        = _{ ( capture_base | overlay ) }
remove_oca_object = _{ ( remove_attribute | remove_overlay ) }

attrs_key = _{ ^"attrs" ~ arg_ws }

url      = ${ string }
json_key = ${ string }
// NOTE(review): `url` is tried after `string` and is itself just `string`, so
// the `url` alternative looks unreachable here - confirm before relying on it.
json_value = ${ string | url | json_object }
json_pair = ${
  json_key ~ arg_ws* ~ ":" ~ (arg_ws | NEWLINE)? ~ json_value
  ~ (arg_ws? ~ "," ~ (arg_ws | NEWLINE)?)?
}
json_object = ${
  "{"
  ~ ((arg_ws | NEWLINE)? ~ arg_ws* ~ json_pair ~ (arg_ws | NEWLINE)?)+
  ~ arg_ws*
  ~ "}"
}

capture_base = { ^"attribute" ~ attr_pairs+ }

overlay        = { overlay_header ~ overlay_body }
overlay_header = { ^"overlay" ~ arg_ws+ ~ overlay_name ~ NEWLINE+ }

remove_attribute = { ^"attribute" ~ (arg_ws ~ attr_key)* }

// overlay_body and nested_body share the same shape: one or more key=value
// lines or nested blocks, each introduced by leading blanks
overlay_body = { (kv_pair | nested_block)+ }
nested_body  = { (kv_pair | nested_block)+ }
kv_pair      = { ws+ ~ key_pair+ ~ NEWLINE* }
nested_block = { ws+ ~ attr_key ~ NEWLINE+ ~ nested_body+ }

// NOTE(review): `attr_key+` repeats with no separator between keys - confirm
// that this is intended.
remove_overlay = { ^"label" ~ arg_ws ~ lang ~ (arg_ws ~ attrs_key ~ attr_key+)? }

overlay_name = ${ string | char+ }
attr_key     = ${ string | char+ }

key_value  = { reference_type | string | array_type | char+ }
array_type = _{ "[" ~ arg_ws? ~ array? ~ arg_ws? ~ "]" }
array      = { key_value ~ (arg_ws? ~ "," ~ arg_ws? ~ key_value)* }
key_pair   = ${ attr_key ~ arg_ws? ~ "=" ~ arg_ws? ~ key_value }

base_attr_type = @{ ("Text" | "Numeric" | "Boolean" | "Binary" | "DateTime") }
// bracketed type; may nest, e.g. an array of arrays
array_attr_type = ${
  ( "[" ~ arg_ws? ~ (base_attr_type | reference_type | array_attr_type) ~ arg_ws? ~ "]" )
}

// references: `refs:` followed by a said, or `refn:` followed by an alias
reference_type = _{ ref_said | ref_alias }
alias     = @{ char+ }
said      = @{ char+ }
refs      = _{ ^"refs:" }
refn      = _{ ^"refn:" }
ref_said  = _{ refs ~ said }
ref_alias = _{ refn ~ alias }

/// Type of the attribute, can be a base type, an array of base types or a reference to another object
attr_type = ${ base_attr_type | array_attr_type | ref_said | ref_alias }
/// Capture base attribute key=type pair
attr_pair = @{ attr_key ~ arg_ws? ~ "=" ~ arg_ws? ~ attr_type }
/// List of capture base attributes
attr_pairs = ${ (arg_ws ~ attr_pair)+ ~ NEWLINE* }

// TODO add support for lang ISO
// currently: two ASCII letters, optionally followed by "-" and two more
lang = ${ ASCII_ALPHA{2} ~ ("-" ~ ASCII_ALPHA{2})? }

// Entry rule. The trailing `ws*` accepts a final line made only of blanks with
// no terminating newline, which `empty_line` cannot match because it requires
// NEWLINE.
file = {
  SOI
  ~ (empty_line | meta_comment_line | comment_line | commands)*
  ~ ws*
  ~ EOI
}