unit regexpr; { TRegExpr class library Delphi Regular Expressions Copyright (c) 1999-2004 Andrey V. Sorokin, St.Petersburg, Russia You can choose to use this Pascal unit in one of the two following licenses: Option 1> You may use this software in any kind of development, including comercial, redistribute, and modify it freely, under the following restrictions : 1. This software is provided as it is, without any kind of warranty given. Use it at Your own risk.The author is not responsible for any consequences of use of this software. 2. The origin of this software may not be mispresented, You must not claim that You wrote the original software. If You use this software in any kind of product, it would be appreciated that there in a information box, or in the documentation would be an acknowledgement like Partial Copyright (c) 2004 Andrey V. Sorokin https://sorokin.engineer/ andrey@sorokin.engineer 3. You may not have any income from distributing this source (or altered version of it) to other developers. When You use this product in a comercial package, the source may not be charged seperatly. 4. Altered versions must be plainly marked as such, and must not be misrepresented as being the original software. 5. RegExp Studio application and all the visual components as well as documentation is not part of the TRegExpr library and is not free for usage. https://sorokin.engineer/ andrey@sorokin.engineer Option 2> The same modified LGPL with static linking exception as the Free Pascal RTL } { program is essentially a linear encoding of a nondeterministic finite-state machine (aka syntax charts or "railroad normal form" in parsing technology). Each node is an opcode plus a "next" pointer, possibly plus an operand. "Next" pointers of all nodes except BRANCH implement concatenation; a "next" pointer with a BRANCH on both ends of it connects two alternatives. (Here we have one of the subtle syntax dependencies: an individual BRANCH (as opposed to a collection of them) is never concatenated with anything because of operator precedence.) The operand of some types of node is a literal string; for others, it is a node leading into a sub-FSM. In particular, the operand of a BRANCH node is the first node of the branch. (NB this is *not* a tree structure: the tail of the branch connects to the thing following the set of BRANCHes.) } interface { off $DEFINE DebugSynRegExpr } // ======== Determine compiler {$I regexpr_compilers.inc} // ======== Define base compiler options {$BOOLEVAL OFF} {$EXTENDEDSYNTAX ON} {$LONGSTRINGS ON} {$IFDEF FPC} {$MODE DELPHI} // Delphi-compatible mode in FreePascal {$INLINE ON} {$ENDIF} // ======== Define options for TRegExpr engine {$DEFINE UnicodeRE} // Use WideChar for characters and UnicodeString/WideString for strings { off $DEFINE UnicodeEx} // Support Unicode >0xFFFF, e.g. emoji, e.g. "." must find 2 WideChars of 1 emoji { off $DEFINE UseWordChars} // Use WordChars property, otherwise fixed list 'a'..'z','A'..'Z','0'..'9','_' { off $DEFINE UseSpaceChars} // Use SpaceChars property, otherwise fixed list { off $DEFINE UseLineSep} // Use LineSeparators property, otherwise fixed line-break chars {$IFDEF UNICODE} {$IFNDEF UnicodeRE} {$MESSAGE ERROR 'You cannot undefine UnicodeRE for Unicode Delphi versions'} {$ENDIF} {$ENDIF} {$IFDEF FPC} {$DEFINE FastUnicodeData} // Use arrays for UpperCase/LowerCase/IsWordChar, they take 320K more memory {$ENDIF} { off $DEFINE RegExpWithStackOverflowCheck} // Check the recursion depth and abort matching before stack overflows (available only for some OS/CPU) {$DEFINE UseFirstCharSet} // Enable optimization, which finds possible first chars of input string {$DEFINE RegExpPCodeDump} // Enable method Dump() to show opcode as string {$IFNDEF FPC} // Not supported in FreePascal {$DEFINE reRealExceptionAddr} // Exceptions will point to appropriate source line, not to Error procedure {$ENDIF} {$DEFINE ComplexBraces} // Support braces in complex cases {$IFNDEF UnicodeRE} {$UNDEF UnicodeEx} {$UNDEF FastUnicodeData} {$ENDIF} {.$DEFINE Compat} // Enable compatability methods/properties for forked version in Free Pascal 3.0 // ======== Define Pascal-language options // Asserts used to catch 'strange bugs' in TRegExpr implementation (when something goes // completely wrong). You can swith asserts on/off with help of {$C+}/{$C-} compiler options. {$IFDEF D3} { $DEFINE WITH_REGEX_ASSERT} {$ENDIF} {$IFDEF FPC}{$IFOPT C+} {$DEFINE WITH_REGEX_ASSERT} {$ENDIF}{$ENDIF} // Only if compile with -Sa // Define 'use subroutine parameters default values' option (do not edit this definition). {$IFDEF D4} {$DEFINE DefParam} {$ENDIF} {$IFDEF FPC} {$DEFINE DefParam} {$ENDIF} // Define 'OverMeth' options, to use method overloading (do not edit this definitions). {$IFDEF D5} {$DEFINE OverMeth} {$ENDIF} {$IFDEF FPC} {$DEFINE OverMeth} {$ENDIF} // Define 'InlineFuncs' options, to use inline keyword (do not edit this definitions). {$IFDEF D8} {$DEFINE InlineFuncs} {$ENDIF} {$IFDEF FPC} {$DEFINE InlineFuncs} {$ENDIF} {$PointerMath on} {$IFDEF RegExpWithStackOverflowCheck} // Define the stack checking algorithm for the current platform/CPU {$IF defined(Linux) or defined(Windows)}{$IF defined(CPU386) or defined(CPUX86_64)} {$DEFINE RegExpWithStackOverflowCheck_DecStack_Frame} // Stack-pointer decrements // use getframe over Sptr() {$ENDIF}{$ENDIF} {$ENDIF} uses SysUtils, // Exception {$IFDEF D2009} {$IFDEF D_XE2} System.Character, {$ELSE} Character, {$ENDIF} {$ENDIF} Classes; // TStrings in Split method type {$IFNDEF FPC} // Delphi doesn't have PtrInt but has NativeInt // but unfortunately NativeInt is declared wrongly in several versions {$IF SizeOf(Pointer)=4} PtrInt = Integer; PtrUInt = Cardinal; {$ELSE} PtrInt = Int64; PtrUInt = UInt64; {$IFEND} {$ENDIF} {$IFDEF UnicodeRE} PRegExprChar = PWideChar; {$IFDEF FPC} RegExprString = UnicodeString; {$ELSE} {$IFDEF D2009} RegExprString = UnicodeString; {$ELSE} RegExprString = WideString; {$ENDIF} {$ENDIF} REChar = WideChar; {$ELSE} PRegExprChar = PAnsiChar; RegExprString = AnsiString; REChar = AnsiChar; {$ENDIF} TREOp = REChar; // internal opcode type PREOp = ^TREOp; type TRegExprCharset = set of Byte; const // Escape char ('\' in common r.e.) used for escaping metachars (\w, \d etc) EscChar = '\'; // Substitute method: prefix of group reference: $1 .. $9 and $ SubstituteGroupChar = '$'; RegExprModifierI: Boolean = False; // default value for ModifierI RegExprModifierR: Boolean = True; // default value for ModifierR RegExprModifierS: Boolean = True; // default value for ModifierS RegExprModifierG: Boolean = True; // default value for ModifierG RegExprModifierM: Boolean = False; // default value for ModifierM RegExprModifierX: Boolean = False; // default value for ModifierX {$IFDEF UseSpaceChars} // default value for SpaceChars RegExprSpaceChars: RegExprString = ' '#$9#$A#$D#$C; {$ENDIF} {$IFDEF UseWordChars} // default value for WordChars RegExprWordChars: RegExprString = '0123456789' + 'abcdefghijklmnopqrstuvwxyz' + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ_'; {$ENDIF} {$IFDEF UseLineSep} // default value for LineSeparators RegExprLineSeparators: RegExprString = #$d#$a#$b#$c {$IFDEF UnicodeRE} + #$2028#$2029#$85 {$ENDIF}; {$ENDIF} // Tab and Unicode category "Space Separator": // https://www.compart.com/en/unicode/category/Zs RegExprHorzSeparators: RegExprString = #9#$20#$A0 {$IFDEF UnicodeRE} + #$1680#$2000#$2001#$2002#$2003#$2004#$2005#$2006#$2007#$2008#$2009#$200A#$202F#$205F#$3000 {$ENDIF}; RegExprUsePairedBreak: Boolean = True; RegExprReplaceLineBreak: RegExprString = sLineBreak; const // Increment/keep-capacity for the size of arrays holding 'Group' related data // e.g., GrpBounds, GrpOpCodes and GrpNames RegexGroupCountIncrement = 50; // Max possible amount of groups. // Don't change it! It's defined by internal TRegExpr design. RegexMaxMaxGroups = MaxInt div 16; // Max depth of recursion for (?R) and (?1)..(?9) RegexMaxRecursion = 20; type TRegExprModifiers = record I: Boolean; // Case-insensitive. R: Boolean; // Extended syntax for Russian ranges in []. // If True, then а-я additionally includes letter 'ё', // А-Я additionally includes 'Ё', and а-Я includes all Russian letters. // Turn it off if it interferes with your national alphabet. S: Boolean; // Dot '.' matches any char, otherwise only [^\n]. G: Boolean; // Greedy. Switching it off switches all operators to non-greedy style, // so if G=False, then '*' works like '*?', '+' works like '+?' and so on. M: Boolean; // Treat string as multiple lines. It changes `^' and `$' from // matching at only the very start/end of the string to the start/end // of any line anywhere within the string. X: Boolean; // Allow comments in regex using # char. end; function IsModifiersEqual(const A, B: TRegExprModifiers): Boolean; type TRegExpr = class; TRegExprReplaceFunction = function(ARegExpr: TRegExpr): RegExprString of object; TRegExprCharChecker = function(ch: REChar): Boolean of object; TRegExprCharCheckerArray = array[0 .. 30] of TRegExprCharChecker; TRegExprCharCheckerInfo = record CharBegin, CharEnd: REChar; CheckerIndex: Integer; end; TRegExprCharCheckerInfos = array of TRegExprCharCheckerInfo; TRegExprAnchor = ( raNone, // Not anchored raBOL, // Must start at BOL raEOL, // Must start at EOL (maybe look behind) raContinue, // Must start at continue pos \G raOnlyOnce // Starts with .* must match from the start pos only. Must not be tried from a later pos ); TRegExprFindFixedLengthFlag = ( flfReturnAtNextNil, flfSkipLookAround ); TRegExprFindFixedLengthFlags = set of TRegExprFindFixedLengthFlag; {$IFDEF Compat} TRegExprInvertCaseFunction = function(const Ch: REChar): REChar of object; {$ENDIF} {$IFDEF ComplexBraces} POpLoopInfo = ^TOpLoopInfo; TOpLoopInfo = record Count: Integer; CurrentRegInput: PRegExprChar; BackTrackingAsAtom: Boolean; OuterLoop: POpLoopInfo; // for nested loops end; {$ENDIF} PPRegExprChar = ^PRegExprChar; TRegExprBoundsPtr = record TmpStart: PPRegExprChar; // pointer start of not yet finished group start in InputString // OP_CLOSE not yet reached // does not need to be cleared GrpStart: PPRegExprChar; // pointer to group start in InputString GrpEnd: PPRegExprChar; // pointer to group end in InputString end; TRegExprBounds = record TmpStart: array of PRegExprChar; // pointer start of not yet finished group start in InputString // OP_CLOSE not yet reached // does not need to be cleared GrpStart: array of PRegExprChar; // pointer to group start in InputString GrpEnd: array of PRegExprChar; // pointer to group end in InputString end; TRegExprBoundsArray = array[0 .. RegexMaxRecursion] of TRegExprBounds; PRegExprLookAroundInfo = ^TRegExprLookAroundInfo; TRegExprLookAroundInfo = record InputPos: PRegExprChar; // pointer to start of look-around in the input string savedInputCurrentEnd: PRegExprChar; // pointer to start of look-around in the input string IsNegative, HasMatchedToEnd: Boolean; IsBackTracking: Boolean; OuterInfo: PRegExprLookAroundInfo; // for nested lookaround end; TRegExprGroupName = record Name: RegExprString; Index: Integer; end; { TRegExprGroupNameList } TRegExprGroupNameList = object Names: array of TRegExprGroupName; NameCount: Integer; // get index of group (subexpression) by name, to support named groups // like in Python: (?Pregex) function MatchIndexFromName(const AName: RegExprString): Integer; procedure Clear; procedure Add(const AName: RegExprString; AnIndex: Integer); end; { TRegExpr } TRegExpr = class private FAllowBraceWithoutMin: Boolean; FAllowUnsafeLookBehind: Boolean; FAllowLiteralBraceWithoutRange: Boolean; FMatchesCleared: Boolean; fRaiseForRuntimeError: Boolean; GrpBounds: TRegExprBoundsArray; CurrentGrpBounds: TRegExprBoundsPtr; GrpNames: TRegExprGroupNameList; // names of groups, if non-empty GrpBacktrackingAsAtom: array of Boolean; // close of group[i] has set IsBacktrackingGroupAsAtom IsBacktrackingGroupAsAtom: Boolean; // Backtracking an entire atomic group that had matched. // Once the group matched it should not try any alternative matches within the group // If the pattern after the group fails, then the group fails (regardless of any alternative match in the group) GrpOpCodes: array of PRegExprChar; // pointer to opcode of group[i] (used by OP_SUBCALL*) GrpCount, ParsedGrpCount: Integer; {$IFDEF ComplexBraces} CurrentLoopInfoListPtr: POpLoopInfo; {$ENDIF} // The "internal use only" fields to pass info from compile // to execute that permits the execute phase to run lots faster on // simple cases. regAnchored: TRegExprAnchor; // is the match anchored (at beginning-of-line only)? // regAnchored permits very fast decisions on suitable starting points // for a match, cutting down the work a lot. regMust permits fast rejection // of lines that cannot possibly match. The regMust tests are costly enough // that regcomp() supplies a regMust only if the r.e. contains something // potentially expensive (at present, the only such thing detected is * or + // at the start of the r.e., which can involve a lot of backup). regMustLen is // supplied because the test in regexec() needs it and regcomp() is computing // it anyway. regMust: PRegExprChar; // string (pointer into program) that match must include, or nil regMustLen: Integer; // length of regMust string regMustString: RegExprString; // string which must occur in match (got from regMust/regMustLen) LookAroundInfoList: PRegExprLookAroundInfo; //regNestedCalls: integer; // some attempt to prevent 'catastrophic backtracking' but not used CurrentSubCalled: Integer; FMinMatchLen: integer; {$IFDEF UseFirstCharSet} FirstCharSet: TRegExprCharset; FirstCharArray: array[Byte] of Boolean; {$ENDIF} // work variables for Exec routines - save stack in recursion regInput: PRegExprChar; // pointer to currently handling char of input string fInputStart: PRegExprChar; // pointer to first char of input string fInputContinue: PRegExprChar; // pointer to char specified with Exec(AOffset), or start pos of ExecNext fInputEnd: PRegExprChar; // pointer after last char of input string fInputCurrentEnd: PRegExprChar; // pointer after last char of the current visible part of input string (can be limited by look-behind) fRegexStart: PRegExprChar; // pointer to first char of regex fRegexEnd: PRegExprChar; // pointer after last char of regex regRecursion: Integer; // current level of recursion (?R) (?1); always 0 if no recursion is used hasRecursion: Boolean; // work variables for compiler's routines regParse: PRegExprChar; // pointer to currently handling char of regex regNumBrackets: Integer; // count of () brackets regNumAtomicBrackets: Integer; // count of (?>) brackets regDummy: array [0..8 div SizeOf(REChar)] of REChar; // dummy pointer, used to detect 1st/2nd pass of Compile // if p=@regDummy, it is pass-1: opcode memory is not yet allocated programm: PRegExprChar; // pointer to opcode, =nil in pass-1 regCode: PRegExprChar; // pointer to last emitted opcode; changing in pass-2, but =@regDummy in pass-1 regCodeSize: Integer; // total opcode size in REChars regCodeWork: PRegExprChar; // pointer to opcode, to first code after MAGIC regExactlyLen: PLongInt; // pointer to length of substring of OP_EXACTLY* inside opcode fSecondPass: Boolean; // true inside pass-2 of Compile fExpression: RegExprString; // regex string fInputString: RegExprString; // input string fLastError: Integer; // Error call sets code of LastError fLastErrorOpcode: TREOp; fLastErrorSymbol: REChar; fModifiers: TRegExprModifiers; // regex modifiers fCompModifiers: TRegExprModifiers; // compiler's copy of modifiers fProgModifiers: TRegExprModifiers; // modifiers values from last programm compilation {$IFDEF UseSpaceChars} fSpaceChars: RegExprString; {$ENDIF} {$IFDEF UseWordChars} fWordChars: RegExprString; {$ENDIF} {$IFDEF UseLineSep} fLineSeparators: RegExprString; {$ENDIF} fUsePairedBreak: Boolean; fReplaceLineEnd: RegExprString; // string to use for "\n" in Substitute method fSlowChecksSizeMax: Integer; // Exec() param ASlowChecks is set to True, when Length(InputString) '1$ is \rub\' // If you want to place any number after '$' you must enclose it // with curly braces: '${12}'. // Example: 'a$12bc' -> 'abc' // 'a${1}2bc' -> 'a2bc'. function Substitute(const ATemplate: RegExprString): RegExprString; // Splits AInputStr to list by positions of all r.e. occurencies. // Internally calls Exec, ExecNext. procedure Split(const AInputStr: RegExprString; APieces: TStrings); function Replace(const AInputStr: RegExprString; const AReplaceStr: RegExprString; AUseSubstitution: Boolean{$IFDEF DefParam} = False{$ENDIF}) : RegExprString; {$IFDEF OverMeth} overload; function Replace(const AInputStr: RegExprString; AReplaceFunc: TRegExprReplaceFunction): RegExprString; overload; {$ENDIF} // Returns AInputStr with r.e. occurencies replaced by AReplaceStr. // If AUseSubstitution is true, then AReplaceStr will be used // as template for Substitution methods. // For example: // Expression := '({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*'; // Replace ('BLOCK( test1)', 'def "$1" value "$2"', True); // will return: def 'BLOCK' value 'test1' // Replace ('BLOCK( test1)', 'def "$1" value "$2"') // will return: def "$1" value "$2" // Internally calls Exec, ExecNext. // Overloaded version and ReplaceEx operate with callback function, // so you can implement really complex functionality. function ReplaceEx(const AInputStr: RegExprString; AReplaceFunc: TRegExprReplaceFunction): RegExprString; {$IFDEF Compat} function ExecPos(AOffset: Integer; ATryOnce: Boolean): Boolean; overload; deprecated 'Use modern form of ExecPos()'; class function InvertCaseFunction(const Ch: REChar): REChar; deprecated 'This has no effect now'; property InvertCase: TRegExprInvertCaseFunction read fInvertCase write fInvertCase; deprecated 'This has no effect now'; property UseUnicodeWordDetection: Boolean read fUseUnicodeWordDetection write fUseUnicodeWordDetection; deprecated 'This has no effect, use {$DEFINE UnicodeRE} instead'; property LinePairedSeparator: RegExprString read GetLinePairedSeparator write SetLinePairedSeparator; deprecated 'This has no effect now'; property EmptyInputRaisesError: Boolean read fEmptyInputRaisesError write fEmptyInputRaisesError; deprecated 'This has no effect now'; property UseOsLineEndOnReplace: Boolean read fUseOsLineEndOnReplace write SetUseOsLineEndOnReplace; deprecated 'Use property ReplaceLineEnd instead'; {$ENDIF} // Returns ID of last error, 0 if no errors (unusable if // Error method raises exception) and clear internal status // into 0 (no errors). function LastError: Integer; // Returns Error message for error with ID = AErrorID. function ErrorMsg(AErrorID: Integer): RegExprString; virtual; // Re-compile regex procedure Compile; {$IFDEF RegExpPCodeDump} // Show compiled regex in textual form function Dump(Indent: Integer = 0): RegExprString; // Show single opcode in textual form function DumpOp(op: TREOp): RegExprString; {$ENDIF} function IsCompiled: Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF} // Opcode contains only operations for fixed match length: EXACTLY*, ANY*, etc function IsFixedLength(var op: TREOp; var ALen: Integer): Boolean; function IsFixedLengthEx(var op: TREOp; var AMinLen, AMaxLen: integer): boolean; // Regular expression. // For optimization, TRegExpr will automatically compiles it into 'P-code' // (You can see it with help of Dump method) and stores in internal // structures. Real [re]compilation occures only when it really needed - // while calling Exec, ExecNext, Substitute, Dump, etc // and only if Expression or other P-code affected properties was changed // after last [re]compilation. // If any errors while [re]compilation occures, Error method is called // (by default Error raises exception - see below) property Expression: RegExprString read fExpression write SetExpression; // Set/get default values of r.e.syntax modifiers. Modifiers in // r.e. (?ismx-ismx) will replace this default values. // If you try to set unsupported modifier, Error will be called // (by defaul Error raises exception ERegExpr). property ModifierStr: RegExprString read GetModifierStr write SetModifierStr; property ModifierI: Boolean read GetModifierI write SetModifierI; property ModifierR: Boolean read GetModifierR write SetModifierR; property ModifierS: Boolean read GetModifierS write SetModifierS; property ModifierG: Boolean read GetModifierG write SetModifierG; property ModifierM: Boolean read GetModifierM write SetModifierM; property ModifierX: Boolean read GetModifierX write SetModifierX; // returns current input string (from last Exec call or last assign // to this property). // Any assignment to this property clear Match* properties ! property InputString: RegExprString read fInputString write SetInputString; // SetInputSubString // Only looks at copy(AInputString, AInputStartPos, AInputLen) procedure SetInputSubString(const AInputString: RegExprString; AInputStartPos, AInputLen: Integer); // Number of subexpressions has been found in last Exec* call. // If there are no subexpr. but whole expr was found (Exec* returned True), // then SubExprMatchCount=0, if no subexpressions nor whole // r.e. found (Exec* returned false) then SubExprMatchCount=-1. // Note, that some subexpr. may be not found and for such // subexpr. MathPos=MatchLen=-1 and Match=''. // For example: Expression := '(1)?2(3)?'; // Exec ('123'): SubExprMatchCount=2, Match[0]='123', [1]='1', [2]='3' // Exec ('12'): SubExprMatchCount=1, Match[0]='12', [1]='1' // Exec ('23'): SubExprMatchCount=2, Match[0]='23', [1]='', [2]='3' // Exec ('2'): SubExprMatchCount=0, Match[0]='2' // Exec ('7') - return False: SubExprMatchCount=-1 property SubExprMatchCount: Integer read GetSubExprCount; // pos of entrance subexpr. #Idx into tested in last Exec* // string. First subexpr. has Idx=1, last - MatchCount, // whole r.e. has Idx=0. // Returns -1 if in r.e. no such subexpr. or this subexpr. // not found in input string. property MatchPos[Idx: Integer]: PtrInt read GetMatchPos; // len of entrance subexpr. #Idx r.e. into tested in last Exec* // string. First subexpr. has Idx=1, last - MatchCount, // whole r.e. has Idx=0. // Returns -1 if in r.e. no such subexpr. or this subexpr. // not found in input string. // Remember - MatchLen may be 0 (if r.e. match empty string) ! property MatchLen[Idx: Integer]: PtrInt read GetMatchLen; // == copy (InputString, MatchPos [Idx], MatchLen [Idx]) // Returns '' if in r.e. no such subexpr. or this subexpr. // not found in input string. property Match[Idx: Integer]: RegExprString read GetMatch; // get index of group (subexpression) by name, to support named groups // like in Python: (?Pregex) function MatchIndexFromName(const AName: RegExprString): Integer; function MatchFromName(const AName: RegExprString): RegExprString; // Returns position in r.e. where compiler stopped. // Useful for error diagnostics property CompilerErrorPos: PtrInt read GetCompilerErrorPos; {$IFDEF UseSpaceChars} // Contains chars, treated as /s (initially filled with RegExprSpaceChars // global constant) property SpaceChars: RegExprString read fSpaceChars write fSpaceChars; {$ENDIF} {$IFDEF UseWordChars} // Contains chars, treated as /w (initially filled with RegExprWordChars // global constant) property WordChars: RegExprString read fWordChars write fWordChars; {$ENDIF} {$IFDEF UseLineSep} // line separators (like \n in Unix) property LineSeparators: RegExprString read fLineSeparators write SetLineSeparators; {$ENDIF} // support paired line-break CR LF property UseLinePairedBreak: Boolean read fUsePairedBreak write SetUsePairedBreak; property ReplaceLineEnd: RegExprString read fReplaceLineEnd write fReplaceLineEnd; property SlowChecksSizeMax: Integer read fSlowChecksSizeMax write fSlowChecksSizeMax; // Errors during Exec() return false and set LastError. This option allows // them to raise an Exception property RaiseForRuntimeError: Boolean read fRaiseForRuntimeError write fRaiseForRuntimeError; property AllowUnsafeLookBehind: Boolean read FAllowUnsafeLookBehind write FAllowUnsafeLookBehind; // Make sure a { always is a range / don't allow unescaped literal usage property AllowLiteralBraceWithoutRange: Boolean read FAllowLiteralBraceWithoutRange write FAllowLiteralBraceWithoutRange; // support {,123} defaulting the min-matches to 0 property AllowBraceWithoutMin: Boolean read FAllowBraceWithoutMin write FAllowBraceWithoutMin; end; type ERegExpr = class(Exception) public ErrorCode: Integer; CompilerErrorPos: PtrInt; end; // true if string AInputString match regular expression ARegExpr // ! will raise exeption if syntax errors in ARegExpr function ExecRegExpr(const ARegExpr, AInputStr: RegExprString): Boolean; // Split AInputStr into APieces by r.e. ARegExpr occurencies procedure SplitRegExpr(const ARegExpr, AInputStr: RegExprString; APieces: TStrings); // Returns AInputStr with r.e. occurencies replaced by AReplaceStr // If AUseSubstitution is true, then AReplaceStr will be used // as template for Substitution methods. // For example: // ReplaceRegExpr ('({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*', // 'BLOCK( test1)', 'def "$1" value "$2"', True) // will return: def 'BLOCK' value 'test1' // ReplaceRegExpr ('({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*', // 'BLOCK( test1)', 'def "$1" value "$2"') // will return: def "$1" value "$2" function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString; AUseSubstitution: Boolean{$IFDEF DefParam} = False{$ENDIF}): RegExprString; {$IFDEF OverMeth}overload; // Alternate form allowing to set more parameters. type TRegexReplaceOption = ( rroModifierI, rroModifierR, rroModifierS, rroModifierG, rroModifierM, rroModifierX, rroUseSubstitution, rroUseOsLineEnd ); TRegexReplaceOptions = set of TRegexReplaceOption; function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString; Options: TRegexReplaceOptions): RegExprString; overload; {$ENDIF} // Replace all metachars with its safe representation, // for example 'abc$cd.(' converts into 'abc\$cd\.\(' // This function useful for r.e. autogeneration from // user input function QuoteRegExprMetaChars(const AStr: RegExprString): RegExprString; // Makes list of subexpressions found in ARegExpr r.e. // In ASubExps every item represent subexpression, // from first to last, in format: // String - subexpression text (without '()') // low word of Object - starting position in ARegExpr, including '(' // if exists! (first position is 1) // high word of Object - length, including starting '(' and ending ')' // if exist! // AExtendedSyntax - must be True if modifier /m will be On while // using the r.e. // Useful for GUI editors of r.e. etc (You can find example of using // in TestRExp.dpr project) // Returns // 0 Success. No unbalanced brackets was found; // -1 There are not enough closing brackets ')'; // -(n+1) At position n was found opening '[' without // corresponding closing ']'; // n At position n was found closing bracket ')' without // corresponding opening '('. // If Result <> 0, then ASubExpr can contain empty items or illegal ones function RegExprSubExpressions(const ARegExpr: RegExprString; ASubExprs: TStrings; AExtendedSyntax: Boolean{$IFDEF DefParam} = False{$ENDIF}): Integer; implementation {$IFDEF FastUnicodeData} uses regexpr_unicodedata; {$ENDIF} const // TRegExpr.VersionMajor/Minor return values of these constants: REVersionMajor = 1; REVersionMinor = 184; OpKind_End = REChar(1); OpKind_MetaClass = REChar(2); OpKind_Range = REChar(3); OpKind_Char = REChar(4); OpKind_CategoryYes = REChar(5); OpKind_CategoryNo = REChar(6); RegExprAllSet = [0 .. 255]; RegExprWordSet = [Ord('a') .. Ord('z'), Ord('A') .. Ord('Z'), Ord('0') .. Ord('9'), Ord('_')]; RegExprDigitSet = [Ord('0') .. Ord('9')]; RegExprLowerAzSet = [Ord('a') .. Ord('z')]; RegExprUpperAzSet = [Ord('A') .. Ord('Z')]; RegExprAllAzSet = RegExprLowerAzSet + RegExprUpperAzSet; RegExprSpaceSet = [Ord(' '), $9, $A, $D, $C]; RegExprLineSeparatorsSet = [$d, $a, $b, $c] {$IFDEF UnicodeRE} + [$85] {$ENDIF}; RegExprHorzSeparatorsSet = [9, $20, $A0]; MaxBracesArg = $7FFFFFFF - 1; // max value for {n,m} arguments type TRENextOff = PtrInt; // internal Next "pointer" (offset to current p-code) PRENextOff = ^TRENextOff; // used for extracting Next "pointers" from compiled r.e. TREBracesArg = Integer; // type of {m,n} arguments PREBracesArg = ^TREBracesArg; TREGroupKind = ( gkNormalGroup, gkNonCapturingGroup, gkAtomicGroup, gkNamedGroupReference, gkComment, gkModifierString, gkLookahead, gkLookaheadNeg, gkLookbehind, gkLookbehindNeg, gkRecursion, gkSubCall ); TReOpLookBehindOptions = packed record MatchLenMin, MatchLenMax: TREBracesArg; IsGreedy: REChar; end; PReOpLookBehindOptions = ^TReOpLookBehindOptions; const ReOpLookBehindOptionsSz = SizeOf(TReOpLookBehindOptions) div SizeOf(REChar); OPT_LOOKBEHIND_NON_GREEDY = REChar(0); OPT_LOOKBEHIND_GREEDY = REChar(1); OPT_LOOKBEHIND_FIXED = REChar(2); // Alexey T.: handling of that define FPC_REQUIRES_PROPER_ALIGNMENT was present even 15 years ago, // but with it, we have failing of some RegEx tests, on ARM64 CPU. // If I undefine FPC_REQUIRES_PROPER_ALIGNMENT, all tests run OK on ARM64 again. {$undef FPC_REQUIRES_PROPER_ALIGNMENT} const REOpSz = SizeOf(TREOp) div SizeOf(REChar); // size of OP_ command in REChars {$IFDEF FPC_REQUIRES_PROPER_ALIGNMENT} // add space for aligning pointer // -1 is the correct max size but also needed for InsertOperator that needs a multiple of pointer size RENextOffSz = (2 * SizeOf(TRENextOff) div SizeOf(REChar)) - 1; REBracesArgSz = (2 * SizeOf(TREBracesArg) div SizeOf(REChar)); // add space for aligning pointer {$ELSE} RENextOffSz = (SizeOf(TRENextOff) div SizeOf(REChar)); // size of Next pointer in REChars REBracesArgSz = SizeOf(TREBracesArg) div SizeOf(REChar); // size of BRACES arguments in REChars {$ENDIF} RENumberSz = SizeOf(LongInt) div SizeOf(REChar); REBranchArgSz = 2; // 2 * (REChar div REChar) type TReGroupIndex = LongInt; PReGroupIndex = ^TReGroupIndex; const ReGroupIndexSz = SizeOf(TReGroupIndex) div SizeOf(REChar); type PtrPair = {$IFDEF UnicodeRE} ^LongInt; {$ELSE} ^Word; {$ENDIF} function GroupDataArraySize(ARequired, ACurrent: Integer): Integer; begin Result := ARequired; if Result > ACurrent then Exit; // Keep some extra if Result > ACurrent - RegexGroupCountIncrement then Result := ACurrent; end; function IsPairedBreak(p: PRegExprChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF} const cBreak = {$IFDEF UnicodeRE} $000D000A; {$ELSE} $0D0A; {$ENDIF} begin Result := PtrPair(p)^ = cBreak; end; function IsAnyLineBreak(C: REChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF} begin case C of #10, #13, #$0B, #$0C {$ifdef UnicodeRE} , #$85 , #$2028 , #$2029 {$endif}: Result := True; else Result := False; end; end; function _FindCharInBuffer(SBegin, SEnd: PRegExprChar; Ch: REChar): PRegExprChar; {$IFDEF InlineFuncs}inline;{$ENDIF} begin while SBegin < SEnd do begin if SBegin^ = Ch then begin Result := SBegin; Exit; end; Inc(SBegin); end; Result := nil; end; function IsIgnoredChar(AChar: REChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF} begin case AChar of ' ', #9, #$d, #$a: Result := True else Result := False; end; end; function _IsMetaChar(AChar: REChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF} begin case AChar of 'd', 'D', 's', 'S', 'w', 'W', 'v', 'V', 'h', 'H', 'R': Result := True else Result := False; end; end; function AlignToPtr(const p: Pointer): Pointer; {$IFDEF InlineFuncs}inline;{$ENDIF} begin {$IFDEF FPC_REQUIRES_PROPER_ALIGNMENT} Result := Align(p, SizeOf(Pointer)); {$ELSE} Result := p; {$ENDIF} end; function AlignToInt(const p: Pointer): Pointer; {$IFDEF InlineFuncs}inline;{$ENDIF} begin {$IFDEF FPC_REQUIRES_PROPER_ALIGNMENT} Result := Align(p, SizeOf(Integer)); {$ELSE} Result := p; {$ENDIF} end; function StrLScan(P: PRegExprChar; C: REChar; len: PtrInt): PRegExprChar; Var count: PtrInt; Begin count := 0; { Find first matching character of Ch in Str } while (count < len) do begin if C = P[count] then begin StrLScan := @(P[count]); exit; end; Inc(count); end; { nothing found. } StrLScan := nil; end; function StrLComp(str1,str2 : PRegExprChar; len : PtrInt) : PtrInt; var counter: PtrInt; c1, c2: REChar; begin if len = 0 then begin StrLComp := 0; exit; end; counter:=0; repeat c1:=str1[counter]; c2:=str2[counter]; inc(counter); until (c1<>c2) or (counter>=len) or (c1=#0) or (c2=#0); StrLComp:=ord(c1)-ord(c2); end; function StrLPos(str1,str2 : PRegExprChar; len1, len2: PtrInt) : PRegExprChar; var p : PRegExprChar; begin StrLPos := nil; if (str1 = nil) or (str2 = nil) then exit; len1 := len1 - len2 + 1; p := StrLScan(str1,str2^, len1); while p <> nil do begin if StrLComp(p, str2, len2)=0 then begin StrLPos := p; exit; end; inc(p); p := StrLScan(p, str2^, len1 - (p-str1)); end; end; {$IFDEF FastUnicodeData} function _UpperCase(Ch: REChar): REChar; {$IFDEF InlineFuncs}inline;{$ENDIF} begin Result := CharUpperArray[Ord(Ch)]; end; function _LowerCase(Ch: REChar): REChar; {$IFDEF InlineFuncs}inline;{$ENDIF} begin Result := CharLowerArray[Ord(Ch)]; end; {$ELSE} function _UpperCase(Ch: REChar): REChar; begin Result := Ch; if (Ch >= 'a') and (Ch <= 'z') then begin Dec(Result, 32); Exit; end; if Ord(Ch) < 128 then Exit; {$IFDEF FPC} {$IFDEF UnicodeRE} Result := UnicodeUpperCase(Ch)[1]; {$ELSE} Result := AnsiUpperCase(Ch)[1]; {$ENDIF} {$ELSE} {$IFDEF UnicodeRE} {$IFDEF D_XE4} Result := Ch.ToUpper; {$ELSE} {$IFDEF D2009} Result := TCharacter.ToUpper(Ch); {$ENDIF} {$ENDIF} {$ELSE} Result := AnsiUpperCase(Ch)[1]; {$ENDIF} {$ENDIF} end; function _LowerCase(Ch: REChar): REChar; begin Result := Ch; if (Ch >= 'A') and (Ch <= 'Z') then begin Inc(Result, 32); Exit; end; if Ord(Ch) < 128 then Exit; {$IFDEF FPC} {$IFDEF UnicodeRE} Result := UnicodeLowerCase(Ch)[1]; {$ELSE} Result := AnsiLowerCase(Ch)[1]; {$ENDIF} {$ELSE} {$IFDEF UnicodeRE} {$IFDEF D_XE4} Result := Ch.ToLower; {$ELSE} {$IFDEF D2009} Result := TCharacter.ToLower(Ch); {$ENDIF} {$ENDIF} {$ELSE} Result := AnsiLowerCase(Ch)[1]; {$ENDIF} {$ENDIF} end; {$ENDIF} function InvertCase(const Ch: REChar): REChar; {$IFDEF InlineFuncs}inline;{$ENDIF} begin Result := _UpperCase(Ch); if Result = Ch then Result := _LowerCase(Ch); end; function _FindClosingBracket(P, PEnd: PRegExprChar): PRegExprChar; var Level: Integer; begin Result := nil; Level := 1; repeat if P >= PEnd then Exit; case P^ of EscChar: Inc(P); '(': begin Inc(Level); end; ')': begin Dec(Level); if Level = 0 then begin Result := P; Exit; end; end; end; Inc(P); until False; end; {$IFDEF UNICODEEX} procedure IncUnicode(var p: PRegExprChar); {$IFDEF InlineFuncs}inline;{$ENDIF} // make additional increment if we are on low-surrogate char // no need to check p= $DC00) and (Ord(ch) <= $DFFF) then Inc(p); end; procedure IncUnicode2(var p: PRegExprChar; var N: Integer); {$IFDEF InlineFuncs}inline;{$ENDIF} var ch: REChar; begin Inc(p); Inc(N); ch := p^; if (Ord(ch) >= $DC00) and (Ord(ch) <= $DFFF) then begin Inc(p); Inc(N); end; end; {$ENDIF} { ============================================================= } { ===================== Global functions ====================== } { ============================================================= } function IsModifiersEqual(const A, B: TRegExprModifiers): Boolean; begin Result := (A.I = B.I) and (A.G = B.G) and (A.M = B.M) and (A.S = B.S) and (A.R = B.R) and (A.X = B.X); end; function ParseModifiers(const APtr: PRegExprChar; ALen: Integer; var AValue: TRegExprModifiers): Boolean; // Parse string and set AValue if it's in format 'ismxrg-ismxrg' var IsOn: Boolean; i: Integer; begin Result := True; IsOn := True; for i := 0 to ALen-1 do case APtr[i] of '-': if IsOn then begin IsOn := False; end else begin Result := False; Exit; end; 'I', 'i': AValue.I := IsOn; 'R', 'r': AValue.R := IsOn; 'S', 's': AValue.S := IsOn; 'G', 'g': AValue.G := IsOn; 'M', 'm': AValue.M := IsOn; 'X', 'x': AValue.X := IsOn; else Result := False; Exit; end; end; function ExecRegExpr(const ARegExpr, AInputStr: RegExprString): Boolean; var r: TRegExpr; begin r := TRegExpr.Create; try r.Expression := ARegExpr; Result := r.Exec(AInputStr); finally r.Free; end; end; { of function ExecRegExpr -------------------------------------------------------------- } procedure SplitRegExpr(const ARegExpr, AInputStr: RegExprString; APieces: TStrings); var r: TRegExpr; begin APieces.Clear; r := TRegExpr.Create; try r.Expression := ARegExpr; r.Split(AInputStr, APieces); finally r.Free; end; end; { of procedure SplitRegExpr -------------------------------------------------------------- } function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString; AUseSubstitution: Boolean{$IFDEF DefParam} = False{$ENDIF}): RegExprString; begin with TRegExpr.Create do try Expression := ARegExpr; Result := Replace(AInputStr, AReplaceStr, AUseSubstitution); finally Free; end; end; { of function ReplaceRegExpr -------------------------------------------------------------- } {$IFDEF OverMeth} function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString; Options: TRegexReplaceOptions): RegExprString; overload; begin with TRegExpr.Create do try ModifierI := (rroModifierI in Options); ModifierR := (rroModifierR in Options); ModifierS := (rroModifierS in Options); ModifierG := (rroModifierG in Options); ModifierM := (rroModifierM in Options); ModifierX := (rroModifierX in Options); // Set this after the above, if the regex contains modifiers, they will be applied. Expression := ARegExpr; if rroUseOsLineEnd in Options then ReplaceLineEnd := sLineBreak else ReplaceLineEnd := #10; Result := Replace(AInputStr, AReplaceStr, rroUseSubstitution in Options); finally Free; end; end; {$ENDIF} (* const MetaChars_Init = '^$.[()|?+*' + EscChar + '{'; MetaChars = MetaChars_Init; // not needed to be a variable, const is faster MetaAll = MetaChars_Init + ']}'; // Very similar to MetaChars, but slighly changed. *) function _IsMetaSymbol1(ch: REChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF} begin case ch of '^', '$', '.', '[', '(', ')', '|', '?', '+', '*', EscChar, '{': Result := True else Result := False end; end; function _IsMetaSymbol2(ch: REChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF} begin case ch of '^', '$', '.', '[', '(', ')', '|', '?', '+', '*', EscChar, '{', ']', '}': Result := True else Result := False end; end; function QuoteRegExprMetaChars(const AStr: RegExprString): RegExprString; var i, i0, Len: Integer; ch: REChar; begin Result := ''; Len := Length(AStr); i := 1; i0 := i; while i <= Len do begin ch := AStr[i]; if _IsMetaSymbol2(ch) then begin Result := Result + System.Copy(AStr, i0, i - i0) + EscChar + ch; i0 := i + 1; end; Inc(i); end; Result := Result + System.Copy(AStr, i0, MaxInt); // Tail end; { of function QuoteRegExprMetaChars -------------------------------------------------------------- } function RegExprSubExpressions(const ARegExpr: RegExprString; ASubExprs: TStrings; AExtendedSyntax: Boolean{$IFDEF DefParam} = False{$ENDIF}): Integer; type TStackItemRec = record SubExprIdx: Integer; StartPos: PtrInt; end; TStackArray = packed array [0 .. RegexMaxMaxGroups - 1] of TStackItemRec; var Len, SubExprLen: Integer; i, i0: Integer; Modif: TRegExprModifiers; Stack: ^TStackArray; StackIdx, StackSz: Integer; begin Result := 0; // no unbalanced brackets found at this very moment FillChar(Modif, SizeOf(Modif), 0); ASubExprs.Clear; // I don't think that adding to non empty list // can be useful, so I simplified algorithm to work only with empty list Len := Length(ARegExpr); // some optimization tricks // first we have to calculate number of subexpression to reserve // space in Stack array (may be we'll reserve more than needed, but // it's faster then memory reallocation during parsing) StackSz := 1; // add 1 for entire r.e. for i := 1 to Len do if ARegExpr[i] = '(' then Inc(StackSz); // SetLength (Stack, StackSz); GetMem(Stack, SizeOf(TStackItemRec) * StackSz); try StackIdx := 0; i := 1; while (i <= Len) do begin case ARegExpr[i] of '(': begin if (i < Len) and (ARegExpr[i + 1] = '?') then begin // this is not subexpression, but comment or other // Perl extension. We must check is it (?ismxrg-ismxrg) // and change AExtendedSyntax if /x is changed. Inc(i, 2); // skip '(?' i0 := i; while (i <= Len) and (ARegExpr[i] <> ')') do Inc(i); if i > Len then Result := -1 // unbalansed '(' else if ParseModifiers(@ARegExpr[i0], i - i0, Modif) then // Alexey-T: original code had copy from i, not from i0 AExtendedSyntax := Modif.X; end else begin // subexpression starts ASubExprs.Add(''); // just reserve space with Stack[StackIdx] do begin SubExprIdx := ASubExprs.Count - 1; StartPos := i; end; Inc(StackIdx); end; end; ')': begin if StackIdx = 0 then Result := i // unbalanced ')' else begin Dec(StackIdx); with Stack[StackIdx] do begin SubExprLen := i - StartPos + 1; ASubExprs.Objects[SubExprIdx] := TObject(StartPos or (SubExprLen ShL 16)); ASubExprs[SubExprIdx] := System.Copy(ARegExpr, StartPos + 1, SubExprLen - 2); // add without brackets end; end; end; EscChar: Inc(i); // skip quoted symbol '[': begin // we have to skip character ranges at once, because they can // contain '#', and '#' in it must NOT be recognized as eXtended // comment beginning! i0 := i; Inc(i); if ARegExpr[i] = ']' // first ']' inside [] treated as simple char, no need to check '[' then Inc(i); while (i <= Len) and (ARegExpr[i] <> ']') do if ARegExpr[i] = EscChar then Inc(i, 2) // skip 'escaped' char to prevent stopping at '\]' else Inc(i); if (i > Len) or (ARegExpr[i] <> ']') then Result := -(i0 + 1); // unbalanced '[' end; '#': if AExtendedSyntax then begin // skip eXtended comments while (i <= Len) and (ARegExpr[i] <> #$d) and (ARegExpr[i] <> #$a) // do not use [#$d, #$a] due to Unicode compatibility do Inc(i); while (i + 1 <= Len) and ((ARegExpr[i + 1] = #$d) or (ARegExpr[i + 1] = #$a)) do Inc(i); // attempt to work with different kinds of line separators // now we are at the line separator that must be skipped. end; // here is no 'else' clause - we simply skip ordinary chars end; // of case Inc(i); // skip scanned char // ! can move after Len due to skipping quoted symbol end; // check brackets balance if StackIdx <> 0 then Result := -1; // unbalansed '(' // check if entire r.e. added if (ASubExprs.Count = 0) or ((PtrInt(ASubExprs.Objects[0]) and $FFFF) <> 1) or (((PtrInt(ASubExprs.Objects[0]) ShR 16) and $FFFF) <> Len) // whole r.e. wasn't added because it isn't bracketed // well, we add it now: then ASubExprs.InsertObject(0, ARegExpr, TObject((Len ShL 16) or 1)); finally FreeMem(Stack); end; end; { of function RegExprSubExpressions -------------------------------------------------------------- } const OP_MAGIC = TREOp(216); // programm signature OP_EEND = TREOp(0); // End of program OP_BOL = TREOp(1); // Empty match at beginning of line OP_EOL = TREOp(2); // Empty match at end of line OP_ANY = TREOp(3); // Match any one character OP_ANYOF = TREOp(4); // Match any character in string OP_ANYBUT = TREOp(5); // Match any character not in string OP_BRANCH = TREOp(6); // Match this alternative, or the next OP_BACK = TREOp(7); // Jump backward (Next < 0) OP_EXACTLY = TREOp(8); // Match string exactly OP_NOTHING = TREOp(9); // Match empty string OP_STAR = TREOp(10); // Match this (simple) thing 0 or more times OP_PLUS = TREOp(11); // Match this (simple) thing 1 or more times OP_ANYDIGIT = TREOp(12); // Match any digit (equiv [0-9]) OP_NOTDIGIT = TREOp(13); // Match not digit (equiv [0-9]) OP_ANYLETTER = TREOp(14); // Match any 'word' char OP_NOTLETTER = TREOp(15); // Match any 'non-word' char OP_ANYSPACE = TREOp(16); // Match any 'space' char OP_NOTSPACE = TREOp(17); // Match 'not space' char OP_BRACES = TREOp(18); // Node,Min,Max Match this (simple) thing from Min to Max times. // Min and Max are TREBracesArg OP_COMMENT = TREOp(19); // Comment OP_EXACTLY_CI = TREOp(20); // Match string, case insensitive OP_ANYOF_CI = TREOp(21); // Match any character in string, case insensitive OP_ANYBUT_CI = TREOp(22); // Match any char not in string, case insensitive OP_LOOPENTRY = TREOp(23); // Start of loop (Node - LOOP for this loop) OP_LOOP = TREOp(24); // Back jump for LOOPENTRY // Min and Max are TREBracesArg // Node - next node in sequence, // LoopEntryJmp - associated LOOPENTRY node addr OP_EOL2 = TReOp(25); // like OP_EOL, but also matches before final line-break OP_CONTINUE_POS = TReOp(26); // \G, where offset is from last match end or from Exec(AOffset) OP_ANYLINEBREAK = TReOp(27); // \R OP_BSUBEXP = TREOp(28); // Match previously matched subexpression #Idx (stored as REChar) OP_BSUBEXP_CI = TREOp(29); // -"- in case-insensitive mode // Non-greedy ops OP_STAR_NG = TREOp(30); // Same as OP_START but in non-greedy mode OP_PLUS_NG = TREOp(31); // Same as OP_PLUS but in non-greedy mode OP_BRACES_NG = TREOp(32); // Same as OP_BRACES but in non-greedy mode OP_LOOP_NG = TREOp(33); // Same as OP_LOOP but in non-greedy mode // Multiline mode \m OP_BOL_ML = TREOp(34); // Match "" at beginning of line OP_EOL_ML = TREOp(35); // Match "" at end of line OP_ANY_ML = TREOp(36); // Match any one character // Word boundary OP_BOUND = TREOp(37); // Match "" between word char and non-word char OP_NOTBOUND = TREOp(38); // Opposite to OP_BOUND OP_ANYHORZSEP = TREOp(39); // Any horizontal whitespace \h OP_NOTHORZSEP = TREOp(40); // Not horizontal whitespace \H OP_ANYVERTSEP = TREOp(41); // Any vertical whitespace \v OP_NOTVERTSEP = TREOp(42); // Not vertical whitespace \V OP_ANYCATEGORY = TREOp(43); // \p{L} OP_NOTCATEGORY = TREOp(44); // \P{L} // Possessive quantifiers OP_STAR_POSS = TReOp(45); OP_PLUS_POSS = TReOp(46); OP_BRACES_POSS = TReOp(47); OP_RECUR = TReOp(48); OP_OPEN = TREOp(50); // Opening of group OP_CLOSE = TREOp(51); // Closing of group OP_OPEN_ATOMIC = TREOp(52); // Opening of group OP_CLOSE_ATOMIC = TREOp(53); // Closing of group OP_LOOKAHEAD = TREOp(55); OP_LOOKAHEAD_NEG = TREOp(56); OP_LOOKAHEAD_END = TREOp(57); OP_LOOKBEHIND = TREOp(58); OP_LOOKBEHIND_NEG = TREOp(59); OP_LOOKBEHIND_END = TREOp(60); OP_SUBCALL = TREOp(65); // Call of subroutine; OP_SUBCALL+i is for group i OP_LOOP_POSS = TREOp(66); // Same as OP_LOOP but in non-greedy mode // Guarded branch // If a branch is know to begin with a specific letter (starts with OP_EXACTLY[_CI]) // then that letter can be tested before recursively calling MatchPrim. (guarded from non-match entering) OP_GBRANCH = TREOp(67); OP_GBRANCH_EX = TREOp(68); OP_GBRANCH_EX_CI = TREOp(69); OP_RESET_MATCHPOS = TReOp(70); OP_NONE = High(TREOp); // We work with p-code through pointers, compatible with PRegExprChar. // Note: all code components (TRENextOff, TREOp, TREBracesArg, etc) // must have lengths that can be divided by SizeOf (REChar) ! // A node is TREOp of opcode followed Next "pointer" of TRENextOff type. // The Next is a offset from the opcode of the node containing it. // An operand, if any, simply follows the node. (Note that much of // the code generation knows about this implicit relationship!) // Using TRENextOff=PtrInt speed up p-code processing. // Opcodes description: // // BRANCH The set of branches constituting a single choice are hooked // together with their "next" pointers, since precedence prevents // anything being concatenated to any individual branch. The // "next" pointer of the last BRANCH in a choice points to the // thing following the whole choice. This is also where the // final "next" pointer of each individual branch points; each // branch starts with the operand node of a BRANCH node. // BACK Normal "next" pointers all implicitly point forward; BACK // exists to make loop structures possible. // STAR,PLUS,BRACES '?', and complex '*' and '+', are implemented as // circular BRANCH structures using BACK. Complex '{min,max}' // - as pair LOOPENTRY-LOOP (see below). Simple cases (one // character per match) are implemented with STAR, PLUS and // BRACES for speed and to minimize recursive plunges. // LOOPENTRY,LOOP {min,max} are implemented as special pair // LOOPENTRY-LOOP. Each LOOPENTRY initialize loopstack for // current level. // OPEN,CLOSE are numbered at compile time. { ============================================================= } { ================== Error handling section =================== } { ============================================================= } const reeOk = 0; reeCompNullArgument = 100; reeUnknownMetaSymbol = 101; reeCompParseRegTooManyBrackets = 102; reeCompParseRegUnmatchedBrackets = 103; reeCompParseRegUnmatchedBrackets2 = 104; reeCompParseRegJunkOnEnd = 105; reeNotQuantifiable = 106; reeNestedQuantif = 107; reeBadHexDigit = 108; reeInvalidRange = 109; reeParseAtomTrailingBackSlash = 110; reeNoHexCodeAfterBSlashX = 111; reeHexCodeAfterBSlashXTooBig = 112; reeUnmatchedSqBrackets = 113; reeInternalUrp = 114; reeQuantifFollowsNothing = 115; reeTrailingBackSlash = 116; reeNoLetterAfterBSlashC = 117; reeMetaCharAfterMinusInRange = 118; reeRarseAtomInternalDisaster = 119; reeIncorrectSpecialBrackets = 120; reeIncorrectBraces = 121; reeBRACESArgTooBig = 122; reeUnknownOpcodeInFillFirst = 123; reeBracesMinParamGreaterMax = 124; reeUnclosedComment = 125; reeComplexBracesNotImplemented = 126; reeUnrecognizedModifier = 127; reeBadLinePairedSeparator = 128; reeBadUnicodeCategory = 129; reeTooSmallCheckersArray = 130; reeBadRecursion = 132; reeBadSubCall = 133; reeNamedGroupBad = 140; reeNamedGroupBadName = 141; reeNamedGroupBadRef = 142; reeNamedGroupDupName = 143; reeLookaheadBad = 150; reeLookbehindBad = 152; reeLookaroundNotSafe = 153; reeBadReference = 154; // Runtime errors must be >= reeFirstRuntimeCode reeFirstRuntimeCode = 1000; reeRegRepeatCalledInappropriately = 1000; reeMatchPrimMemoryCorruption = 1001; reeNoExpression = 1003; reeCorruptedProgram = 1004; reeOffsetMustBePositive = 1006; reeExecNextWithoutExec = 1007; reeBadOpcodeInCharClass = 1008; reeDumpCorruptedOpcode = 1011; reeLoopStackExceeded = 1014; reeLoopWithoutEntry = 1015; reeUnknown = 1016; function TRegExpr.ErrorMsg(AErrorID: Integer): RegExprString; begin case AErrorID of reeOk: Result := 'No errors'; reeCompNullArgument: Result := 'TRegExpr compile: null argument'; reeUnknownMetaSymbol: Result := 'TRegExpr compile: unknown meta-character: \' + fLastErrorSymbol; reeCompParseRegTooManyBrackets: Result := 'TRegExpr compile: ParseReg: too many ()'; reeCompParseRegUnmatchedBrackets: Result := 'TRegExpr compile: ParseReg: unmatched ()'; reeCompParseRegUnmatchedBrackets2: Result := 'TRegExpr compile: ParseReg: unmatched ()'; reeCompParseRegJunkOnEnd: Result := 'TRegExpr compile: ParseReg: junk at end'; reeNotQuantifiable: Result := 'TRegExpr compile: Token before *+ operand is not quantifiable'; reeNestedQuantif: Result := 'TRegExpr compile: nested quantifier *?+'; reeBadHexDigit: Result := 'TRegExpr compile: bad hex digit'; reeInvalidRange: Result := 'TRegExpr compile: invalid [] range'; reeParseAtomTrailingBackSlash: Result := 'TRegExpr compile: parse atom trailing \'; reeNoHexCodeAfterBSlashX: Result := 'TRegExpr compile: no hex code after \x'; reeNoLetterAfterBSlashC: Result := 'TRegExpr compile: no letter "A".."Z" after \c'; reeMetaCharAfterMinusInRange: Result := 'TRegExpr compile: metachar after "-" in [] range'; reeHexCodeAfterBSlashXTooBig: Result := 'TRegExpr compile: hex code after \x is too big'; reeUnmatchedSqBrackets: Result := 'TRegExpr compile: unmatched []'; reeInternalUrp: Result := 'TRegExpr compile: internal fail on char "|", ")"'; reeQuantifFollowsNothing: Result := 'TRegExpr compile: quantifier ?+*{ follows nothing'; reeTrailingBackSlash: Result := 'TRegExpr compile: trailing \'; reeRarseAtomInternalDisaster: Result := 'TRegExpr compile: RarseAtom internal disaster'; reeIncorrectSpecialBrackets: Result := 'TRegExpr compile: incorrect expression in (?...) brackets'; reeIncorrectBraces: Result := 'TRegExpr compile: incorrect {} braces'; reeBRACESArgTooBig: Result := 'TRegExpr compile: braces {} argument too big'; reeUnknownOpcodeInFillFirst: Result := 'TRegExpr compile: unknown opcode in FillFirstCharSet ('+DumpOp(fLastErrorOpcode)+')'; reeBracesMinParamGreaterMax: Result := 'TRegExpr compile: braces {} min param greater then max'; reeUnclosedComment: Result := 'TRegExpr compile: unclosed (?#comment)'; reeComplexBracesNotImplemented: Result := 'TRegExpr compile: if you use braces {} and non-greedy ops *?, +?, ?? for complex cases, enable {$DEFINE ComplexBraces}'; reeUnrecognizedModifier: Result := 'TRegExpr compile: incorrect modifier'; reeBadLinePairedSeparator: Result := 'TRegExpr compile: LinePairedSeparator must countain two different chars or be empty'; reeBadUnicodeCategory: Result := 'TRegExpr compile: invalid category after \p or \P'; reeTooSmallCheckersArray: Result := 'TRegExpr compile: too small CharCheckers array'; reeBadRecursion: Result := 'TRegExpr compile: bad recursion (?R)'; reeBadSubCall: Result := 'TRegExpr compile: bad subroutine call'; reeNamedGroupBad: Result := 'TRegExpr compile: bad named group'; reeNamedGroupBadName: Result := 'TRegExpr compile: bad identifier in named group'; reeNamedGroupBadRef: Result := 'TRegExpr compile: bad back-reference to named group'; reeNamedGroupDupName: Result := 'TRegExpr compile: named group defined more than once'; reeLookaheadBad: Result := 'TRegExpr compile: bad lookahead'; reeLookbehindBad: Result := 'TRegExpr compile: bad lookbehind'; reeLookaroundNotSafe: Result := 'TRegExpr compile: lookbehind brackets with variable length do not support captures'; reeBadReference: Result := 'TRegExpr compile: invalid syntax for reference to capture group'; reeRegRepeatCalledInappropriately: Result := 'TRegExpr exec: RegRepeat called inappropriately'; reeMatchPrimMemoryCorruption: Result := 'TRegExpr exec: MatchPrim memory corruption'; reeNoExpression: Result := 'TRegExpr exec: empty expression'; reeCorruptedProgram: Result := 'TRegExpr exec: corrupted opcode (no magic byte)'; reeOffsetMustBePositive: Result := 'TRegExpr exec: offset must be >0'; reeExecNextWithoutExec: Result := 'TRegExpr exec: ExecNext without Exec(Pos)'; reeBadOpcodeInCharClass: Result := 'TRegExpr exec: invalid opcode in char class'; reeDumpCorruptedOpcode: Result := 'TRegExpr dump: corrupted opcode'; reeLoopStackExceeded: Result := 'TRegExpr exec: loop stack exceeded'; reeLoopWithoutEntry: Result := 'TRegExpr exec: loop without loop entry'; reeUnknown: Result := 'TRegExpr exec: unknow error'; else Result := 'Unknown error'; end; end; { of procedure TRegExpr.Error -------------------------------------------------------------- } function TRegExpr.LastError: Integer; begin Result := fLastError; fLastError := reeOk; end; { of function TRegExpr.LastError -------------------------------------------------------------- } { ============================================================= } { ===================== Common section ======================== } { ============================================================= } class function TRegExpr.VersionMajor: Integer; begin Result := REVersionMajor; end; class function TRegExpr.VersionMinor: Integer; begin Result := REVersionMinor; end; constructor TRegExpr.Create; begin inherited; programm := nil; fExpression := ''; fInputString := ''; FillChar(fModifiers, SizeOf(fModifiers), 0); fModifiers.I := RegExprModifierI; fModifiers.R := RegExprModifierR; fModifiers.S := RegExprModifierS; fModifiers.G := RegExprModifierG; fModifiers.M := RegExprModifierM; fModifiers.X := RegExprModifierX; {$IFDEF UseSpaceChars} SpaceChars := RegExprSpaceChars; {$ENDIF} {$IFDEF UseWordChars} WordChars := RegExprWordChars; {$ENDIF} {$IFDEF UseLineSep} fLineSeparators := RegExprLineSeparators; {$ENDIF} fUsePairedBreak := RegExprUsePairedBreak; fReplaceLineEnd := RegExprReplaceLineBreak; fSlowChecksSizeMax := 2000; FAllowUnsafeLookBehind := False; fRaiseForRuntimeError := True; {$IFDEF UseLineSep} InitLineSepArray; {$ENDIF} InitCharCheckers; {$IFDEF Compat} fInvertCase := OldInvertCase; {$ENDIF} end; { of constructor TRegExpr.Create -------------------------------------------------------------- } { TRegExprGroupNameList } function TRegExprGroupNameList.MatchIndexFromName(const AName: RegExprString ): Integer; var i: Integer; begin for i := 0 to NameCount - 1 do if Names[i].Name = AName then begin Result := Names[i].Index; Exit; end; Result := -1; end; procedure TRegExprGroupNameList.Clear; begin NameCount := 0; if Length(Names) > RegexGroupCountIncrement then SetLength(Names, RegexGroupCountIncrement); end; procedure TRegExprGroupNameList.Add(const AName: RegExprString; AnIndex: Integer ); begin if NameCount >= Length(Names) then SetLength(Names, Length(Names) + 1 + RegexGroupCountIncrement); Names[NameCount].Name := AName; Names[NameCount].Index := AnIndex; inc(NameCount); end; {$IFDEF OverMeth} constructor TRegExpr.Create(const AExpression: RegExprString); begin Create; Expression := AExpression; end; {$ENDIF} destructor TRegExpr.Destroy; begin if programm <> nil then begin FreeMem(programm); programm := nil; end; end; procedure TRegExpr.SetExpression(const AStr: RegExprString); begin if (AStr <> fExpression) or not IsCompiled then begin fExpression := AStr; //UniqueString(fExpression); fRegexStart := PRegExprChar(fExpression); fRegexEnd := fRegexStart + Length(fExpression); InvalidateProgramm; end; end; function TRegExpr.GetSubExprCount: Integer; begin Result := -1; // if nothing found, we must return -1 per TRegExpr docs if (GrpBounds[0].GrpStart[0] <> nil) then Result := GrpCount; end; function TRegExpr.GetMatchPos(Idx: Integer): PtrInt; begin Result := -1; if (Idx < 0) or (Idx >= Length(GrpBounds[0].GrpStart)) then Exit; if (Idx >= 0) and (GrpBounds[0].GrpStart[Idx] <> nil) then Result := GrpBounds[0].GrpStart[Idx] - fInputStart + 1; end; function TRegExpr.GetMatchLen(Idx: Integer): PtrInt; begin Result := -1; if (Idx < 0) or (Idx >= Length(GrpBounds[0].GrpStart)) then Exit; if (Idx >= 0) and (GrpBounds[0].GrpStart[Idx] <> nil) then Result := GrpBounds[0].GrpEnd[Idx] - GrpBounds[0].GrpStart[Idx]; end; function TRegExpr.GetMatch(Idx: Integer): RegExprString; begin Result := ''; if (Idx < 0) or (Idx >= Length(GrpBounds[0].GrpStart)) then Exit; if (Idx >= 0) and (GrpBounds[0].GrpStart[Idx] <> nil) and (GrpBounds[0].GrpEnd[Idx] > GrpBounds[0].GrpStart[Idx]) then SetString(Result, GrpBounds[0].GrpStart[Idx], GrpBounds[0].GrpEnd[Idx] - GrpBounds[0].GrpStart[Idx]); end; function TRegExpr.MatchIndexFromName(const AName: RegExprString): Integer; begin Result := GrpNames.MatchIndexFromName(AName); end; function TRegExpr.MatchFromName(const AName: RegExprString): RegExprString; var Idx: Integer; begin Result := ''; Idx := GrpNames.MatchIndexFromName(AName); if Idx >= 0 then Result := GetMatch(Idx) else Result := ''; end; function TRegExpr.GetModifierStr: RegExprString; begin Result := '-'; if ModifierI then Result := 'i' + Result else Result := Result + 'i'; if ModifierR then Result := 'r' + Result else Result := Result + 'r'; if ModifierS then Result := 's' + Result else Result := Result + 's'; if ModifierG then Result := 'g' + Result else Result := Result + 'g'; if ModifierM then Result := 'm' + Result else Result := Result + 'm'; if ModifierX then Result := 'x' + Result else Result := Result + 'x'; if Result[Length(Result)] = '-' // remove '-' if all modifiers are 'On' then System.Delete(Result, Length(Result), 1); end; { of function TRegExpr.GetModifierStr -------------------------------------------------------------- } procedure TRegExpr.SetModifierG(AValue: Boolean); begin if fModifiers.G <> AValue then begin fModifiers.G := AValue; InvalidateProgramm; end; end; procedure TRegExpr.SetModifierI(AValue: Boolean); begin if fModifiers.I <> AValue then begin fModifiers.I := AValue; InvalidateProgramm; end; end; procedure TRegExpr.SetModifierM(AValue: Boolean); begin if fModifiers.M <> AValue then begin fModifiers.M := AValue; InvalidateProgramm; end; end; procedure TRegExpr.SetModifierR(AValue: Boolean); begin if fModifiers.R <> AValue then begin fModifiers.R := AValue; InvalidateProgramm; end; end; procedure TRegExpr.SetModifierS(AValue: Boolean); begin if fModifiers.S <> AValue then begin fModifiers.S := AValue; InvalidateProgramm; end; end; procedure TRegExpr.SetModifierX(AValue: Boolean); begin if fModifiers.X <> AValue then begin fModifiers.X := AValue; InvalidateProgramm; end; end; procedure TRegExpr.SetModifierStr(const AStr: RegExprString); begin if ParseModifiers(PRegExprChar(AStr), Length(AStr), fModifiers) then InvalidateProgramm else Error(reeUnrecognizedModifier); end; { ============================================================= } { ==================== Compiler section ======================= } { ============================================================= } {$IFDEF FastUnicodeData} function TRegExpr.IsWordChar(AChar: REChar): Boolean; begin // bit 7 in value: is word char Result := CharCategoryArray[Ord(AChar)] and 128 <> 0; end; (* // Unicode General Category UGC_UppercaseLetter = 0; Lu UGC_LowercaseLetter = 1; Ll UGC_TitlecaseLetter = 2; Lt UGC_ModifierLetter = 3; Lm UGC_OtherLetter = 4; Lo UGC_NonSpacingMark = 5; Mn UGC_CombiningMark = 6; Mc UGC_EnclosingMark = 7; Me UGC_DecimalNumber = 8; Nd UGC_LetterNumber = 9; Nl UGC_OtherNumber = 10; No UGC_ConnectPunctuation = 11; Pc UGC_DashPunctuation = 12; Pd UGC_OpenPunctuation = 13; Ps UGC_ClosePunctuation = 14; Pe UGC_InitialPunctuation = 15; Pi UGC_FinalPunctuation = 16; Pf UGC_OtherPunctuation = 17; Po UGC_MathSymbol = 18; Sm UGC_CurrencySymbol = 19; Sc UGC_ModifierSymbol = 20; Sk UGC_OtherSymbol = 21; So UGC_SpaceSeparator = 22; Zs UGC_LineSeparator = 23; Zl UGC_ParagraphSeparator = 24; Zp UGC_Control = 25; Cc UGC_Format = 26; Cf UGC_Surrogate = 27; Cs UGC_PrivateUse = 28; Co UGC_Unassigned = 29; Cn *) const CategoryNames: array[0..29] of array[0..1] of REChar = ( ('L', 'u'), ('L', 'l'), ('L', 't'), ('L', 'm'), ('L', 'o'), ('M', 'n'), ('M', 'c'), ('M', 'e'), ('N', 'd'), ('N', 'l'), ('N', 'o'), ('P', 'c'), ('P', 'd'), ('P', 's'), ('P', 'e'), ('P', 'i'), ('P', 'f'), ('P', 'o'), ('S', 'm'), ('S', 'c'), ('S', 'k'), ('S', 'o'), ('Z', 's'), ('Z', 'l'), ('Z', 'p'), ('C', 'c'), ('C', 'f'), ('C', 's'), ('C', 'o'), ('C', 'n') ); function IsCategoryFirstChar(AChar: REChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF} begin case AChar of 'L', 'M', 'N', 'P', 'S', 'C', 'Z': Result := True; else Result := False; end; end; function IsCategoryChars(AChar, AChar2: REChar): Boolean; var i: Integer; begin for i := Low(CategoryNames) to High(CategoryNames) do if (AChar = CategoryNames[i][0]) then if (AChar2 = CategoryNames[i][1]) then begin Result := True; Exit end; Result := False; end; function CheckCharCategory(AChar: REChar; Ch0, Ch1: REChar): Boolean; // AChar: check this char against opcode // Ch0, Ch1: opcode operands after OP_*CATEGORY var N: Byte; Name0, Name1: REChar; begin Result := False; // bits 0..6 are category N := CharCategoryArray[Ord(AChar)] and 127; if N <= High(CategoryNames) then begin Name0 := CategoryNames[N][0]; Name1 := CategoryNames[N][1]; if Ch0 <> Name0 then Exit; if Ch1 <> #0 then if Ch1 <> Name1 then Exit; Result := True; end; end; function MatchOneCharCategory(opnd, scan: PRegExprChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF} // opnd: points to opcode operands after OP_*CATEGORY // scan: points into InputString begin Result := CheckCharCategory(scan^, opnd^, (opnd + 1)^); end; {$ELSE} function TRegExpr.IsWordChar(AChar: REChar): Boolean; begin {$IFDEF UseWordChars} Result := Pos(AChar, fWordChars) > 0; {$ELSE} case AChar of 'a' .. 'z', 'A' .. 'Z', '0' .. '9', '_': Result := True else Result := False; end; {$ENDIF} end; {$ENDIF} function TRegExpr.IsSpaceChar(AChar: REChar): Boolean; begin {$IFDEF UseSpaceChars} Result := Pos(AChar, fSpaceChars) > 0; {$ELSE} case AChar of ' ', #$9, #$A, #$D, #$C: Result := True else Result := False; end; {$ENDIF} end; function TRegExpr.IsCustomLineSeparator(AChar: REChar): Boolean; begin {$IFDEF UseLineSep} {$IFDEF UnicodeRE} Result := Pos(AChar, fLineSeparators) > 0; {$ELSE} Result := fLineSepArray[Byte(AChar)]; {$ENDIF} {$ELSE} case AChar of #$d, #$a, {$IFDEF UnicodeRE} #$85, #$2028, #$2029, {$ENDIF} #$b, #$c: Result := True; else Result := False; end; {$ENDIF} end; function IsDigitChar(AChar: REChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF} begin case AChar of '0' .. '9': Result := True; else Result := False; end; end; function IsHorzSeparator(AChar: REChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF} begin // Tab and Unicode categoty "Space Separator": https://www.compart.com/en/unicode/category/Zs case AChar of #9, #$20, #$A0: Result := True; {$IFDEF UnicodeRE} #$1680, #$2000 .. #$200A, #$202F, #$205F, #$3000: Result := True; {$ENDIF} else Result := False; end; end; function IsVertLineSeparator(AChar: REChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF} begin case AChar of #$d, #$a, #$b, #$c: Result := True; {$IFDEF UnicodeRE} #$2028, #$2029, #$85: Result := True; {$ENDIF} else Result := False; end; end; procedure TRegExpr.InvalidateProgramm; begin if programm <> nil then begin FreeMem(programm); programm := nil; end; end; { of procedure TRegExpr.InvalidateProgramm -------------------------------------------------------------- } procedure TRegExpr.Compile; begin if fExpression = '' then begin Error(reeNoExpression); Exit; end; CompileRegExpr(fRegexStart); end; { of procedure TRegExpr.Compile -------------------------------------------------------------- } {$IFDEF UseLineSep} procedure TRegExpr.InitLineSepArray; {$IFNDEF UnicodeRE} var i: Integer; {$ENDIF} begin {$IFNDEF UnicodeRE} FillChar(fLineSepArray, SizeOf(fLineSepArray), 0); for i := 1 to Length(fLineSeparators) do fLineSepArray[Byte(fLineSeparators[i])] := True; {$ENDIF} end; {$ENDIF} function TRegExpr.IsProgrammOk: Boolean; begin Result := False; // check modifiers if not IsModifiersEqual(fModifiers, fProgModifiers) then InvalidateProgramm; // compile if needed if programm = nil then begin Compile; // Check compiled programm if programm = nil then Exit; end; if programm[0] <> OP_MAGIC then Error(reeCorruptedProgram) else Result := True; end; { of function TRegExpr.IsProgrammOk -------------------------------------------------------------- } procedure TRegExpr.Tail(p: PRegExprChar; val: PRegExprChar); // set the next-pointer at the end of a node chain var scan: PRegExprChar; begin if p = @regDummy then Exit; // Find last node. scan := regLast(p); // Set Next 'pointer' if val < scan then PRENextOff(AlignToPtr(scan + REOpSz))^ := -(scan - val) // work around PWideChar subtraction bug (Delphi uses // shr after subtraction to calculate widechar distance %-( ) // so, if difference is negative we have .. the "feature" :( // I could wrap it in $IFDEF UnicodeRE, but I didn't because // "P – Q computes the difference between the address given // by P (the higher address) and the address given by Q (the // lower address)" - Delphi help quotation. else PRENextOff(AlignToPtr(scan + REOpSz))^ := val - scan; end; { of procedure TRegExpr.Tail -------------------------------------------------------------- } procedure TRegExpr.OpTail(p: PRegExprChar; val: PRegExprChar); // regtail on operand of first argument; nop if operandless begin // "Operandless" and "op != OP_BRANCH" are synonymous in practice. if (p = nil) or (p = @regDummy) or (PREOp(p)^ <> OP_BRANCH) and (PREOp(p)^ <> OP_GBRANCH) and (PREOp(p)^ <> OP_GBRANCH_EX) and (PREOp(p)^ <> OP_GBRANCH_EX_CI) then Exit; Tail(p + REOpSz + RENextOffSz + REBranchArgSz, val); end; { of procedure TRegExpr.OpTail -------------------------------------------------------------- } function TRegExpr.EmitNode(op: TREOp): PRegExprChar; // emit a node, return location begin Result := regCode; if Result <> @regDummy then begin PREOp(regCode)^ := op; Inc(regCode, REOpSz); PRENextOff(AlignToPtr(regCode))^ := 0; // Next "pointer" := nil Inc(regCode, RENextOffSz); if (op = OP_EXACTLY) or (op = OP_EXACTLY_CI) then regExactlyLen := PLongInt(regCode) else regExactlyLen := nil; {$IFDEF DebugSynRegExpr} if regcode - programm > regCodeSize then raise Exception.Create('TRegExpr.EmitNode buffer overrun'); {$ENDIF} end else Inc(regCodeSize, REOpSz + RENextOffSz); // compute code size without code generation end; { of function TRegExpr.EmitNode -------------------------------------------------------------- } function TRegExpr.EmitBranch: PRegExprChar; begin Result := EmitNode(OP_BRANCH); EmitC(#0); EmitC(#0); end; procedure TRegExpr.EmitC(ch: REChar); begin if regCode <> @regDummy then begin regCode^ := ch; Inc(regCode); {$IFDEF DebugSynRegExpr} if regcode - programm > regCodeSize then raise Exception.Create('TRegExpr.EmitC buffer overrun'); {$ENDIF} end else Inc(regCodeSize, REOpSz); // Type of p-code pointer always is ^REChar end; { of procedure TRegExpr.EmitC -------------------------------------------------------------- } procedure TRegExpr.EmitInt(AValue: LongInt); begin if regCode <> @regDummy then begin PLongInt(regCode)^ := AValue; Inc(regCode, RENumberSz); {$IFDEF DebugSynRegExpr} if regcode - programm > regCodeSize then raise Exception.Create('TRegExpr.EmitInt buffer overrun'); {$ENDIF} end else Inc(regCodeSize, RENumberSz); end; function TRegExpr.EmitNodeWithGroupIndex(op: TREOp; AIndex: Integer): PRegExprChar; begin Result := EmitNode(op); EmitInt(AIndex); // TReGroupIndex = LongInt; end; function TRegExpr.EmitGroupRef(AIndex: Integer; AIgnoreCase: Boolean): PRegExprChar; begin if AIgnoreCase then Result := EmitNode(OP_BSUBEXP_CI) else Result := EmitNode(OP_BSUBEXP); EmitInt(AIndex); // TReGroupIndex = LongInt; end; {$IFDEF FastUnicodeData} procedure TRegExpr.FindCategoryName(var scan: PRegExprChar; var ch1, ch2: REChar); // scan: points into regex string after '\p', to find category name // ch1, ch2: 2-char name of category; ch2 can be #0 var ch: REChar; pos1, pos2, namePtr: PRegExprChar; nameLen: Integer; begin ch1 := #0; ch2 := #0; ch := scan^; if IsCategoryFirstChar(ch) then begin ch1 := ch; Exit; end; if ch = '{' then begin pos1 := scan; pos2 := pos1; while (pos2 < fRegexEnd) and (pos2^ <> '}') do Inc(pos2); if pos2 >= fRegexEnd then Error(reeIncorrectBraces); namePtr := pos1+1; nameLen := pos2-pos1-1; Inc(scan, nameLen+1); if nameLen<1 then Error(reeBadUnicodeCategory); if nameLen>2 then Error(reeBadUnicodeCategory); if nameLen = 1 then begin ch1 := namePtr^; ch2 := #0; if not IsCategoryFirstChar(ch1) then Error(reeBadUnicodeCategory); Exit; end; if nameLen = 2 then begin ch1 := namePtr^; ch2 := (namePtr+1)^; if not IsCategoryChars(ch1, ch2) then Error(reeBadUnicodeCategory); Exit; end; end else Error(reeBadUnicodeCategory); end; function TRegExpr.EmitCategoryMain(APositive: Boolean): PRegExprChar; var ch, ch2: REChar; begin Inc(regParse); if regParse >= fRegexEnd then Error(reeBadUnicodeCategory); FindCategoryName(regParse, ch, ch2); if APositive then Result := EmitNode(OP_ANYCATEGORY) else Result := EmitNode(OP_NOTCATEGORY); EmitC(ch); EmitC(ch2); end; {$ENDIF} procedure TRegExpr.InsertOperator(op: TREOp; opnd: PRegExprChar; sz: Integer); // insert an operator in front of already-emitted operand // Means relocating the operand. var src, dst, place: PRegExprChar; i: Integer; begin if regCode = @regDummy then begin Inc(regCodeSize, sz); Exit; end; // move code behind insert position src := regCode; Inc(regCode, sz); {$IFDEF DebugSynRegExpr} if regCode - programm > regCodeSize then raise Exception.Create('TRegExpr.InsertOperator buffer overrun'); if fSecondPass and ( (opndregCodeSize) ) then raise Exception.Create('TRegExpr.InsertOperator invalid opnd'); {$ENDIF} dst := regCode; while src > opnd do begin Dec(dst); Dec(src); dst^ := src^; end; place := opnd; // Op node, where operand used to be. PREOp(place)^ := op; Inc(place, REOpSz); for i := 1 + REOpSz to sz do begin place^ := #0; Inc(place); end; for i := 0 to regNumBrackets - 1 do if (GrpOpCodes[i] <> nil) and (GrpOpCodes[i] >= opnd) then GrpOpCodes[i] := GrpOpCodes[i] + sz; end; { of procedure TRegExpr.InsertOperator -------------------------------------------------------------- } procedure TRegExpr.RemoveOperator(opnd: PRegExprChar; sz: Integer); // remove an operator in front of already-emitted operand // Means relocating the operand. var src, dst: PRegExprChar; i: Integer; begin if regCode = @regDummy then begin // Do not decrement regCodeSize => the fSecondPass may temporary fill the extra memory; Exit; end; // move code behind insert position {$IFDEF DebugSynRegExpr} if fSecondPass and ( (opnd=regCodeWork+regCodeSize) ) then raise Exception.Create('TRegExpr.RemoveOperator() invalid opnd'); if (sz > regCodeSize-(opnd-regCodeWork)) then raise Exception.Create('TRegExpr.RemoveOperator buffer underrun'); {$ENDIF} src := opnd + sz; dst := opnd; while src < regCode do begin dst^ := src^; Inc(dst); Inc(src); end; Dec(regCode, sz); for i := 0 to regNumBrackets - 1 do if (GrpOpCodes[i] <> nil) and (GrpOpCodes[i] > opnd) then GrpOpCodes[i] := GrpOpCodes[i] - sz; end; function FindSkippedMetaLen(PStart, PEnd: PRegExprChar): Integer; {$IFDEF InlineFuncs}inline;{$ENDIF} // find length of initial segment of PStart string consisting // entirely of characters not from IsMetaSymbol1. begin Result := 0; while PStart < PEnd do begin if _IsMetaSymbol1(PStart^) then Exit; Inc(Result); Inc(PStart) end; end; const // Flags to be passed up and down. FLAG_WORST = 0; // Worst case FLAG_HASWIDTH = 1; // Cannot match empty string FLAG_SIMPLE = 2; // Simple enough to be OP_STAR/OP_PLUS/OP_BRACES operand FLAG_SPECSTART = 4; // Starts with * or + FLAG_LOOP = 8; // Has eithe *, + or {,n} with n>=2 FLAG_GREEDY = 16; // Has any greedy code FLAG_NOT_QUANTIFIABLE = 64; // "Piece" (ParsePiece) is look-around {$IFDEF UnicodeRE} RusRangeLoLow = #$430; // 'а' RusRangeLoHigh = #$44F; // 'я' RusRangeHiLow = #$410; // 'А' RusRangeHiHigh = #$42F; // 'Я' {$ELSE} RusRangeLoLow = #$E0; // 'а' in cp1251 RusRangeLoHigh = #$FF; // 'я' in cp1251 RusRangeHiLow = #$C0; // 'А' in cp1251 RusRangeHiHigh = #$DF; // 'Я' in cp1251 {$ENDIF} function TRegExpr.FindInCharClass(ABuffer: PRegExprChar; AChar: REChar): Boolean; // Buffer contains char pairs: (Kind, Data), where Kind is one of OpKind_ values, // and Data depends on Kind var OpKind: REChar; {$IFDEF FastUnicodeData} ch, ch2: REChar; {$ENDIF} N: integer; begin repeat OpKind := ABuffer^; case OpKind of OpKind_End: begin Result := False; Exit; end; OpKind_Range: begin Inc(ABuffer); if (AChar >= ABuffer^) then begin Inc(ABuffer); if (AChar <= ABuffer^) then begin Result := True; Exit; end; Inc(ABuffer); end else Inc(ABuffer, 2); end; OpKind_MetaClass: begin Inc(ABuffer); N := Ord(ABuffer^); if CharCheckers[N](AChar) then begin Result := True; Exit end; Inc(ABuffer); end; OpKind_Char: begin Inc(ABuffer); N := PLongInt(ABuffer)^; Inc(ABuffer, RENumberSz); repeat if ABuffer^ = AChar then begin Result := True; Exit; end; Inc(ABuffer); dec(n); until n = 0; end; {$IFDEF FastUnicodeData} OpKind_CategoryYes, OpKind_CategoryNo: begin Inc(ABuffer); ch := ABuffer^; Inc(ABuffer); ch2 := ABuffer^; Inc(ABuffer); Result := CheckCharCategory(AChar, ch, ch2); if OpKind = OpKind_CategoryNo then Result := not Result; if Result then Exit; end; {$ENDIF} {$IFDEF WITH_REGEX_ASSERT} else Error(reeBadOpcodeInCharClass); {$ENDIF} end; until False; // assume that Buffer is ended correctly end; procedure TRegExpr.GetCharSetFromWordChars(var ARes: TRegExprCharSet); {$IFDEF UseWordChars} var i: Integer; ch: REChar; {$ENDIF} begin {$IFDEF UseWordChars} ARes := []; for i := 1 to Length(fWordChars) do begin ch := fWordChars[i]; {$IFDEF UnicodeRE} if Ord(ch) <= $FF then {$ENDIF} Include(ARes, Byte(ch)); end; {$ELSE} ARes := RegExprWordSet; {$ENDIF} end; procedure TRegExpr.GetCharSetFromSpaceChars(var ARes: TRegExprCharset); {$IFDEF UseSpaceChars} var i: Integer; ch: REChar; {$ENDIF} begin {$IFDEF UseSpaceChars} ARes := []; for i := 1 to Length(fSpaceChars) do begin ch := fSpaceChars[i]; {$IFDEF UnicodeRE} if Ord(ch) <= $FF then {$ENDIF} Include(ARes, Byte(ch)); end; {$ELSE} ARes := RegExprSpaceSet; {$ENDIF} end; procedure TRegExpr.GetCharSetFromCharClass(ABuffer: PRegExprChar; AIgnoreCase: Boolean; var ARes: TRegExprCharset); var ch, ch2: REChar; TempSet: TRegExprCharSet; N, i: Integer; begin ARes := []; TempSet := []; repeat case ABuffer^ of OpKind_End: Exit; OpKind_Range: begin Inc(ABuffer); ch := ABuffer^; Inc(ABuffer); ch2 := ABuffer^; {$IFDEF UnicodeRE} if Ord(ch2) > $FF then ch2 := REChar($FF); {$ENDIF} Inc(ABuffer); for i := Ord(ch) to Ord(ch2) do begin Include(ARes, Byte(i)); if AIgnoreCase then Include(ARes, Byte(InvertCase(REChar(i)))); end; end; OpKind_MetaClass: begin Inc(ABuffer); N := Ord(ABuffer^); Inc(ABuffer); if N = CheckerIndex_Word then begin GetCharSetFromWordChars(TempSet); ARes := ARes + TempSet; end else if N = CheckerIndex_NotWord then begin GetCharSetFromWordChars(TempSet); ARes := ARes + (RegExprAllSet - TempSet); end else if N = CheckerIndex_Space then begin GetCharSetFromSpaceChars(TempSet); ARes := ARes + TempSet; end else if N = CheckerIndex_NotSpace then begin GetCharSetFromSpaceChars(TempSet); ARes := ARes + (RegExprAllSet - TempSet); end else if N = CheckerIndex_Digit then ARes := ARes + RegExprDigitSet else if N = CheckerIndex_NotDigit then ARes := ARes + (RegExprAllSet - RegExprDigitSet) else if N = CheckerIndex_VertSep then ARes := ARes + RegExprLineSeparatorsSet else if N = CheckerIndex_NotVertSep then ARes := ARes + (RegExprAllSet - RegExprLineSeparatorsSet) else if N = CheckerIndex_HorzSep then ARes := ARes + RegExprHorzSeparatorsSet else if N = CheckerIndex_NotHorzSep then ARes := ARes + (RegExprAllSet - RegExprHorzSeparatorsSet) else if N = CheckerIndex_LowerAZ then begin if AIgnoreCase then ARes := ARes + RegExprAllAzSet else ARes := ARes + RegExprLowerAzSet; end else if N = CheckerIndex_UpperAZ then begin if AIgnoreCase then ARes := ARes + RegExprAllAzSet else ARes := ARes + RegExprUpperAzSet; end else if N = CheckerIndex_AnyLineBreak then begin ARes := ARes + RegExprLineSeparatorsSet; //we miss U+2028 and U+2029 here end else Error(reeBadOpcodeInCharClass); end; OpKind_Char: begin Inc(ABuffer); N := PLongInt(ABuffer)^; Inc(ABuffer, RENumberSz); for i := 1 to N do begin ch := ABuffer^; Inc(ABuffer); {$IFDEF UnicodeRE} if Ord(ch) <= $FF then {$ENDIF} begin Include(ARes, Byte(ch)); if AIgnoreCase then Include(ARes, Byte(InvertCase(ch))); end; end; end; {$IFDEF FastUnicodeData} OpKind_CategoryYes, OpKind_CategoryNo: begin // usage of FirstCharSet makes no sense for regex with \p \P ARes := RegExprAllSet; Exit; end; {$ENDIF} {$IFDEF WITH_REGEX_ASSERT} else Error(reeBadOpcodeInCharClass); {$ENDIF} end; until False; // assume that Buffer is ended correctly end; function TRegExpr.GetModifierG: Boolean; begin Result := fModifiers.G; end; function TRegExpr.GetModifierI: Boolean; begin Result := fModifiers.I; end; function TRegExpr.GetModifierM: Boolean; begin Result := fModifiers.M; end; function TRegExpr.GetModifierR: Boolean; begin Result := fModifiers.R; end; function TRegExpr.GetModifierS: Boolean; begin Result := fModifiers.S; end; function TRegExpr.GetModifierX: Boolean; begin Result := fModifiers.X; end; function TRegExpr.CompileRegExpr(ARegExp: PRegExprChar): Boolean; // Compile a regular expression into internal code // We can't allocate space until we know how big the compiled form will be, // but we can't compile it (and thus know how big it is) until we've got a // place to put the code. So we cheat: we compile it twice, once with code // generation turned off and size counting turned on, and once "for real". // This also means that we don't allocate space until we are sure that the // thing really will compile successfully, and we never have to move the // code and thus invalidate pointers into it. (Note that it has to be in // one piece because free() must be able to free it all.) // Beware that the optimization-preparation code in here knows about some // of the structure of the compiled regexp. var scan, scanTemp, longest, longestTemp: PRegExprChar; Len, LenTemp: Integer; FlagTemp, MaxMatchLen: integer; op: TREOp; begin Result := False; FlagTemp := 0; regParse := nil; // for correct error handling regExactlyLen := nil; GrpCount := 0; ParsedGrpCount := 0; GrpNames.Clear; fLastError := reeOk; fLastErrorOpcode := TREOp(0); hasRecursion := False; try if programm <> nil then begin FreeMem(programm); programm := nil; end; if ARegExp = nil then begin Error(reeCompNullArgument); Exit; end; fProgModifiers := fModifiers; // well, may it's paranoia. I'll check it later. // First pass: calculate opcode size, validate regex fSecondPass := False; fCompModifiers := fModifiers; regParse := ARegExp; regNumBrackets := 1; regNumAtomicBrackets := 0; regCodeSize := 0; regCode := @regDummy; regCodeWork := nil; EmitC(OP_MAGIC); if ParseReg(FlagTemp) = nil then begin regNumBrackets := 0; // Not calling InitInternalGroupData => array sizes not adjusted for FillChar regNumAtomicBrackets := 0; Exit; end; // Allocate memory GetMem(programm, regCodeSize * SizeOf(REChar)); InitInternalGroupData; // Second pass: emit opcode fSecondPass := True; fCompModifiers := fModifiers; regParse := ARegExp; regNumBrackets := 1; regNumAtomicBrackets := 0; GrpCount := ParsedGrpCount; ParsedGrpCount := 0; regCode := programm; regCodeWork := programm + REOpSz; EmitC(OP_MAGIC); if ParseReg(FlagTemp) = nil then Exit; // Dig out information for optimizations. IsFixedLengthEx(op, FMinMatchLen, MaxMatchLen); {$IFDEF UseFirstCharSet} FirstCharSet := []; FillFirstCharSet(regCodeWork); for Len := 0 to 255 do FirstCharArray[Len] := Byte(Len) in FirstCharSet; {$ENDIF} regAnchored := raNone; regMust := nil; regMustLen := 0; regMustString := ''; scan := regCodeWork; // First OP_BRANCH. // Starting-point info. if PREOp(scan)^ = OP_BOL then regAnchored := raBOL else if PREOp(scan)^ = OP_EOL then regAnchored := raEOL else if PREOp(scan)^ = OP_CONTINUE_POS then regAnchored := raContinue else // ".*", ".*?", ".*+" at the very start of the pattern, only need to be // tested from the start-pos of the InputString. // If a pattern matches, then the ".*" will always go forward to where the // rest of the pattern starts matching // OP_ANY is "ModifierS=True" if (PREOp(scan)^ = OP_STAR) or (PREOp(scan)^ = OP_STAR_NG) or (PREOp(scan)^ = OP_STAR_POSS) then begin scanTemp := AlignToInt(scan + REOpSz + RENextOffSz); if PREOp(scanTemp)^ = OP_ANY then regAnchored := raOnlyOnce; end else // "{0,} is the same as ".*". So the same optimization applies if (PREOp(scan)^ = OP_BRACES) or (PREOp(scan)^ = OP_BRACES_NG) or (PREOp(scan)^ = OP_BRACES_POSS) then begin scanTemp := AlignToInt(scan + REOpSz + RENextOffSz); if (PREBracesArg(scanTemp)^ = 0) // BracesMinCount and (PREBracesArg(scanTemp + REBracesArgSz)^ = MaxBracesArg) // BracesMaxCount then begin scanTemp := AlignToPtr(scanTemp + REBracesArgSz + REBracesArgSz); if PREOp(scanTemp)^ = OP_ANY then regAnchored := raOnlyOnce; end; end; // If there's something expensive in the r.e., find the longest // literal string that must appear and make it the regMust. Resolve // ties in favor of later strings, since the regstart check works // with the beginning of the r.e. and avoiding duplication // strengthens checking. Not a strong reason, but sufficient in the // absence of others. if (FlagTemp and FLAG_SPECSTART) <> 0 then begin longest := nil; Len := 0; while scan <> nil do begin if PREOp(scan)^ = OP_EXACTLY then begin longestTemp := scan + REOpSz + RENextOffSz + RENumberSz; LenTemp := PLongInt(scan + REOpSz + RENextOffSz)^; if LenTemp >= Len then begin longest := longestTemp; Len := LenTemp; end; end; scan := regNext(scan); end; regMust := longest; regMustLen := Len; if regMustLen > 1 then // don't use regMust if too short SetString(regMustString, regMust, regMustLen); end; Result := True; finally begin if not Result then InvalidateProgramm; end; end; end; { of function TRegExpr.CompileRegExpr -------------------------------------------------------------- } function TRegExpr.ParseReg(var FlagParse: Integer): PRegExprChar; begin Result := DoParseReg(False, nil, FlagParse, OP_NONE, OP_COMMENT); // can't use OP_NONE // The "ender" op will not be omitted anyway end; function TRegExpr.DoParseReg(InBrackets: Boolean; BracketCounter: PInteger; var FlagParse: Integer; BeginGroupOp, EndGroupOP: TReOp): PRegExprChar; // regular expression, i.e. main body or parenthesized thing // Caller must absorb opening parenthesis. // Combining parenthesis handling with the base level of regular expression // is a trifle forced, but the need to tie the tails of the branches to what // follows makes it hard to avoid. var ret, br, ender, brStart: PRegExprChar; NBrackets: Integer; FlagTemp: Integer; SavedModifiers: TRegExprModifiers; HasGBranch, HasChoice: Boolean; begin Result := nil; FlagTemp := 0; FlagParse := FLAG_HASWIDTH; // Tentatively. NBrackets := 0; SavedModifiers := fCompModifiers; // Make an OP_OPEN node, if parenthesized. ret := nil; if InBrackets then begin if BracketCounter <> nil then begin if BracketCounter^ >= RegexMaxMaxGroups then begin Error(reeCompParseRegTooManyBrackets); Exit; end; NBrackets := BracketCounter^; Inc(BracketCounter^); if BeginGroupOp <> OP_NONE then ret := EmitNodeWithGroupIndex(BeginGroupOp, NBrackets); if fSecondPass and (BracketCounter = @regNumBrackets) then GrpOpCodes[NBrackets] := ret; end else if BeginGroupOp <> OP_NONE then ret := EmitNode(BeginGroupOp); end; // Pick up the branches, linking them together. br := ParseBranch(FlagTemp); brStart := br; if br = nil then begin Result := nil; Exit; end; if ret <> nil then Tail(ret, br) // OP_OPEN -> first. else ret := br; if (FlagTemp and FLAG_HASWIDTH) = 0 then FlagParse := FlagParse and not FLAG_HASWIDTH; FlagParse := FlagParse or FlagTemp and (FLAG_SPECSTART or FLAG_LOOP or FLAG_GREEDY); HasGBranch := False; HasChoice := regParse^ = '|'; while (regParse^ = '|') do begin Inc(regParse); br := ParseBranch(FlagTemp); if br = nil then begin Result := nil; Exit; end; if br^ <> OP_BRANCH then HasGBranch := True; Tail(ret, br); // OP_BRANCH -> OP_BRANCH. if (FlagTemp and FLAG_HASWIDTH) = 0 then FlagParse := FlagParse and not FLAG_HASWIDTH; FlagParse := FlagParse or FlagTemp and (FLAG_SPECSTART or FLAG_LOOP or FLAG_GREEDY); end; if fSecondPass then begin if HasGBranch then begin if brStart^ = OP_BRANCH then brStart^ := OP_GBRANCH; end else if not HasChoice then RemoveOperator(brStart, REOpSz + RENextOffSz + REBranchArgSz); end; // Make a closing node, and hook it on the end. if InBrackets and (EndGroupOP <> OP_NONE) then begin if BracketCounter <> nil then ender := EmitNodeWithGroupIndex(EndGroupOP, NBrackets) else ender := EmitNode(EndGroupOP); end else if (EndGroupOP = OP_NONE) then begin if HasChoice then ender := EmitNode(OP_COMMENT) // need something to hook the branches' tails too else ender := nil; end else ender := EmitNode(OP_EEND); if ender <> nil then begin Tail(ret, ender); // Hook the tails of the branches to the closing node. br := ret; while br <> nil do begin OpTail(br, ender); br := regNext(br); end; end; // Check for proper termination. if InBrackets then if regParse^ <> ')' then begin Error(reeCompParseRegUnmatchedBrackets); Exit; end else Inc(regParse); // skip trailing ')' if (not InBrackets) and (regParse < fRegexEnd) then begin if regParse^ = ')' then Error(reeCompParseRegUnmatchedBrackets2) else Error(reeCompParseRegJunkOnEnd); Exit; end; fCompModifiers := SavedModifiers; // restore modifiers of parent Result := ret; end; { of function TRegExpr.ParseReg -------------------------------------------------------------- } function TRegExpr.ParseBranch(var FlagParse: Integer): PRegExprChar; // one alternative of an | operator // Implements the concatenation operator. var ret, chain, latest: PRegExprChar; FlagTemp: Integer; begin FlagTemp := 0; FlagParse := FLAG_WORST; // Tentatively. ret := EmitBranch; chain := nil; while (regParse < fRegexEnd) and (regParse^ <> '|') and (regParse^ <> ')') do begin latest := ParsePiece(FlagTemp); if latest = nil then begin Result := nil; Exit; end; if fSecondPass and (latest <> nil) and (latest^ = OP_COMMENT) and ( ((regParse < fRegexEnd) and (regParse^ <> '|') and (regParse^ <> ')')) or (chain <> nil) ) then begin regCode := latest; continue; end; FlagParse := FlagParse or FlagTemp and (FLAG_HASWIDTH or FLAG_LOOP or FLAG_GREEDY); if chain = nil // First piece. then begin FlagParse := FlagParse or FlagTemp and FLAG_SPECSTART; MaybeGuardBranchPiece(ret); end else Tail(chain, latest); chain := latest; end; if chain = nil // Loop ran zero times. then EmitNode(OP_NOTHING); Result := ret; end; { of function TRegExpr.ParseBranch -------------------------------------------------------------- } procedure TRegExpr.MaybeGuardBranchPiece(piece: PRegExprChar); var opnd: PRegExprChar; ch: REChar; begin if not fSecondPass then exit; opnd := piece + REOpSz + RENextOffSz + REBranchArgSz; while opnd <> nil do begin case opnd^ of OP_OPEN, OP_OPEN_ATOMIC, OP_CLOSE, OP_CLOSE_ATOMIC, OP_COMMENT, OP_BOL, OP_CONTINUE_POS, OP_RESET_MATCHPOS, OP_BOUND, OP_NOTBOUND, OP_BACK: opnd := regNext(opnd); OP_PLUS, OP_PLUS_NG, OP_PLUS_POSS: opnd := opnd + REOpSz + RENextOffSz; OP_BRACES, OP_BRACES_NG, OP_BRACES_POSS: begin if PREBracesArg(AlignToPtr(opnd + REOpSz + RENextOffSz))^ >= 1 then opnd := opnd + REOpSz + RENextOffSz + 2*REBracesArgSz; break; end; OP_LOOPENTRY: begin if PREBracesArg(AlignToInt(regNext(opnd) + REOpSz + RENextOffSz))^ >= 1 then opnd := opnd + REOpSz + RENextOffSz; break; end; OP_LOOKAHEAD: // could contain OP_OPEN.... begin if ( ((opnd + 1 + RENextOffSz)^ = OP_EXACTLY) or ((opnd + 1 + RENextOffSz)^ = OP_EXACTLY_CI) ) then begin opnd := (opnd + 1 + RENextOffSz); break; end else opnd := regNext(regNext(opnd)); end; OP_LOOKAHEAD_NEG, OP_LOOKBEHIND, OP_LOOKBEHIND_NEG: opnd := regNext(regNext(opnd)); else break; end; end; if opnd <> nil then case opnd^ of OP_EXACTLY: begin piece^ := OP_GBRANCH_EX; ch := (opnd + REOpSz + RENextOffSz + RENumberSz)^; (piece + REOpSz + RENextOffSz)^ := ch; end; OP_EXACTLY_CI: begin piece^ := OP_GBRANCH_EX_CI; ch := (opnd + REOpSz + RENextOffSz + RENumberSz)^; (piece + REOpSz + RENextOffSz)^ := _UpperCase(ch); (piece + REOpSz + RENextOffSz + 1)^ := _LowerCase(ch); end; end; end; function TRegExpr.ParsePiece(var FlagParse: Integer): PRegExprChar; // something followed by possible [*+?{] // Note that the branching code sequences used for ? and the general cases // of * and + and { are somewhat optimized: they use the same OP_NOTHING node as // both the endmarker for their branch list and the body of the last branch. // It might seem that this node could be dispensed with entirely, but the // endmarker role is not redundant. function ParseNumber(AStart, AEnd: PRegExprChar): TREBracesArg; begin Result := 0; if AEnd - AStart + 1 > 8 then begin // prevent stupid scanning Error(reeBRACESArgTooBig); Exit; end; while AStart <= AEnd do begin Result := Result * 10 + (Ord(AStart^) - Ord('0')); Inc(AStart); end; if (Result > MaxBracesArg) or (Result < 0) then begin Error(reeBRACESArgTooBig); Exit; end; end; var TheOp: TREOp; NextNode: PRegExprChar; procedure EmitComplexBraces(ABracesMin, ABracesMax: TREBracesArg; ANonGreedyOp, APossesive: boolean); {$IFDEF ComplexBraces} var off: TRENextOff; {$ENDIF} begin {$IFNDEF ComplexBraces} Error(reeComplexBracesNotImplemented); {$ELSE} if APossesive then TheOp := OP_LOOP_POSS else if ANonGreedyOp then TheOp := OP_LOOP_NG else TheOp := OP_LOOP; InsertOperator(OP_LOOPENTRY, Result, REOpSz + RENextOffSz); NextNode := EmitNode(TheOp); if regCode <> @regDummy then begin off := (Result + REOpSz + RENextOffSz) - (regCode - REOpSz - RENextOffSz); // back to Atom after OP_LOOPENTRY PREBracesArg(AlignToInt(regCode))^ := ABracesMin; Inc(regCode, REBracesArgSz); PREBracesArg(AlignToInt(regCode))^ := ABracesMax; Inc(regCode, REBracesArgSz); PRENextOff(AlignToPtr(regCode))^ := off; Inc(regCode, RENextOffSz); {$IFDEF DebugSynRegExpr} if regcode - programm > regCodeSize then raise Exception.Create ('TRegExpr.ParsePiece.EmitComplexBraces buffer overrun'); {$ENDIF} end else Inc(regCodeSize, REBracesArgSz * 2 + RENextOffSz); Tail(Result, NextNode); // OP_LOOPENTRY -> OP_LOOP if regCode <> @regDummy then Tail(Result + REOpSz + RENextOffSz, NextNode); // Atom -> OP_LOOP {$ENDIF} end; procedure EmitSimpleBraces(ABracesMin, ABracesMax: TREBracesArg; ANonGreedyOp, APossessive: Boolean); begin if APossessive then TheOp := OP_BRACES_POSS else if ANonGreedyOp then TheOp := OP_BRACES_NG else TheOp := OP_BRACES; InsertOperator(TheOp, Result, REOpSz + RENextOffSz + REBracesArgSz * 2); if regCode <> @regDummy then begin PREBracesArg(AlignToInt(Result + REOpSz + RENextOffSz))^ := ABracesMin; PREBracesArg(AlignToInt(Result + REOpSz + RENextOffSz + REBracesArgSz))^ := ABracesMax; end; end; function DoParseBraceMinMax(var BMin, BMax: TREBracesArg): Boolean; var p: PRegExprChar; begin Result := False; p := regParse; while IsDigitChar(regParse^) do // MUST appear Inc(regParse); if FAllowBraceWithoutMin and (regParse^ = ',') and (p = regParse) then begin if not (((regParse+1)^ >= '0') and ((regParse+1)^ <= '9')) then Exit; BMin := 0 end else if (regParse^ <> '}') and (regParse^ <> ',') or (p = regParse) then begin if not FAllowLiteralBraceWithoutRange then Error(reeIncorrectBraces); Exit; end else BMin := ParseNumber(p, regParse - 1); if regParse^ = ',' then begin Inc(regParse); p := regParse; while IsDigitChar(regParse^) do Inc(regParse); if regParse^ <> '}' then begin if not FAllowLiteralBraceWithoutRange then Error(reeIncorrectBraces); Exit; end; if p = regParse then BMax := MaxBracesArg else BMax := ParseNumber(p, regParse - 1); end else BMax := BMin; // {n} == {n,n} Result := True; end; function ParseBraceMinMax(var BMin, BMax: TREBracesArg): Boolean; begin Result := DoParseBraceMinMax(BMin, BMax); if Result and (BMin > BMax) then begin Error(reeBracesMinParamGreaterMax); Exit; end; end; function CheckBraceIsLiteral: Boolean; var dummyBracesMin, dummyBracesMax: TREBracesArg; savedRegParse: PRegExprChar; begin Result := False; if not FAllowLiteralBraceWithoutRange then exit; savedRegParse := regParse; Inc(regParse); Result := not DoParseBraceMinMax(dummyBracesMin, dummyBracesMax); regParse := savedRegParse; end; var op, nextch: REChar; NonGreedyOp, NonGreedyCh, PossessiveCh: Boolean; FlagTemp: Integer; BracesMin, BracesMax: TREBracesArg; savedRegParse: PRegExprChar; begin FlagTemp := 0; Result := ParseAtom(FlagTemp); if Result = nil then Exit; op := regParse^; if not ((op = '*') or (op = '+') or (op = '?') or (op = '{')) then begin FlagParse := FlagTemp; Exit; end; case op of '*': begin if (FlagTemp and FLAG_NOT_QUANTIFIABLE) <> 0 then begin Error(reeNotQuantifiable); exit; end; FlagParse := FLAG_WORST or FLAG_SPECSTART or FLAG_LOOP; nextch := (regParse + 1)^; PossessiveCh := nextch = '+'; if PossessiveCh then begin NonGreedyCh := False; NonGreedyOp := False; end else begin NonGreedyCh := nextch = '?'; NonGreedyOp := NonGreedyCh or not fCompModifiers.G; end; if not NonGreedyCh then FlagParse := FlagParse or FLAG_GREEDY; if (FlagTemp and (FLAG_SIMPLE or FLAG_HASWIDTH)) <> (FLAG_SIMPLE or FLAG_HASWIDTH) then begin if NonGreedyOp or PossessiveCh or ((FlagTemp and FLAG_HASWIDTH) = 0) then EmitComplexBraces(0, MaxBracesArg, NonGreedyOp, PossessiveCh) else begin // Too complex for OP_STAR. Write loop using OP_BRANCH and OP_BACK. // 1: OP_BRANCH with 2 branches - to allow backtracking // 1st choice: loop-content // OP_BACK back to the branch // execute another iteration of the branch, so each can backtrack // 2nd choice: OP_NOTHING to exit InsertOperator(OP_BRANCH, Result, REOpSz + RENextOffSz + REBranchArgSz); OpTail(Result, EmitNode(OP_BACK)); OpTail(Result, Result); Tail(Result, EmitBranch); Tail(Result, EmitNode(OP_NOTHING)); MaybeGuardBranchPiece(Result); end end else begin // Simple AND has Width if PossessiveCh then TheOp := OP_STAR_POSS else if NonGreedyOp then TheOp := OP_STAR_NG else TheOp := OP_STAR; InsertOperator(TheOp, Result, REOpSz + RENextOffSz); end; if NonGreedyCh or PossessiveCh then Inc(regParse); // Skip extra char ('?') end; { of case '*' } '+': begin if (FlagTemp and FLAG_NOT_QUANTIFIABLE) <> 0 then begin Error(reeNotQuantifiable); exit; end; FlagParse := FLAG_WORST or FLAG_SPECSTART or (FlagTemp and FLAG_HASWIDTH) or FLAG_LOOP; nextch := (regParse + 1)^; PossessiveCh := nextch = '+'; if PossessiveCh then begin NonGreedyCh := False; NonGreedyOp := False; end else begin NonGreedyCh := nextch = '?'; NonGreedyOp := NonGreedyCh or not fCompModifiers.G; end; if not NonGreedyCh then FlagParse := FlagParse or FLAG_GREEDY; if (FlagTemp and (FLAG_SIMPLE or FLAG_HASWIDTH)) <> (FLAG_SIMPLE or FLAG_HASWIDTH) then begin if NonGreedyOp or PossessiveCh or ((FlagTemp and FLAG_HASWIDTH) = 0) then EmitComplexBraces(1, MaxBracesArg, NonGreedyOp, PossessiveCh) else begin // Too complex for OP_PLUS. Write loop using OP_BRANCH and OP_BACK. // 1: loop-content // 2: OP_BRANCH with 2 choices - to allow backtracking // 2a: OP_BACK(1) to match the loop again (goto back, include another iteration of the branch in this choice) // 2b: OP_NOTHING to exit, if the loop can match no more (branch 2a did not match) NextNode := EmitBranch; Tail(Result, NextNode); Tail(EmitNode(OP_BACK), Result); Tail(NextNode, EmitBranch); Tail(Result, EmitNode(OP_NOTHING)); MaybeGuardBranchPiece(NextNode); end end else begin // Simple if PossessiveCh then TheOp := OP_PLUS_POSS else if NonGreedyOp then TheOp := OP_PLUS_NG else TheOp := OP_PLUS; InsertOperator(TheOp, Result, REOpSz + RENextOffSz); end; if NonGreedyCh or PossessiveCh then Inc(regParse); // Skip extra char ('?') end; { of case '+' } '?': begin FlagParse := FLAG_WORST; nextch := (regParse + 1)^; PossessiveCh := nextch = '+'; if PossessiveCh then begin NonGreedyCh := False; NonGreedyOp := False; end else begin NonGreedyCh := nextch = '?'; NonGreedyOp := NonGreedyCh or not fCompModifiers.G; end; if not NonGreedyCh then FlagParse := FlagParse or FLAG_GREEDY; if NonGreedyOp or PossessiveCh then begin // We emit x?? as x{0,1}? if (FlagTemp and FLAG_SIMPLE) = 0 then begin EmitComplexBraces(0, 1, NonGreedyOp, PossessiveCh); end else EmitSimpleBraces(0, 1, NonGreedyOp, PossessiveCh); end else begin // greedy '?' InsertOperator(OP_BRANCH, Result, REOpSz + RENextOffSz + REBranchArgSz); // Either x Tail(Result, EmitBranch); // or NextNode := EmitNode(OP_NOTHING); // nil. Tail(Result, NextNode); OpTail(Result, NextNode); MaybeGuardBranchPiece(Result); end; if NonGreedyCh or PossessiveCh then Inc(regParse); // Skip extra char ('?') end; { of case '?' } '{': begin savedRegParse := regParse; Inc(regParse); if not ParseBraceMinMax(BracesMin, BracesMax) then begin regParse := savedRegParse; Exit; end; if (FlagTemp and FLAG_NOT_QUANTIFIABLE) <> 0 then begin Error(reeNotQuantifiable); exit; end; if BracesMin > 0 then FlagParse := FLAG_WORST or (FlagTemp and FLAG_HASWIDTH); if BracesMax > 0 then FlagParse := FlagParse or FLAG_SPECSTART; nextch := (regParse + 1)^; PossessiveCh := nextch = '+'; if PossessiveCh then begin NonGreedyCh := False; NonGreedyOp := False; end else begin NonGreedyCh := nextch = '?'; NonGreedyOp := NonGreedyCh or not fCompModifiers.G; end; if not NonGreedyCh then FlagParse := FlagParse or FLAG_GREEDY; if BracesMax >= 2 then FlagParse := FlagParse or FLAG_LOOP; if (FlagTemp and (FLAG_SIMPLE or FLAG_HASWIDTH)) = (FLAG_SIMPLE or FLAG_HASWIDTH) then EmitSimpleBraces(BracesMin, BracesMax, NonGreedyOp, PossessiveCh) else begin EmitComplexBraces(BracesMin, BracesMax, NonGreedyOp, PossessiveCh); end; if NonGreedyCh or PossessiveCh then Inc(regParse); // Skip extra char '?' end; // of case '{' // else // here we can't be end; { of case op } FlagParse := FlagParse or FlagTemp and (FLAG_LOOP or FLAG_GREEDY); Inc(regParse); op := regParse^; if (op = '*') or (op = '+') or (op = '?') or ( (op = '{') and not CheckBraceIsLiteral) then Error(reeNestedQuantif); end; { of function TRegExpr.ParsePiece -------------------------------------------------------------- } function TRegExpr.HexDig(Ch: REChar): Integer; begin case Ch of '0' .. '9': Result := Ord(Ch) - Ord('0'); 'a' .. 'f': Result := Ord(Ch) - Ord('a') + 10; 'A' .. 'F': Result := Ord(Ch) - Ord('A') + 10; else Result := 0; Error(reeBadHexDigit); end; end; function TRegExpr.UnQuoteChar(var APtr, AEnd: PRegExprChar): REChar; var Ch: REChar; begin case APtr^ of 't': Result := #$9; // \t => tab (HT/TAB) 'n': Result := #$a; // \n => newline (NL) 'r': Result := #$d; // \r => carriage return (CR) 'f': Result := #$c; // \f => form feed (FF) 'a': Result := #$7; // \a => alarm (bell) (BEL) 'e': Result := #$1b; // \e => escape (ESC) 'c': begin // \cK => code for Ctrl+K Result := #0; Inc(APtr); if APtr >= AEnd then Error(reeNoLetterAfterBSlashC); Ch := APtr^; case Ch of 'a' .. 'z': Result := REChar(Ord(Ch) - Ord('a') + 1); 'A' .. 'Z': Result := REChar(Ord(Ch) - Ord('A') + 1); else Error(reeNoLetterAfterBSlashC); end; end; 'x': begin // \x: hex char Result := #0; Inc(APtr); if APtr >= AEnd then begin Error(reeNoHexCodeAfterBSlashX); Exit; end; if APtr^ = '{' then begin // \x{nnnn} repeat Inc(APtr); if APtr >= AEnd then begin Error(reeNoHexCodeAfterBSlashX); Exit; end; if APtr^ <> '}' then begin if (Ord(Result) ShR (SizeOf(REChar) * 8 - 4)) and $F <> 0 then begin Error(reeHexCodeAfterBSlashXTooBig); Exit; end; Result := REChar((Ord(Result) ShL 4) or HexDig(APtr^)); // HexDig will cause Error if bad hex digit found end else Break; until False; end else begin Result := REChar(HexDig(APtr^)); // HexDig will cause Error if bad hex digit found Inc(APtr); if APtr >= AEnd then begin Error(reeNoHexCodeAfterBSlashX); Exit; end; Result := REChar((Ord(Result) ShL 4) or HexDig(APtr^)); // HexDig will cause Error if bad hex digit found end; end; else Result := APtr^; if (Result <> '_') and IsWordChar(Result) then begin fLastErrorSymbol := Result; Error(reeUnknownMetaSymbol); end; end; end; function TRegExpr.ParseAtom(var FlagParse: Integer): PRegExprChar; // the lowest level // Optimization: gobbles an entire sequence of ordinary characters so that // it can turn them into a single node, which is smaller to store and // faster to run. Backslashed characters are exceptions, each becoming a // separate node; the code is simpler that way and it's not worth fixing. var ret, ret2, regLookBehindOption: PRegExprChar; RangeBeg, RangeEnd: REChar; CanBeRange: Boolean; AddrOfLen: PLongInt; HasCaseSenseChars: boolean; function ParseNumber(var AParsePos: PRegExprChar; out ANumber: Integer): Boolean; begin Result := False; ANumber := 0; while (AParsePos^ >= '0') and (AParsePos^ <= '9') do begin if ANumber > (High(ANumber)-10) div 10 then exit; ANumber := ANumber * 10 + (Ord(AParsePos^) - Ord('0')); inc(AParsePos); end; Result := True; end; procedure EmitExactly(Ch: REChar); var cs: Boolean; begin if fCompModifiers.I then ret := EmitNode(OP_EXACTLY_CI) else ret := EmitNode(OP_EXACTLY); EmitInt(1); cs := False; if fCompModifiers.I then begin Ch := _UpperCase(Ch); EmitC(Ch); if Ch <> _LowerCase(Ch) then cs := True; end else EmitC(Ch); if not cs then PREOp(ret)^ := OP_EXACTLY; FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE; end; procedure EmitRangeChar(Ch: REChar; AStartOfRange: Boolean); begin CanBeRange := AStartOfRange; if fCompModifiers.I then begin Ch := _UpperCase(Ch); if Ch <> _LowerCase(Ch) then HasCaseSenseChars := True; end; if AStartOfRange then begin AddrOfLen := nil; RangeBeg := Ch; end else begin if AddrOfLen = nil then begin EmitC(OpKind_Char); Pointer(AddrOfLen) := regCode; EmitInt(0); end; Inc(AddrOfLen^); EmitC(Ch); end; end; procedure EmitRangePacked(ch1, ch2: REChar); var ChkIndex: Integer; begin AddrOfLen := nil; CanBeRange := False; if fCompModifiers.I then begin ch1 := _UpperCase(ch1); ch2 := _UpperCase(ch2); if (Ch1 <> _LowerCase(Ch1)) or (Ch2 <> _LowerCase(Ch2)) then HasCaseSenseChars := True; end; for ChkIndex := Low(CharCheckerInfos) to High(CharCheckerInfos) do if (CharCheckerInfos[ChkIndex].CharBegin = ch1) and (CharCheckerInfos[ChkIndex].CharEnd = ch2) then begin EmitC(OpKind_MetaClass); EmitC(REChar(CharCheckerInfos[ChkIndex].CheckerIndex)); Exit; end; EmitC(OpKind_Range); EmitC(ch1); EmitC(ch2); end; {$IFDEF FastUnicodeData} procedure EmitCategoryInCharClass(APositive: Boolean); var ch, ch2: REChar; begin AddrOfLen := nil; CanBeRange := False; Inc(regParse); FindCategoryName(regParse, ch, ch2); if APositive then EmitC(OpKind_CategoryYes) else EmitC(OpKind_CategoryNo); EmitC(ch); EmitC(ch2); end; {$ENDIF} var FlagTemp: Integer; Len: Integer; SavedPtr: PRegExprChar; EnderChar, TempChar: REChar; DashForRange: Boolean; GrpKind: TREGroupKind; GrpName: RegExprString; GrpIndex, ALen, RegGrpCountBefore, AMaxLen: integer; NextCh: REChar; op: TREOp; SavedModifiers: TRegExprModifiers; begin Result := nil; FlagTemp := 0; FlagParse := FLAG_WORST; AddrOfLen := nil; GrpIndex := -1; Inc(regParse); case (regParse - 1)^ of '^': begin FlagParse := FlagParse or FLAG_NOT_QUANTIFIABLE; if not fCompModifiers.M {$IFDEF UseLineSep} or (fLineSeparators = '') {$ENDIF} then ret := EmitNode(OP_BOL) else ret := EmitNode(OP_BOL_ML); end; '$': begin FlagParse := FlagParse or FLAG_NOT_QUANTIFIABLE; if not fCompModifiers.M {$IFDEF UseLineSep} or (fLineSeparators = '') {$ENDIF} then ret := EmitNode(OP_EOL) else ret := EmitNode(OP_EOL_ML); end; '.': begin if fCompModifiers.S then begin ret := EmitNode(OP_ANY); FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE; end else begin // not /s, so emit [^:LineSeparators:] ret := EmitNode(OP_ANY_ML); FlagParse := FlagParse or FLAG_HASWIDTH; // not so simple ;) end; end; '[': begin HasCaseSenseChars := False; if regParse^ = '^' then begin // Complement of range. if fCompModifiers.I then ret := EmitNode(OP_ANYBUT_CI) else ret := EmitNode(OP_ANYBUT); Inc(regParse); end else if fCompModifiers.I then ret := EmitNode(OP_ANYOF_CI) else ret := EmitNode(OP_ANYOF); CanBeRange := False; if regParse^ = ']' then begin // first ']' inside [] treated as simple char, no need to check '[' EmitRangeChar(regParse^, (regParse + 1)^ = '-'); Inc(regParse); end; while (regParse < fRegexEnd) and (regParse^ <> ']') do begin // last '-' inside [] treated as simple dash if (regParse^ = '-') and ((regParse + 1) < fRegexEnd) and ((regParse + 1)^ = ']') then begin EmitRangeChar('-', False); Inc(regParse); Break; end; // char '-' which (maybe) makes a range if (regParse^ = '-') and ((regParse + 1) < fRegexEnd) and CanBeRange then begin Inc(regParse); RangeEnd := regParse^; if RangeEnd = EscChar then begin if _IsMetaChar((regParse + 1)^) then begin Error(reeMetaCharAfterMinusInRange); Exit; end; Inc(regParse); RangeEnd := UnQuoteChar(regParse, fRegexEnd); end; // special handling for Russian range a-YA, add 2 ranges: a-ya and A-YA if fCompModifiers.R and (RangeBeg = RusRangeLoLow) and (RangeEnd = RusRangeHiHigh) then begin EmitRangePacked(RusRangeLoLow, RusRangeLoHigh); EmitRangePacked(RusRangeHiLow, RusRangeHiHigh); end else begin // standard r.e. handling if RangeBeg > RangeEnd then begin Error(reeInvalidRange); Exit; end; EmitRangePacked(RangeBeg, RangeEnd); end; Inc(regParse); end else begin if regParse^ = EscChar then begin Inc(regParse); if regParse >= fRegexEnd then begin Error(reeParseAtomTrailingBackSlash); Exit; end; if _IsMetaChar(regParse^) then begin AddrOfLen := nil; CanBeRange := False; EmitC(OpKind_MetaClass); case regParse^ of 'w': EmitC(REChar(CheckerIndex_Word)); 'W': EmitC(REChar(CheckerIndex_NotWord)); 's': EmitC(REChar(CheckerIndex_Space)); 'S': EmitC(REChar(CheckerIndex_NotSpace)); 'd': EmitC(REChar(CheckerIndex_Digit)); 'D': EmitC(REChar(CheckerIndex_NotDigit)); 'v': EmitC(REChar(CheckerIndex_VertSep)); 'V': EmitC(REChar(CheckerIndex_NotVertSep)); 'h': EmitC(REChar(CheckerIndex_HorzSep)); 'H': EmitC(REChar(CheckerIndex_NotHorzSep)); 'R': EmitC(REChar(CheckerIndex_AnyLineBreak)); else Error(reeBadOpcodeInCharClass); end; end else {$IFDEF FastUnicodeData} if regParse^ = 'p' then EmitCategoryInCharClass(True) else if regParse^ = 'P' then EmitCategoryInCharClass(False) else {$ENDIF} begin TempChar := UnQuoteChar(regParse, fRegexEnd); // False if '-' is last char in [] DashForRange := (regParse + 2 < fRegexEnd) and ((regParse + 1)^ = '-') and ((regParse + 2)^ <> ']'); EmitRangeChar(TempChar, DashForRange); end; end else begin // False if '-' is last char in [] DashForRange := (regParse + 2 < fRegexEnd) and ((regParse + 1)^ = '-') and ((regParse + 2)^ <> ']'); EmitRangeChar(regParse^, DashForRange); end; Inc(regParse); end; end; { of while } AddrOfLen := nil; CanBeRange := False; EmitC(OpKind_End); if fCompModifiers.I and not HasCaseSenseChars then begin if PREOp(ret)^ = OP_ANYBUT_CI then PREOp(ret)^ := OP_ANYBUT; if PREOp(ret)^ = OP_ANYOF_CI then PREOp(ret)^ := OP_ANYOF; end; if regParse^ <> ']' then begin Error(reeUnmatchedSqBrackets); Exit; end; Inc(regParse); FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE; end; '(': begin GrpKind := gkNormalGroup; GrpName := ''; // A: detect kind of expression in brackets if regParse^ = '?' then begin NextCh := (regParse + 1)^; case NextCh of ':': begin // non-capturing group: (?:regex) GrpKind := gkNonCapturingGroup; Inc(regParse, 2); end; '>': begin // atomic group: (?>regex) GrpKind := gkAtomicGroup; Inc(regParse, 2); end; 'P': begin if (regParse + 4 >= fRegexEnd) then Error(reeNamedGroupBad); case (regParse + 2)^ of '<': begin // named group: (?Pregex) GrpKind := gkNormalGroup; FindGroupName(regParse + 3, fRegexEnd, '>', GrpName); Inc(regParse, Length(GrpName) + 4); end; '=': begin // back-reference to named group: (?P=name) GrpKind := gkNamedGroupReference; FindGroupName(regParse + 3, fRegexEnd, ')', GrpName); Inc(regParse, Length(GrpName) + 4); end; '>': begin // subroutine call to named group: (?P>name) GrpKind := gkSubCall; FindGroupName(regParse + 3, fRegexEnd, ')', GrpName); Inc(regParse, Length(GrpName) + 4); if fSecondPass then begin GrpIndex := GrpNames.MatchIndexFromName(GrpName); if GrpIndex < 1 then Error(reeNamedGroupBadRef); end; end; else Error(reeNamedGroupBad); end; end; '<': begin // lookbehind: (?<=foo)bar case (regParse + 2)^ of '=': begin if (regParse + 4 >= fRegexEnd) then Error(reeLookbehindBad); GrpKind := gkLookbehind; Inc(regParse, 3); end; '!': begin if (regParse + 4 >= fRegexEnd) then Error(reeLookbehindBad); GrpKind := gkLookbehindNeg; Inc(regParse, 3); end; 'A'..'Z', 'a'..'z': begin // named group: (?regex) if (regParse + 4 >= fRegexEnd) then Error(reeNamedGroupBad); GrpKind := gkNormalGroup; FindGroupName(regParse + 2, fRegexEnd, '>', GrpName); Inc(regParse, Length(GrpName) + 3); end; else Error(reeIncorrectSpecialBrackets); end; end; '=', '!': begin // lookaheads: foo(?=bar) and foo(?!bar) if (regParse + 3 >= fRegexEnd) then Error(reeLookaheadBad); if NextCh = '=' then begin GrpKind := gkLookahead; end else begin GrpKind := gkLookaheadNeg; end; Inc(regParse, 2); end; '#': begin // (?#comment) FlagParse := FlagParse or FLAG_NOT_QUANTIFIABLE; GrpKind := gkComment; Inc(regParse, 2); end; 'a'..'z', '-': begin // modifiers string like (?mxr) GrpKind := gkModifierString; Inc(regParse); end; 'R', '0': begin // recursion (?R), (?0) GrpKind := gkRecursion; Inc(regParse, 2); if regParse^ <> ')' then Error(reeBadRecursion); Inc(regParse); end; '1'..'9': begin // subroutine call (?1)..(?99) GrpKind := gkSubCall; Inc(regParse, 1); if not ParseNumber(regParse, GrpIndex) or (regParse^ <> ')') then begin Error(reeBadRecursion); Exit; end; Inc(regParse, 1); if fSecondPass and (GrpIndex > GrpCount) then Error(reeBadSubCall); end; '''': begin // named group: (?'name'regex) if (regParse + 4 >= fRegexEnd) then Error(reeNamedGroupBad); GrpKind := gkNormalGroup; FindGroupName(regParse + 2, fRegexEnd, '''', GrpName); Inc(regParse, Length(GrpName) + 3); end; '&': begin // subroutine call to named group: (?&name) if (regParse + 2 >= fRegexEnd) then Error(reeBadSubCall); GrpKind := gkSubCall; FindGroupName(regParse + 2, fRegexEnd, ')', GrpName); Inc(regParse, Length(GrpName) + 3); if fSecondPass then begin GrpIndex := GrpNames.MatchIndexFromName(GrpName); if GrpIndex < 1 then Error(reeNamedGroupBadRef); end; end; else Error(reeIncorrectSpecialBrackets); end; end; // B: process found kind of brackets case GrpKind of gkNonCapturingGroup: begin ret := DoParseReg(True, nil, FlagTemp, OP_NONE, OP_NONE); if ret = nil then begin Result := nil; Exit; end; FlagParse := FlagParse or FlagTemp and (FLAG_HASWIDTH or FLAG_SPECSTART or FLAG_LOOP or FLAG_GREEDY); end; gkNormalGroup, gkAtomicGroup: begin // skip this block for one of passes, to not double groups count; // must take first pass (we need GrpNames filled) if (GrpKind = gkNormalGroup) then begin Inc(ParsedGrpCount); if (not fSecondPass) and (GrpName <> '') then begin // first pass if GrpNames.MatchIndexFromName(GrpName) >= 0 then Error(reeNamedGroupDupName); GrpNames.Add(GrpName, ParsedGrpCount); end; end; if GrpKind = gkAtomicGroup then ret := DoParseReg(True, @regNumAtomicBrackets, FlagTemp, OP_OPEN_ATOMIC, OP_CLOSE_ATOMIC) else ret := DoParseReg(True, @regNumBrackets, FlagTemp, OP_OPEN, OP_CLOSE); if ret = nil then begin Result := nil; Exit; end; FlagParse := FlagParse or FlagTemp and (FLAG_HASWIDTH or FLAG_SPECSTART or FLAG_LOOP or FLAG_GREEDY); end; gkLookahead, gkLookaheadNeg: begin case GrpKind of gkLookahead: ret := EmitNode(OP_LOOKAHEAD); gkLookaheadNeg: ret := EmitNode(OP_LOOKAHEAD_NEG); end; Result := DoParseReg(True, nil, FlagTemp, OP_NONE, OP_LOOKAHEAD_END); if Result = nil then Exit; Tail(ret, regLast(Result)); FlagParse := FlagParse and not FLAG_HASWIDTH; end; gkLookbehind, gkLookbehindNeg: begin case GrpKind of gkLookbehind: ret := EmitNode(OP_LOOKBEHIND); gkLookbehindNeg: ret := EmitNode(OP_LOOKBEHIND_NEG); end; regLookBehindOption := regCode; if (regCode <> @regDummy) then Inc(regCode, ReOpLookBehindOptionsSz) else Inc(regCodeSize, ReOpLookBehindOptionsSz); RegGrpCountBefore := ParsedGrpCount; Result := DoParseReg(True, nil, FlagTemp, OP_NONE, OP_LOOKBEHIND_END); if Result = nil then Exit; Tail(ret, regLast(Result)); if (regCode <> @regDummy) then begin ALen := 0; ret2 := Result; if IsPartFixedLength(ret2, op, ALen, AMaxLen, OP_LOOKBEHIND_END, regCode, [flfSkipLookAround]) then PReOpLookBehindOptions(regLookBehindOption)^.IsGreedy := OPT_LOOKBEHIND_FIXED else if (ParsedGrpCount > RegGrpCountBefore) and (not FAllowUnsafeLookBehind) then Error(reeLookaroundNotSafe) else if (FlagTemp and (FLAG_GREEDY)) = (FLAG_GREEDY) then PReOpLookBehindOptions(regLookBehindOption)^.IsGreedy := OPT_LOOKBEHIND_GREEDY else PReOpLookBehindOptions(regLookBehindOption)^.IsGreedy := OPT_LOOKBEHIND_NON_GREEDY; PReOpLookBehindOptions(regLookBehindOption)^.MatchLenMin := ALen; PReOpLookBehindOptions(regLookBehindOption)^.MatchLenMax := AMaxLen; end; FlagParse := FlagParse and not FLAG_HASWIDTH; end; gkNamedGroupReference: begin Len := GrpNames.MatchIndexFromName(GrpName); if fSecondPass and (Len < 0) then Error(reeNamedGroupBadRef); ret := EmitGroupRef(Len, fCompModifiers.I); end; gkModifierString: begin SavedPtr := regParse; while (regParse < fRegexEnd) and (regParse^ <> ')') and (regParse^ <> ':') do Inc(regParse); SavedModifiers := fCompModifiers; if (regParse^ = ':') and ParseModifiers(SavedPtr, regParse - SavedPtr, fCompModifiers) then begin Inc(regParse); // skip ')' ret := DoParseReg(True, nil, FlagTemp, OP_NONE, OP_COMMENT); // can't use OP_NONE // The "ender" op will not be omitted anyway fCompModifiers := SavedModifiers; if ret = nil then begin Result := nil; Exit; end; FlagParse := FlagParse or FlagTemp and (FLAG_HASWIDTH or FLAG_SPECSTART or FLAG_LOOP or FLAG_GREEDY); end else if (regParse^ = ')') and ParseModifiers(SavedPtr, regParse - SavedPtr, fCompModifiers) then begin Inc(regParse); // skip ')' ret := EmitNode(OP_COMMENT); // comment FlagParse := FlagParse or FLAG_NOT_QUANTIFIABLE; end else begin Error(reeUnrecognizedModifier); Exit; end; end; gkComment: begin while (regParse < fRegexEnd) and (regParse^ <> ')') do Inc(regParse); if regParse^ <> ')' then begin Error(reeUnclosedComment); Exit; end; Inc(regParse); // skip ')' ret := EmitNode(OP_COMMENT); // comment end; gkRecursion: begin // set FLAG_HASWIDTH to allow compiling of such regex: b(?:m|(?R))*e FlagParse := FlagParse or FLAG_HASWIDTH; ret := EmitNode(OP_RECUR); hasRecursion := True; end; gkSubCall: begin // set FLAG_HASWIDTH like for (?R) FlagParse := FlagParse or FLAG_HASWIDTH; ret := EmitNodeWithGroupIndex(OP_SUBCALL, GrpIndex); hasRecursion := True; end; end; // case GrpKind of end; '|', ')': begin // Supposed to be caught earlier. Error(reeInternalUrp); Exit; end; '?', '+', '*': begin Error(reeQuantifFollowsNothing); Exit; end; EscChar: begin if regParse >= fRegexEnd then begin Error(reeTrailingBackSlash); Exit; end; case regParse^ of 'b': begin FlagParse := FlagParse or FLAG_NOT_QUANTIFIABLE; ret := EmitNode(OP_BOUND); end; 'B': begin FlagParse := FlagParse or FLAG_NOT_QUANTIFIABLE; ret := EmitNode(OP_NOTBOUND); end; 'A': begin FlagParse := FlagParse or FLAG_NOT_QUANTIFIABLE; ret := EmitNode(OP_BOL); end; 'z': begin FlagParse := FlagParse or FLAG_NOT_QUANTIFIABLE; ret := EmitNode(OP_EOL); end; 'Z': begin FlagParse := FlagParse or FLAG_NOT_QUANTIFIABLE; ret := EmitNode(OP_EOL2); end; 'G': begin FlagParse := FlagParse or FLAG_NOT_QUANTIFIABLE; ret := EmitNode(OP_CONTINUE_POS); end; 'd': begin // r.e.extension - any digit ('0' .. '9') ret := EmitNode(OP_ANYDIGIT); FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE; end; 'D': begin // r.e.extension - not digit ('0' .. '9') ret := EmitNode(OP_NOTDIGIT); FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE; end; 's': begin // r.e.extension - any space char ret := EmitNode(OP_ANYSPACE); FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE; end; 'S': begin // r.e.extension - not space char ret := EmitNode(OP_NOTSPACE); FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE; end; 'w': begin // r.e.extension - any english char / digit / '_' ret := EmitNode(OP_ANYLETTER); FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE; end; 'W': begin // r.e.extension - not english char / digit / '_' ret := EmitNode(OP_NOTLETTER); FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE; end; 'v': begin ret := EmitNode(OP_ANYVERTSEP); FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE; end; 'V': begin ret := EmitNode(OP_NOTVERTSEP); FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE; end; 'h': begin ret := EmitNode(OP_ANYHORZSEP); FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE; end; 'H': begin ret := EmitNode(OP_NOTHORZSEP); FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE; end; '1' .. '9': begin if fSecondPass and (Ord(regParse^) - Ord('0') > GrpCount) then Error(reeBadReference); ret := EmitGroupRef(Ord(regParse^) - Ord('0'), fCompModifiers.I); end; 'g': begin case (regParse + 1)^ of '<', '''': begin // subroutine call to named group case (regParse + 1)^ of '<': FindGroupName(regParse + 2, fRegexEnd, '>', GrpName); '''': FindGroupName(regParse + 2, fRegexEnd, '''', GrpName); end; Inc(regParse, Length(GrpName) + 2); GrpIndex := GrpNames.MatchIndexFromName(GrpName); if fSecondPass and (GrpIndex < 1) then Error(reeNamedGroupBadRef); ret := EmitNodeWithGroupIndex(OP_SUBCALL, GrpIndex); FlagParse := FlagParse or FLAG_HASWIDTH; hasRecursion := True; end; '{': begin // back-reference to named group FindGroupName(regParse + 2, fRegexEnd, '}', GrpName); Inc(regParse, Length(GrpName) + 2); GrpIndex := GrpNames.MatchIndexFromName(GrpName); if fSecondPass and (GrpIndex < 1) then Error(reeNamedGroupBadRef); ret := EmitGroupRef(GrpIndex, fCompModifiers.I); end; '0'..'9': begin inc(regParse); if not ParseNumber(regParse, GrpIndex) then begin Error(reeBadReference); Exit; end; dec(regParse); if GrpIndex = 0 then Error(reeBadReference); if fSecondPass and (GrpIndex > GrpCount) then Error(reeBadReference); ret := EmitGroupRef(GrpIndex, fCompModifiers.I); end; else Error(reeBadReference); end; end; 'k': begin // back-reference to named group case (regParse + 1)^ of '<': FindGroupName(regParse + 2, fRegexEnd, '>', GrpName); '''': FindGroupName(regParse + 2, fRegexEnd, '''', GrpName); '{': FindGroupName(regParse + 2, fRegexEnd, '}', GrpName); else Error(reeBadReference); end; Inc(regParse, Length(GrpName) + 2); GrpIndex := GrpNames.MatchIndexFromName(GrpName); if fSecondPass and (GrpIndex < 1) then Error(reeNamedGroupBadRef); ret := EmitGroupRef(GrpIndex, fCompModifiers.I); end; 'K': begin ret := EmitNode(OP_RESET_MATCHPOS); FlagParse := FlagParse or FLAG_NOT_QUANTIFIABLE; end; {$IFDEF FastUnicodeData} 'p': begin ret := EmitCategoryMain(True); FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE; end; 'P': begin ret := EmitCategoryMain(False); FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE; end; {$ENDIF} 'R': begin ret := EmitNode(OP_ANYLINEBREAK); FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE; end; else EmitExactly(UnQuoteChar(regParse, fRegexEnd)); end; { of case } Inc(regParse); end; else begin Dec(regParse); if fCompModifiers.X and // check for eXtended syntax ((regParse^ = '#') or IsIgnoredChar(regParse^)) then begin // \x if regParse^ = '#' then begin // Skip eXtended comment // find comment terminator (group of \n and/or \r) while (regParse < fRegexEnd) and (regParse^ <> #$d) and (regParse^ <> #$a) do Inc(regParse); while (regParse^ = #$d) or (regParse^ = #$a) // skip comment terminator do Inc(regParse); // attempt to support different type of line separators end else begin // Skip the blanks! while IsIgnoredChar(regParse^) do Inc(regParse); end; ret := EmitNode(OP_COMMENT); // comment end else begin Len := FindSkippedMetaLen(regParse, fRegexEnd); if Len <= 0 then if regParse^ <> '{' then begin Error(reeRarseAtomInternalDisaster); Exit; end else Len := FindSkippedMetaLen(regParse + 1, fRegexEnd) + 1; // bad {n,m} - compile as EXACTLY EnderChar := (regParse + Len)^; if (Len > 1) and ((EnderChar = '*') or (EnderChar = '+') or (EnderChar = '?') or (EnderChar = '{')) then Dec(Len); // back off clear of ?+*{ operand. FlagParse := FlagParse or FLAG_HASWIDTH; if Len = 1 then FlagParse := FlagParse or FLAG_SIMPLE; if fCompModifiers.I then ret := EmitNode(OP_EXACTLY_CI) else ret := EmitNode(OP_EXACTLY); EmitInt(0); while (Len > 0) and ((not fCompModifiers.X) or (regParse^ <> '#')) do begin if not fCompModifiers.X or not IsIgnoredChar(regParse^) then begin if fCompModifiers.I then EmitC(_UpperCase(regParse^)) else EmitC(regParse^); if regCode <> @regDummy then Inc(regExactlyLen^); end; Inc(regParse); Dec(Len); end; end; { of if not comment } end; { of case else } end; { of case } Result := ret; end; { of function TRegExpr.ParseAtom -------------------------------------------------------------- } function TRegExpr.GetCompilerErrorPos: PtrInt; begin Result := 0; if (fRegexStart = nil) or (regParse = nil) then Exit; // not in compiling mode ? Result := regParse - fRegexStart; end; { of function TRegExpr.GetCompilerErrorPos -------------------------------------------------------------- } { ============================================================= } { ===================== Matching section ====================== } { ============================================================= } procedure TRegExpr.FindGroupName(APtr, AEndPtr: PRegExprChar; AEndChar: REChar; var AName: RegExprString); // check that group name is valid identifier, started from non-digit // this is to be like in Python regex var P: PRegExprChar; begin P := APtr; if IsDigitChar(P^) or not IsWordChar(P^) then Error(reeNamedGroupBadName); repeat if P >= AEndPtr then Error(reeNamedGroupBad); if P^ = AEndChar then Break; if not (IsWordChar(P^) or (P^ = '_')) then Error(reeNamedGroupBadName); Inc(P); until False; SetString(AName, APtr, P-APtr); end; function TRegExpr.FindRepeated(p: PRegExprChar; AMax: Integer): Integer; // repeatedly match something simple, report how many // p: points to current opcode var scan: PRegExprChar; opnd: PRegExprChar; TheMax: PtrInt; // PtrInt, gets diff of 2 pointers InvChar: REChar; {$IFDEF UnicodeEx} i: Integer; {$ENDIF} begin Result := 0; scan := regInput; // points into InputString opnd := p + REOpSz + RENextOffSz; // points to operand of opcode (after OP_nnn code) TheMax := fInputCurrentEnd - scan; if TheMax > AMax then TheMax := AMax; case PREOp(p)^ of OP_ANY: begin // note - OP_ANY_ML cannot be proceeded in FindRepeated because can skip // more than one char at once {$IFDEF UnicodeEx} for i := 1 to TheMax do IncUnicode2(scan, Result); {$ELSE} Result := TheMax; Inc(scan, Result); {$ENDIF} end; OP_EXACTLY: begin // in opnd can be only ONE char !!! { // Alexey: commented because of https://github.com/andgineer/TRegExpr/issues/145 NLen := PLongInt(opnd)^; if TheMax > NLen then TheMax := NLen; } Inc(opnd, RENumberSz); while (Result < TheMax) and (opnd^ = scan^) do begin Inc(Result); Inc(scan); end; end; OP_EXACTLY_CI: begin // in opnd can be only ONE char !!! { // Alexey: commented because of https://github.com/andgineer/TRegExpr/issues/145 NLen := PLongInt(opnd)^; if TheMax > NLen then TheMax := NLen; } Inc(opnd, RENumberSz); while (Result < TheMax) and (opnd^ = scan^) do begin // prevent unneeded InvertCase Inc(Result); Inc(scan); end; if Result < TheMax then begin InvChar := _LowerCase(opnd^); // store in register while (Result < TheMax) and ((opnd^ = scan^) or (InvChar = scan^)) do begin Inc(Result); Inc(scan); end; end; end; OP_ANYDIGIT: while (Result < TheMax) and IsDigitChar(scan^) do begin Inc(Result); Inc(scan); end; OP_NOTDIGIT: {$IFDEF UNICODEEX} begin i := 0; while (i < TheMax) and not IsDigitChar(scan^) do begin Inc(i); IncUnicode2(scan, Result); end; end; {$ELSE} while (Result < TheMax) and not IsDigitChar(scan^) do begin Inc(Result); Inc(scan); end; {$ENDIF} OP_ANYLETTER: while (Result < TheMax) and IsWordChar(scan^) do begin Inc(Result); Inc(scan); end; OP_NOTLETTER: {$IFDEF UNICODEEX} begin i := 0; while (i < TheMax) and not IsWordChar(scan^) do begin Inc(i); IncUnicode2(scan, Result); end; end; {$ELSE} while (Result < TheMax) and not IsWordChar(scan^) do begin Inc(Result); Inc(scan); end; {$ENDIF} OP_ANYSPACE: while (Result < TheMax) and IsSpaceChar(scan^) do begin Inc(Result); Inc(scan); end; OP_NOTSPACE: {$IFDEF UNICODEEX} begin i := 0; while (i < TheMax) and not IsSpaceChar(scan^) do begin Inc(i); IncUnicode2(scan, Result); end; end; {$ELSE} while (Result < TheMax) and not IsSpaceChar(scan^) do begin Inc(Result); Inc(scan); end; {$ENDIF} OP_ANYVERTSEP: while (Result < TheMax) and IsVertLineSeparator(scan^) do begin Inc(Result); Inc(scan); end; OP_NOTVERTSEP: {$IFDEF UNICODEEX} begin i := 0; while (i < TheMax) and not IsVertLineSeparator(scan^) do begin Inc(i); IncUnicode2(scan, Result); end; end; {$ELSE} while (Result < TheMax) and not IsVertLineSeparator(scan^) do begin Inc(Result); Inc(scan); end; {$ENDIF} OP_ANYHORZSEP: while (Result < TheMax) and IsHorzSeparator(scan^) do begin Inc(Result); Inc(scan); end; OP_NOTHORZSEP: {$IFDEF UNICODEEX} begin i := 0; while (i < TheMax) and not IsHorzSeparator(scan^) do begin Inc(i); IncUnicode2(scan, Result); end; end; {$ELSE} while (Result < TheMax) and not IsHorzSeparator(scan^) do begin Inc(Result); Inc(scan); end; {$ENDIF} OP_ANYOF: {$IFDEF UNICODEEX} begin i := 0; while (i < TheMax) and FindInCharClass(opnd, scan^) do begin Inc(i); IncUnicode2(scan, Result); end; end; {$ELSE} while (Result < TheMax) and FindInCharClass(opnd, scan^) do begin Inc(Result); Inc(scan); end; {$ENDIF} OP_ANYBUT: {$IFDEF UNICODEEX} begin i := 0; while (i < TheMax) and not FindInCharClass(opnd, scan^) do begin Inc(i); IncUnicode2(scan, Result); end; end; {$ELSE} while (Result < TheMax) and not FindInCharClass(opnd, scan^) do begin Inc(Result); Inc(scan); end; {$ENDIF} OP_ANYOF_CI: {$IFDEF UNICODEEX} begin i := 0; while (i < TheMax) and FindInCharClass(opnd, _UpperCase(scan^)) do begin Inc(i); IncUnicode2(scan, Result); end; end; {$ELSE} while (Result < TheMax) and FindInCharClass(opnd, _UpperCase(scan^)) do begin Inc(Result); Inc(scan); end; {$ENDIF} OP_ANYBUT_CI: {$IFDEF UNICODEEX} begin i := 0; while (i < TheMax) and not FindInCharClass(opnd, _UpperCase(scan^)) do begin Inc(i); IncUnicode2(scan, Result); end; end; {$ELSE} while (Result < TheMax) and not FindInCharClass(opnd, _UpperCase(scan^)) do begin Inc(Result); Inc(scan); end; {$ENDIF} {$IFDEF FastUnicodeData} OP_ANYCATEGORY: {$IFDEF UNICODEEX} begin i := 0; while (i < TheMax) and MatchOneCharCategory(opnd, scan) do begin Inc(i); IncUnicode2(scan, Result); end; end; {$ELSE} while (Result < TheMax) and MatchOneCharCategory(opnd, scan) do begin Inc(Result); Inc(scan); end; {$ENDIF} OP_NOTCATEGORY: {$IFDEF UNICODEEX} begin i := 0; while (i < TheMax) and not MatchOneCharCategory(opnd, scan) do begin Inc(i); IncUnicode2(scan, Result); end; end; {$ELSE} while (Result < TheMax) and not MatchOneCharCategory(opnd, scan) do begin Inc(Result); Inc(scan); end; {$ENDIF} {$ENDIF} OP_ANYLINEBREAK: while (Result < TheMax) and IsAnyLineBreak(scan^) do begin Inc(Result); Inc(scan); end; else Result := 0; Error(reeRegRepeatCalledInappropriately); Exit; end; { of case } regInput := scan; end; { of function TRegExpr.FindRepeated -------------------------------------------------------------- } function TRegExpr.regNext(p: PRegExprChar): PRegExprChar; // dig the "next" pointer out of a node var offset: TRENextOff; begin if p = @regDummy then begin Result := nil; Exit; end; offset := PRENextOff(AlignToPtr(p + REOpSz))^; if offset = 0 then Result := nil else Result := p + offset; end; function TRegExpr.regNextQuick(p: PRegExprChar): PRegExprChar; {$IFDEF FPC}inline;{$ENDIF} {$IFDEF WITH_REGEX_ASSERT} var offset: TRENextOff; {$ENDIF} begin // The inlined version is never called in the first pass. Assert(fSecondPass); // fSecondPass will also be true in MatchPrim. {$IFDEF WITH_REGEX_ASSERT} offset := PRENextOff(AlignToPtr(p + REOpSz))^; if offset = 0 then Result := nil else begin Result := p + offset; assert((Result >= programm) and (Result < programm + regCodeSize * SizeOf(REChar))); end; {$ELSE} Result := p + PRENextOff(AlignToPtr(p + REOpSz))^; {$ENDIF} end; function TRegExpr.regLast(p: PRegExprChar): PRegExprChar; var temp: PRegExprChar; begin Result := p; if p = @regDummy then Exit; // Find last node. repeat temp := regNext(Result); if temp = nil then Break; Result := temp; until False; end; type TRegExprMatchPrimLocals = record case TREOp of {$IFDEF ComplexBraces} OP_LOOPENTRY: ( LoopInfo: TOpLoopInfo; ); OP_LOOP: ( // and OP_LOOP_NG LoopInfoListPtr: POpLoopInfo; ); {$ENDIF} OP_LOOKAHEAD, OP_LOOKBEHIND: ( IsGreedy: REChar; LookAroundInfo: TRegExprLookAroundInfo; InpStart: PRegExprChar; // only OP_LOOKBEHIND ); OP_LOOKAHEAD_END, OP_LOOKBEHIND_END: ( LookAroundInfoPtr: PRegExprLookAroundInfo; ); OP_SUBCALL: ( savedCurrentSubCalled: Integer; ); OP_STAR: ( nextch: REChar; ); end; function TRegExpr.MatchPrim(prog: PRegExprChar): Boolean; // recursively matching routine // Conceptually the strategy is simple: check to see whether the current // node matches, call self recursively to see whether the rest matches, // and then act accordingly. In practice we make some effort to avoid // recursion, in particular by going through "ordinary" nodes (that don't // need to know whether the rest of the match failed) by a loop instead of // by recursion. var scan: PRegExprChar; next: PRegExprChar; // next node opnd, save: PRegExprChar; no: Integer; LoopCnt: Integer; Local: TRegExprMatchPrimLocals; begin Result := False; {$IFDEF RegExpWithStackOverflowCheck_DecStack_Frame} if get_frame < StackLimit then begin error(reeLoopStackExceeded); exit; end; {$ENDIF} { // Alexey: not sure it's ok for long searches in big texts, so disabled if regNestedCalls > MaxRegexBackTracking then Exit; Inc(regNestedCalls); } scan := prog; while True do begin Assert(scan <> nil); next := regNextQuick(scan); case scan^ of OP_BOUND: begin if ( (regInput = fInputStart) or not IsWordChar((regInput - 1)^) ) = ( (regInput >= fInputEnd) or not IsWordChar(regInput^) ) then Exit; end; OP_NOTBOUND: begin if ( (regInput = fInputStart) or not IsWordChar((regInput - 1)^) ) <> ( (regInput >= fInputEnd) or not IsWordChar(regInput^) ) then Exit; end; OP_BOL: begin if regInput <> fInputStart then Exit; end; OP_CONTINUE_POS: begin if regInput <> fInputContinue then Exit; end; OP_RESET_MATCHPOS: begin save := GrpBounds[0].GrpStart[0]; GrpBounds[0].GrpStart[0] := regInput; Result := MatchPrim(next); if not Result then GrpBounds[0].GrpStart[0] := save; exit; end; OP_EOL: begin // \z matches at the very end if regInput < fInputEnd then Exit; end; OP_EOL2: begin // \Z matches at the very and + before the final line-break (LF and CR LF) if regInput < fInputEnd then begin if (regInput = fInputEnd - 1) and (regInput^ = #10) then begin end else if (regInput = fInputEnd - 2) and (regInput^ = #13) and ((regInput + 1) ^ = #10) then begin end else Exit; end; end; OP_BOL_ML: if regInput > fInputStart then begin if ((regInput - 1) <= fInputStart) or not IsPairedBreak(regInput - 2) then begin // don't stop between paired separator if IsPairedBreak(regInput - 1) then Exit; if not IsCustomLineSeparator((regInput - 1)^) then Exit; end; end; OP_EOL_ML: if regInput < fInputEnd then begin if not IsPairedBreak(regInput) then begin // don't stop between paired separator if (regInput > fInputStart) and IsPairedBreak(regInput - 1) then Exit; if not IsCustomLineSeparator(regInput^) then Exit; end; end; OP_ANY: begin if regInput >= fInputCurrentEnd then Exit; {$IFDEF UNICODEEX} IncUnicode(regInput); {$ELSE} Inc(regInput); {$ENDIF} end; OP_ANY_ML: begin if (regInput >= fInputCurrentEnd) or IsPairedBreak(regInput) or IsCustomLineSeparator(regInput^) then Exit; {$IFDEF UNICODEEX} IncUnicode(regInput); {$ELSE} Inc(regInput); {$ENDIF} end; OP_ANYDIGIT: begin if (regInput >= fInputCurrentEnd) or not IsDigitChar(regInput^) then Exit; Inc(regInput); end; OP_NOTDIGIT: begin if (regInput >= fInputCurrentEnd) or IsDigitChar(regInput^) then Exit; {$IFDEF UNICODEEX} IncUnicode(regInput); {$ELSE} Inc(regInput); {$ENDIF} end; OP_ANYLETTER: begin if (regInput >= fInputCurrentEnd) or not IsWordChar(regInput^) then Exit; Inc(regInput); end; OP_NOTLETTER: begin if (regInput >= fInputCurrentEnd) or IsWordChar(regInput^) then Exit; {$IFDEF UNICODEEX} IncUnicode(regInput); {$ELSE} Inc(regInput); {$ENDIF} end; OP_ANYSPACE: begin if (regInput >= fInputCurrentEnd) or not IsSpaceChar(regInput^) then Exit; Inc(regInput); end; OP_NOTSPACE: begin if (regInput >= fInputCurrentEnd) or IsSpaceChar(regInput^) then Exit; {$IFDEF UNICODEEX} IncUnicode(regInput); {$ELSE} Inc(regInput); {$ENDIF} end; OP_ANYVERTSEP: begin if (regInput >= fInputCurrentEnd) or not IsVertLineSeparator(regInput^) then Exit; Inc(regInput); end; OP_NOTVERTSEP: begin if (regInput >= fInputCurrentEnd) or IsVertLineSeparator(regInput^) then Exit; {$IFDEF UNICODEEX} IncUnicode(regInput); {$ELSE} Inc(regInput); {$ENDIF} end; OP_ANYHORZSEP: begin if (regInput >= fInputCurrentEnd) or not IsHorzSeparator(regInput^) then Exit; Inc(regInput); end; OP_NOTHORZSEP: begin if (regInput >= fInputCurrentEnd) or IsHorzSeparator(regInput^) then Exit; {$IFDEF UNICODEEX} IncUnicode(regInput); {$ELSE} Inc(regInput); {$ENDIF} end; OP_EXACTLY_CI: begin opnd := scan + REOpSz + RENextOffSz; // OPERAND no := PLongInt(opnd)^; if (regInput + no > fInputCurrentEnd) then Exit; Inc(opnd, RENumberSz); // Inline the first character, for speed. if (opnd^ <> regInput^) and (_LowerCase(opnd^) <> regInput^) then Exit; save := regInput; Inc(regInput, no); while no > 1 do begin Inc(save); Inc(opnd); if (opnd^ <> save^) and (_LowerCase(opnd^) <> save^) then Exit; Dec(no); end; end; OP_EXACTLY: begin opnd := scan + REOpSz + RENextOffSz; // OPERAND no := PLongInt(opnd)^; if (regInput + no > fInputCurrentEnd) then Exit; Inc(opnd, RENumberSz); // Inline the first character, for speed. if opnd^ <> regInput^ then Exit; save := regInput; Inc(regInput, no); while no > 1 do begin Inc(save); Inc(opnd); if opnd^ <> save^ then Exit; Dec(no); end; end; OP_BSUBEXP: begin no := PReGroupIndex((scan + REOpSz + RENextOffSz))^; if no < 0 then Exit; opnd := CurrentGrpBounds.GrpStart[no]; if opnd = nil then Exit; save := CurrentGrpBounds.GrpEnd[no]; if save = nil then Exit; no := save - opnd; save := regInput; if save + no - 1 >= fInputCurrentEnd then Exit; while no > 0 do begin if (save^ <> opnd^) then Exit; Inc(save); Inc(opnd); Dec(no); end; regInput := save; end; OP_BSUBEXP_CI: begin no := PReGroupIndex((scan + REOpSz + RENextOffSz))^; if no < 0 then Exit; opnd := CurrentGrpBounds.GrpStart[no]; if opnd = nil then Exit; save := CurrentGrpBounds.GrpEnd[no]; if save = nil then Exit; no := save - opnd; save := regInput; if save + no - 1 >= fInputCurrentEnd then Exit; while no > 0 do begin if ((save^ <> opnd^) and (save^ <> InvertCase(opnd^))) then Exit; Inc(save); Inc(opnd); Dec(no); end; regInput := save; end; OP_ANYOF: begin if (regInput >= fInputCurrentEnd) or not FindInCharClass(scan + REOpSz + RENextOffSz, regInput^) then Exit; {$IFDEF UNICODEEX} IncUnicode(regInput); {$ELSE} Inc(regInput); {$ENDIF} end; OP_ANYBUT: begin if (regInput >= fInputCurrentEnd) or FindInCharClass(scan + REOpSz + RENextOffSz, regInput^) then Exit; {$IFDEF UNICODEEX} IncUnicode(regInput); {$ELSE} Inc(regInput); {$ENDIF} end; OP_ANYOF_CI: begin if (regInput >= fInputCurrentEnd) or not FindInCharClass(scan + REOpSz + RENextOffSz, _UpperCase(regInput^)) then Exit; {$IFDEF UNICODEEX} IncUnicode(regInput); {$ELSE} Inc(regInput); {$ENDIF} end; OP_ANYBUT_CI: begin if (regInput >= fInputCurrentEnd) or FindInCharClass(scan + REOpSz + RENextOffSz, _UpperCase(regInput^)) then Exit; {$IFDEF UNICODEEX} IncUnicode(regInput); {$ELSE} Inc(regInput); {$ENDIF} end; OP_NOTHING: ; OP_COMMENT: ; OP_BACK: ; OP_OPEN: begin no := PReGroupIndex((scan + REOpSz + RENextOffSz))^; save := CurrentGrpBounds.TmpStart[no]; CurrentGrpBounds.TmpStart[no] := regInput; Result := MatchPrim(next); CurrentGrpBounds.TmpStart[no] := save; exit; end; OP_OPEN_ATOMIC: begin no := PReGroupIndex((scan + REOpSz + RENextOffSz))^; Result := MatchPrim(next); if GrpBacktrackingAsAtom[no] then IsBacktrackingGroupAsAtom := False; GrpBacktrackingAsAtom[no] := False; Exit; end; OP_CLOSE: begin no := PReGroupIndex((scan + REOpSz + RENextOffSz))^; save := CurrentGrpBounds.GrpStart[no]; opnd := CurrentGrpBounds.GrpEnd[no]; // save2 CurrentGrpBounds.GrpStart[no] := CurrentGrpBounds.TmpStart[no]; CurrentGrpBounds.GrpEnd[no] := regInput; // if we are in OP_SUBCALL* call, it called OP_OPEN*, so we must return // in OP_CLOSE, without going to next opcode if CurrentSubCalled = no then begin Result := True; Exit; end; Result := MatchPrim(next); if not Result then begin CurrentGrpBounds.GrpStart[no] := save; CurrentGrpBounds.GrpEnd[no] := opnd; end; Exit; end; OP_CLOSE_ATOMIC: begin no := PReGroupIndex((scan + REOpSz + RENextOffSz))^; // handle atomic group, mark it as "done" // (we are here because some OP_BRANCH is matched) Result := MatchPrim(next); if not Result then begin if not IsBacktrackingGroupAsAtom then begin GrpBacktrackingAsAtom[no] := True; IsBacktrackingGroupAsAtom := True; end; end; Exit; end; OP_LOOKAHEAD, OP_LOOKAHEAD_NEG: begin Local.LookAroundInfo.InputPos := regInput; Local.LookAroundInfo.IsNegative := (scan^ = OP_LOOKAHEAD_NEG); Local.LookAroundInfo.HasMatchedToEnd := False; Local.LookAroundInfo.IsBackTracking := False; Local.LookAroundInfo.OuterInfo := LookAroundInfoList; Local.LookAroundInfo.savedInputCurrentEnd := fInputCurrentEnd; LookAroundInfoList := @Local.LookAroundInfo; fInputCurrentEnd := fInputEnd; scan := PRegExprChar(AlignToPtr(scan + 1)) + RENextOffSz; Result := MatchPrim(scan); if Local.LookAroundInfo.IsBackTracking then IsBacktrackingGroupAsAtom := False; LookAroundInfoList := Local.LookAroundInfo.OuterInfo; fInputCurrentEnd := Local.LookAroundInfo.savedInputCurrentEnd; if Local.LookAroundInfo.IsNegative then begin Result := (not Local.LookAroundInfo.HasMatchedToEnd); if Result then begin next := regNextQuick(next); // Next-Pointer of OP_LOOKAHEAD_END regInput := Local.LookAroundInfo.InputPos; Result := False; scan := next; continue; end; end; Exit; end; OP_LOOKBEHIND, OP_LOOKBEHIND_NEG: begin Local.LookAroundInfo.InputPos := regInput; Local.LookAroundInfo.IsNegative := (scan^ = OP_LOOKBEHIND_NEG); Local.LookAroundInfo.HasMatchedToEnd := False; Local.LookAroundInfo.IsBackTracking := False; Local.LookAroundInfo.OuterInfo := LookAroundInfoList; Local.LookAroundInfo.savedInputCurrentEnd := fInputCurrentEnd; LookAroundInfoList := @Local.LookAroundInfo; scan := PRegExprChar(AlignToPtr(scan + 1)) + RENextOffSz; Local.IsGreedy := PReOpLookBehindOptions(scan)^.IsGreedy; fInputCurrentEnd := regInput; Result := regInput - fInputStart >= PReOpLookBehindOptions(scan)^.MatchLenMin; if Result then begin if Local.IsGreedy = OPT_LOOKBEHIND_FIXED then begin regInput := regInput - PReOpLookBehindOptions(scan)^.MatchLenMin; inc(scan, ReOpLookBehindOptionsSz); Result := MatchPrim(scan) end else if Local.IsGreedy = OPT_LOOKBEHIND_NON_GREEDY then begin Local.InpStart := regInput - PReOpLookBehindOptions(scan)^.MatchLenMin; if regInput - fInputStart >= PReOpLookBehindOptions(scan)^.MatchLenMax then save := regInput - PReOpLookBehindOptions(scan)^.MatchLenMax else save := fInputStart; inc(scan, ReOpLookBehindOptionsSz); repeat regInput := Local.InpStart; dec(Local.InpStart); Result := MatchPrim(scan); until Local.LookAroundInfo.HasMatchedToEnd or (Local.InpStart < save); end else begin if regInput - fInputStart >= PReOpLookBehindOptions(scan)^.MatchLenMax then Local.InpStart := regInput - PReOpLookBehindOptions(scan)^.MatchLenMax else Local.InpStart := fInputStart; save := Local.LookAroundInfo.InputPos - PReOpLookBehindOptions(scan)^.MatchLenMin; inc(scan, ReOpLookBehindOptionsSz); repeat regInput := Local.InpStart; inc(Local.InpStart); Result := MatchPrim(scan); until Local.LookAroundInfo.HasMatchedToEnd or (Local.InpStart > save); end; end; if Local.LookAroundInfo.IsBackTracking then IsBacktrackingGroupAsAtom := False; LookAroundInfoList := Local.LookAroundInfo.OuterInfo; fInputCurrentEnd := Local.LookAroundInfo.savedInputCurrentEnd; if Local.LookAroundInfo.IsNegative then begin Result := not Local.LookAroundInfo.HasMatchedToEnd; if Result then begin next := regNextQuick(next); // Next-Pointer of OP_LOOKAHEAD_END regInput := Local.LookAroundInfo.InputPos; Result := False; scan := next; continue; end; end; Exit; end; OP_LOOKAHEAD_END: begin if LookAroundInfoList = nil then Exit; Local.LookAroundInfoPtr := LookAroundInfoList; Local.LookAroundInfoPtr.HasMatchedToEnd := True; if not Local.LookAroundInfoPtr^.IsNegative then begin fInputCurrentEnd := Local.LookAroundInfoPtr^.savedInputCurrentEnd; regInput := Local.LookAroundInfoPtr^.InputPos; LookAroundInfoList := Local.LookAroundInfoPtr^.OuterInfo; Result := MatchPrim(next); LookAroundInfoList := Local.LookAroundInfoPtr; end; if (not Result) and not IsBacktrackingGroupAsAtom then begin IsBacktrackingGroupAsAtom := True; Local.LookAroundInfoPtr.IsBackTracking := True; end; Exit; end; OP_LOOKBEHIND_END: begin if LookAroundInfoList = nil then Exit; Local.LookAroundInfoPtr := LookAroundInfoList; if not (Local.LookAroundInfoPtr^.InputPos = regInput) then Exit; Local.LookAroundInfoPtr.HasMatchedToEnd := True; if not Local.LookAroundInfoPtr^.IsNegative then begin regInput := Local.LookAroundInfoPtr^.InputPos; fInputCurrentEnd := Local.LookAroundInfoPtr^.savedInputCurrentEnd; LookAroundInfoList := Local.LookAroundInfoPtr^.OuterInfo; Result := MatchPrim(next); LookAroundInfoList := Local.LookAroundInfoPtr; end; if (not Result) and not IsBacktrackingGroupAsAtom then begin IsBacktrackingGroupAsAtom := True; Local.LookAroundInfoPtr.IsBackTracking := True; end; Exit; end; OP_BRANCH: begin repeat save := regInput; Result := MatchPrim(scan + REOpSz + RENextOffSz + REBranchArgSz); if Result then Exit; // if branch worked until OP_CLOSE, and marked atomic group as "done", then exit regInput := save; if IsBacktrackingGroupAsAtom then Exit; scan := next; Assert(scan <> nil); next := regNextQuick(scan); if (next^ <> OP_BRANCH) then break; until False; next := scan + REOpSz + RENextOffSz + REBranchArgSz; // Avoid recursion end; OP_GBRANCH, OP_GBRANCH_EX, OP_GBRANCH_EX_CI: begin Assert((next^ = OP_BRANCH) or (next^ = OP_GBRANCH) or (next^ = OP_GBRANCH_EX) or (next^ = OP_GBRANCH_EX_CI)); repeat save := regInput; case scan^ of OP_GBRANCH, OP_BRANCH: Result := MatchPrim(scan + REOpSz + RENextOffSz + REBranchArgSz); OP_GBRANCH_EX: if (regInput^ = (scan + REOpSz + RENextOffSz)^) then Result := MatchPrim(scan + REOpSz + RENextOffSz + REBranchArgSz); OP_GBRANCH_EX_CI: if (regInput^ = (scan + REOpSz + RENextOffSz)^) or (regInput^ = (scan + REOpSz + RENextOffSz + 1)^) then Result := MatchPrim(scan + REOpSz + RENextOffSz + REBranchArgSz); end; if Result then Exit; // if branch worked until OP_CLOSE, and marked atomic group as "done", then exit regInput := save; if IsBacktrackingGroupAsAtom then Exit; scan := next; Assert(scan <> nil); next := regNextQuick(scan); if (next^ <> OP_BRANCH) and (next^ <> OP_GBRANCH) and (next^ <> OP_GBRANCH_EX) and (next^ <> OP_GBRANCH_EX_CI) then break; until False; case scan^ of OP_GBRANCH_EX: if (regInput^ <> (scan + REOpSz + RENextOffSz)^) then exit; OP_GBRANCH_EX_CI: if (regInput^ <> (scan + REOpSz + RENextOffSz)^) and (regInput^ <> (scan + REOpSz + RENextOffSz + 1)^) then exit; end; next := scan + REOpSz + RENextOffSz + REBranchArgSz; // Avoid recursion end; {$IFDEF ComplexBraces} OP_LOOPENTRY: begin Local.LoopInfo.Count := 0; Local.LoopInfo.BackTrackingAsAtom := False; Local.LoopInfo.CurrentRegInput := nil; Local.LoopInfo.OuterLoop := CurrentLoopInfoListPtr; CurrentLoopInfoListPtr := @Local.LoopInfo; save := regInput; Result := MatchPrim(next); // execute loop CurrentLoopInfoListPtr := Local.LoopInfo.OuterLoop; if Local.LoopInfo.BackTrackingAsAtom then IsBacktrackingGroupAsAtom := False; if not Result then regInput := save; Exit; end; OP_LOOP, OP_LOOP_POSS: begin if CurrentLoopInfoListPtr = nil then begin Error(reeLoopWithoutEntry); Exit; end; opnd := AlignToPtr(scan + REOpSz + RENextOffSz); Local.LoopInfoListPtr := CurrentLoopInfoListPtr; if Local.LoopInfoListPtr^.Count >= PREBracesArg(opnd)^ then // Min-Count begin // Min alredy matched - we can work LoopCnt := PREBracesArg(opnd + REBracesArgSz)^; // Max-Count Result := (LoopCnt = MaxBracesArg) and // * or + (Local.LoopInfoListPtr^.CurrentRegInput = regInput); if Result then begin CurrentLoopInfoListPtr := Local.LoopInfoListPtr^.OuterLoop; Result := MatchPrim(next); CurrentLoopInfoListPtr := Local.LoopInfoListPtr; if (not Result) and (scan^ = OP_LOOP_POSS) then begin Local.LoopInfoListPtr^.BackTrackingAsAtom := True; IsBacktrackingGroupAsAtom := True; end; exit; end; // greedy way - first try to max deep of greed ;) if Local.LoopInfoListPtr^.Count < LoopCnt then begin save := regInput; Local.LoopInfoListPtr^.CurrentRegInput := save; Inc(Local.LoopInfoListPtr^.Count); Result := MatchPrim(scan + PRENextOff(opnd + 2 * REBracesArgSz)^); if Result or IsBacktrackingGroupAsAtom then Exit; Dec(Local.LoopInfoListPtr^.Count); regInput := save; end; CurrentLoopInfoListPtr := Local.LoopInfoListPtr^.OuterLoop; Result := MatchPrim(next); CurrentLoopInfoListPtr := Local.LoopInfoListPtr; if Result or IsBacktrackingGroupAsAtom then Exit; if (scan^ = OP_LOOP_POSS) then begin Local.LoopInfoListPtr^.BackTrackingAsAtom := True; IsBacktrackingGroupAsAtom := True; end; Exit; end else begin // first match a min_cnt times Inc(Local.LoopInfoListPtr^.Count); Local.LoopInfoListPtr^.CurrentRegInput := regInput; Result := MatchPrim(scan + PRENextOff(opnd + 2 * REBracesArgSz)^); if Result or IsBacktrackingGroupAsAtom then Exit; Dec(Local.LoopInfoListPtr^.Count); Exit; end; end; OP_LOOP_NG: begin if CurrentLoopInfoListPtr = nil then begin Error(reeLoopWithoutEntry); Exit; end; opnd := AlignToPtr(scan + REOpSz + RENextOffSz); Local.LoopInfoListPtr := CurrentLoopInfoListPtr; if Local.LoopInfoListPtr^.Count >= PREBracesArg(opnd)^ then // Min-Count begin // Min alredy matched - we can work LoopCnt := PREBracesArg(opnd + REBracesArgSz)^; // Max-Count Result := (LoopCnt = MaxBracesArg) and // * or + (Local.LoopInfoListPtr^.CurrentRegInput = regInput); if Result then begin CurrentLoopInfoListPtr := Local.LoopInfoListPtr^.OuterLoop; Result := MatchPrim(next); CurrentLoopInfoListPtr := Local.LoopInfoListPtr; exit; end; save := regInput; Local.LoopInfoListPtr^.CurrentRegInput := save; // non-greedy - try just now CurrentLoopInfoListPtr := Local.LoopInfoListPtr^.OuterLoop; Result := MatchPrim(next); CurrentLoopInfoListPtr := Local.LoopInfoListPtr; if Result or IsBacktrackingGroupAsAtom then Exit; if Local.LoopInfoListPtr^.Count < LoopCnt then begin regInput := save; // failed - move next and try again Inc(Local.LoopInfoListPtr^.Count); Result := MatchPrim(scan + PRENextOff(opnd + 2 * REBracesArgSz)^); if Result or IsBacktrackingGroupAsAtom then Exit; Dec(Local.LoopInfoListPtr^.Count); end; Exit; end else begin // first match a min_cnt times Inc(Local.LoopInfoListPtr^.Count); Local.LoopInfoListPtr^.CurrentRegInput := regInput; Result := MatchPrim(scan + PRENextOff(opnd + 2 * REBracesArgSz)^); if Result or IsBacktrackingGroupAsAtom then Exit; Dec(Local.LoopInfoListPtr^.Count); Exit; end; end; {$ENDIF} OP_STAR, OP_PLUS, OP_BRACES: begin opnd := scan + REOpSz + RENextOffSz; save := regInput; case scan^ of OP_STAR: begin no := FindRepeated(opnd, MaxInt); LoopCnt := 0 // star end; OP_PLUS: begin no := FindRepeated(opnd, MaxInt); if no < 1 then Exit; LoopCnt := 1 // star end; else begin // braces opnd := AlignToPtr(opnd); no := FindRepeated(opnd + 2 * REBracesArgSz, PREBracesArg(opnd + REBracesArgSz)^); LoopCnt := PREBracesArg(opnd)^; if no < LoopCnt then Exit; end; end; if next^ = OP_EXACTLY then begin // Lookahead to avoid useless match attempts when we know // what character comes next. Local.nextch := (next + REOpSz + RENextOffSz + RENumberSz)^; while no >= LoopCnt do begin // If it could work, try it. if (Local.nextch = #0) or (regInput^ = Local.nextch) then begin if MatchPrim(next) then begin Result := True; Exit; end; if IsBacktrackingGroupAsAtom then Exit; end; Dec(no); // Couldn't or didn't - back up. regInput := save + no; end; { of while } end else begin while no >= LoopCnt do begin if MatchPrim(next) then begin Result := True; Exit; end; if IsBacktrackingGroupAsAtom then Exit; Dec(no); // Couldn't or didn't - back up. regInput := save + no; end; { of while } end; Exit; end; OP_STAR_NG, OP_PLUS_NG, OP_BRACES_NG: begin opnd := scan + REOpSz + RENextOffSz; save := regInput; case scan^ of OP_STAR_NG: begin no := FindRepeated(opnd, MaxInt); LoopCnt := 0 // star end; OP_PLUS_NG: begin no := FindRepeated(opnd, MaxInt); if no < 1 then Exit; LoopCnt := 1 // star end; else begin // braces opnd := AlignToPtr(opnd); no := FindRepeated(opnd + 2 * REBracesArgSz, PREBracesArg(opnd + REBracesArgSz)^); LoopCnt := PREBracesArg(opnd)^; if no < LoopCnt then Exit; end; end; // non-greedy mode // don't repeat more than "no" times // Now we know real Max limit to move forward (for recursion 'back up') // In some cases it can be faster to check only Min positions first, // but after that we have to check every position separtely instead // of fast scannig in loop. if next^ = OP_EXACTLY then begin // Lookahead to avoid useless match attempts when we know // what character comes next. Local.nextch := (next + REOpSz + RENextOffSz + RENumberSz)^; while LoopCnt <= no do begin regInput := save + LoopCnt; // If it could work, try it. if (Local.nextch = #0) or (regInput^ = Local.nextch) then begin if MatchPrim(next) then begin Result := True; Exit; end; if IsBacktrackingGroupAsAtom then Exit; end; Inc(LoopCnt); // Couldn't or didn't - move forward. end; { of while } end else begin while LoopCnt <= no do begin regInput := save + LoopCnt; if MatchPrim(next) then begin Result := True; Exit; end; if IsBacktrackingGroupAsAtom then Exit; Inc(LoopCnt); // Couldn't or didn't - move forward. end; { of while } end; Exit; end; OP_STAR_POSS, OP_PLUS_POSS, OP_BRACES_POSS: begin opnd := scan + REOpSz + RENextOffSz; case scan^ of OP_STAR_POSS: begin FindRepeated(opnd, MaxInt); end; OP_PLUS_POSS: begin if FindRepeated(opnd, MaxInt) < 1 then Exit; end; else begin // braces opnd := AlignToPtr(opnd); if FindRepeated(opnd + 2 * REBracesArgSz, PREBracesArg(opnd + REBracesArgSz)^) < PREBracesArg(opnd)^ then Exit; end; end; end; OP_EEND: begin Result := True; // Success! Exit; end; {$IFDEF FastUnicodeData} OP_ANYCATEGORY: begin if (regInput >= fInputCurrentEnd) then Exit; if not MatchOneCharCategory(scan + REOpSz + RENextOffSz, regInput) then Exit; {$IFDEF UNICODEEX} IncUnicode(regInput); {$ELSE} Inc(regInput); {$ENDIF} end; OP_NOTCATEGORY: begin if (regInput >= fInputCurrentEnd) then Exit; if MatchOneCharCategory(scan + REOpSz + RENextOffSz, regInput) then Exit; {$IFDEF UNICODEEX} IncUnicode(regInput); {$ELSE} Inc(regInput); {$ENDIF} end; {$ENDIF} OP_RECUR: begin // call opcode start if regRecursion < RegexMaxRecursion then begin Inc(regRecursion); if regNumBrackets > 0 then begin CurrentGrpBounds.TmpStart := @GrpBounds[regRecursion].TmpStart[0]; CurrentGrpBounds.GrpStart := @GrpBounds[regRecursion].GrpStart[0]; CurrentGrpBounds.GrpEnd := @GrpBounds[regRecursion].GrpEnd[0]; FillChar(CurrentGrpBounds.GrpStart[0], SizeOf(CurrentGrpBounds.GrpStart[0])*regNumBrackets, 0); end; Result := MatchPrim(regCodeWork); Dec(regRecursion); if regNumBrackets > 0 then begin CurrentGrpBounds.TmpStart := @GrpBounds[regRecursion].TmpStart[0]; CurrentGrpBounds.GrpStart := @GrpBounds[regRecursion].GrpStart[0]; CurrentGrpBounds.GrpEnd := @GrpBounds[regRecursion].GrpEnd[0]; end; if not Result then Exit; Result := False; end else Exit; end; OP_SUBCALL: begin // call subroutine no := PReGroupIndex((scan + REOpSz + RENextOffSz))^; if no < 0 then Exit; save := GrpOpCodes[no]; if save = nil then Exit; if regRecursion < RegexMaxRecursion then begin Local.savedCurrentSubCalled := CurrentSubCalled; CurrentSubCalled := no; Inc(regRecursion); if regNumBrackets > 0 then begin CurrentGrpBounds.TmpStart := @GrpBounds[regRecursion].TmpStart[0]; CurrentGrpBounds.GrpStart := @GrpBounds[regRecursion].GrpStart[0]; CurrentGrpBounds.GrpEnd := @GrpBounds[regRecursion].GrpEnd[0]; FillChar(CurrentGrpBounds.GrpStart[0], SizeOf(CurrentGrpBounds.GrpStart[0])*regNumBrackets, 0); end; Result := MatchPrim(save); Dec(regRecursion); if regNumBrackets > 0 then begin CurrentGrpBounds.TmpStart := @GrpBounds[regRecursion].TmpStart[0]; CurrentGrpBounds.GrpStart := @GrpBounds[regRecursion].GrpStart[0]; CurrentGrpBounds.GrpEnd := @GrpBounds[regRecursion].GrpEnd[0]; end; CurrentSubCalled := Local.savedCurrentSubCalled; if not Result then Exit; Result := False; end else Exit; end; OP_ANYLINEBREAK: begin if (regInput >= fInputCurrentEnd) or not IsAnyLineBreak(regInput^) then Exit; if regInput^ = #13 then begin Inc(regInput); if (regInput < fInputCurrentEnd) and (regInput^ = #10) then Inc(regInput); end else Inc(regInput); end; {$IFDEF WITH_REGEX_ASSERT} else Error(reeMatchPrimMemoryCorruption); Exit; {$ENDIF} end; { of case scan^ } scan := next; end; { of while scan <> nil } end; { of function TRegExpr.MatchPrim -------------------------------------------------------------- } function TRegExpr.Exec(const AInputString: RegExprString): Boolean; begin InputString := AInputString; Result := ExecPrim(1, False, False, 0); end; { of function TRegExpr.Exec -------------------------------------------------------------- } {$IFDEF OverMeth} function TRegExpr.Exec: Boolean; var SlowChecks: Boolean; begin SlowChecks := (fInputEnd - fInputStart < fSlowChecksSizeMax) and (regMustString <> ''); Result := ExecPrim(1, SlowChecks, False, 0); end; { of function TRegExpr.Exec -------------------------------------------------------------- } function TRegExpr.Exec(AOffset: Integer): Boolean; begin // Check that the start position is not negative if AOffset < 1 then begin ClearMatches; Error(reeOffsetMustBePositive); Result := False; Exit; end; Result := ExecPrim(AOffset, False, False, 0); end; { of function TRegExpr.Exec -------------------------------------------------------------- } {$ENDIF} function TRegExpr.ExecPos(AOffset: Integer {$IFDEF DefParam} = 1{$ENDIF}): Boolean; begin // Check that the start position is not negative if AOffset < 1 then begin ClearMatches; Error(reeOffsetMustBePositive); Result := False; Exit; end; Result := ExecPrim(AOffset, False, False, 0); end; { of function TRegExpr.ExecPos -------------------------------------------------------------- } {$IFDEF OverMeth} function TRegExpr.ExecPos(AOffset: Integer; ATryOnce, ABackward: Boolean): Boolean; begin // Check that the start position is not negative if AOffset < 1 then begin ClearMatches; Error(reeOffsetMustBePositive); Result := False; Exit; end; if ATryOnce then Result := ExecPrim(AOffset, False, ABackward, AOffset + 1) else Result := ExecPrim(AOffset, False, ABackward, 0); end; function TRegExpr.ExecPos(AOffset, ATryMatchOnlyStartingBefore: Integer): Boolean; begin // Check that the start position is not negative if AOffset < 1 then begin ClearMatches; Error(reeOffsetMustBePositive); Result := False; Exit; end; if (ATryMatchOnlyStartingBefore > 0) and (AOffset >= ATryMatchOnlyStartingBefore) then begin ClearMatches; Result := False; Exit; end; Result := ExecPrim(AOffset, False, False, ATryMatchOnlyStartingBefore); end; {$ENDIF} function TRegExpr.MatchAtOnePos(APos: PRegExprChar): Boolean; begin regInput := APos; //regNestedCalls := 0; fInputCurrentEnd := fInputEnd; GrpBounds[0].GrpStart[0] := APos; Result := MatchPrim(regCodeWork); if Result then Result := regInput >= GrpBounds[0].GrpStart[0]; if Result then GrpBounds[0].GrpEnd[0] := regInput else GrpBounds[0].GrpStart[0] := nil; end; procedure TRegExpr.ClearMatches; begin if FMatchesCleared then exit; FMatchesCleared := True; if Length(GrpBounds[0].GrpStart) > 0 then FillChar(GrpBounds[0].GrpStart[0], SizeOf(GrpBounds[0].GrpStart[0])*regNumBrackets, 0); end; procedure TRegExpr.ClearInternalExecData; begin fLastError := reeOk; if Length(GrpBacktrackingAsAtom) > 0 then FillChar(GrpBacktrackingAsAtom[0], SizeOf(GrpBacktrackingAsAtom[0])*regNumAtomicBrackets, 0); IsBacktrackingGroupAsAtom := False; {$IFDEF ComplexBraces} // no loops started CurrentLoopInfoListPtr := nil; {$ENDIF} LookAroundInfoList := nil; CurrentSubCalled := -1; regRecursion := 0; if regNumBrackets > 0 then begin CurrentGrpBounds.TmpStart := @GrpBounds[0].TmpStart[0]; CurrentGrpBounds.GrpStart := @GrpBounds[0].GrpStart[0]; CurrentGrpBounds.GrpEnd := @GrpBounds[0].GrpEnd[0]; end; end; procedure TRegExpr.InitInternalGroupData; var BndLen, i: Integer; begin BndLen := GroupDataArraySize(regNumBrackets, Length(GrpBounds[0].GrpStart)); if hasRecursion then begin for i := low(GrpBounds) to high(GrpBounds) do begin SetLength(GrpBounds[i].TmpStart, BndLen); SetLength(GrpBounds[i].GrpStart, BndLen); SetLength(GrpBounds[i].GrpEnd, BndLen); end; end else begin SetLength(GrpBounds[0].TmpStart, BndLen); SetLength(GrpBounds[0].GrpStart, BndLen); SetLength(GrpBounds[0].GrpEnd, BndLen); for i := low(GrpBounds) + 1 to high(GrpBounds) do begin GrpBounds[i].TmpStart := nil; GrpBounds[i].GrpStart := nil; GrpBounds[i].GrpEnd := nil; end; end; SetLength(GrpOpCodes, GroupDataArraySize(regNumBrackets, Length(GrpOpCodes))); SetLength(GrpBacktrackingAsAtom, GroupDataArraySize(regNumAtomicBrackets, Length(GrpBacktrackingAsAtom))); GrpOpCodes[0] := nil; end; function TRegExpr.ExecPrim(AOffset: Integer; ASlowChecks, ABackward: Boolean; ATryMatchOnlyStartingBefore: Integer): Boolean; var Len: Ptrint; begin Result := False; // Ensure that Match cleared either if optimization tricks or some error // will lead to leaving ExecPrim without actual search. That is // important for ExecNext logic and so on. ClearMatches; // Don't check IsProgrammOk here! it causes big slowdown in test_benchmark! if programm = nil then begin if fRaiseForRuntimeError then begin Compile; end else begin try Compile; except on E: ERegExpr do begin Result := False; end; else begin Result := False; fLastError := reeUnknown; end; end; end; if programm = nil then Exit; end; Len := fInputEnd - fInputStart; if FMinMatchLen > Len then Exit; // Check that the start position is not longer than the line if (AOffset - 1) > Len - FMinMatchLen then Exit; // If there is a "must appear" string, look for it. if ASlowChecks then if regMustString <> '' then if StrLPos(fInputStart, PRegExprChar(regMustString), Len, length(regMustString)) = nil then exit; {$IFDEF RegExpWithStackOverflowCheck_DecStack_Frame} StackLimit := StackBottom; if StackLimit <> nil then StackLimit := StackLimit + 36000; // Add for any calls within the current MatchPrim // FPC has "STACK_MARGIN = 16384;", but we need to call Error, ..., raise {$ENDIF} ClearInternalExecData; if fRaiseForRuntimeError then begin Result := ExecPrimProtected(AOffset, ASlowChecks, ABackward, ATryMatchOnlyStartingBefore); end else begin try Result := ExecPrimProtected(AOffset, ASlowChecks, ABackward, ATryMatchOnlyStartingBefore); except on E: EStackOverflow do begin Result := False; fLastError := reeLoopStackExceeded; end; on E: ERegExpr do begin Result := False; end; else begin Result := False; fLastError := reeUnknown; end; end; end; end; function TRegExpr.ExecPrimProtected(AOffset: Integer; ASlowChecks, ABackward: Boolean; ATryMatchOnlyStartingBefore: Integer): Boolean; var Ptr, SearchEnd: PRegExprChar; begin Result := False; Ptr := fInputStart + AOffset - 1; fInputContinue := Ptr; FMatchesCleared := False; // ATryOnce or anchored match (it needs to be tried only once). if (ATryMatchOnlyStartingBefore = AOffset + 1) or (regAnchored in [raBOL, raOnlyOnce, raContinue]) then begin case regAnchored of raBOL: if AOffset > 1 then Exit; // can't match the BOL raEOL: Ptr := fInputEnd; end; {$IFDEF UseFirstCharSet} if (Ptr < fInputEnd) {$IFDEF UnicodeRE} and (Ord(Ptr^) <= $FF) {$ENDIF} then if not FirstCharArray[Byte(Ptr^)] then Exit; {$ENDIF} Result := MatchAtOnePos(Ptr); Exit; end; // Messy cases: unanchored match. if ABackward then begin Inc(Ptr, 2); repeat Dec(Ptr); if Ptr < fInputStart then Exit; {$IFDEF UseFirstCharSet} {$IFDEF UnicodeRE} if Ord(Ptr^) <= $FF then {$ENDIF} if not FirstCharArray[byte(Ptr^)] then Continue; {$ENDIF} Result := MatchAtOnePos(Ptr); // Exit on a match or after testing the end-of-string if Result then Exit; until False; end else begin Dec(Ptr); SearchEnd := fInputEnd - FMinMatchLen; if (ATryMatchOnlyStartingBefore > 0) and (fInputStart + ATryMatchOnlyStartingBefore < SearchEnd) then SearchEnd := fInputStart + ATryMatchOnlyStartingBefore - 2; repeat Inc(Ptr); if Ptr > SearchEnd then Break; {$IFDEF UseFirstCharSet} {$IFDEF UnicodeRE} if Ord(Ptr^) <= $FF then {$ENDIF} if not FirstCharArray[byte(Ptr^)] then Continue; {$ENDIF} Result := MatchAtOnePos(Ptr); // Exit on a match or after testing the end-of-string if Result then Exit; until False; {$IFDEF UseFirstCharSet} if FirstCharArray[0] and (fInputEnd^ <> #0) then Result := MatchAtOnePos(fInputEnd); {$ENDIF} end; end; { of function TRegExpr.ExecPrim -------------------------------------------------------------- } function TRegExpr.ExecNext(ABackward: Boolean {$IFDEF DefParam} = False{$ENDIF}): Boolean; var PtrBegin, PtrEnd: PRegExprChar; Offset: PtrInt; begin PtrBegin := GrpBounds[0].GrpStart[0]; PtrEnd := GrpBounds[0].GrpEnd[0]; if (PtrBegin = nil) or (PtrEnd = nil) then begin Error(reeExecNextWithoutExec); Result := False; Exit; end; Offset := PtrEnd - fInputStart + 1; // prevent infinite looping if empty string matches r.e. if PtrBegin = PtrEnd then Inc(Offset); Result := ExecPrim(Offset, False, ABackward, 0); end; { of function TRegExpr.ExecNext -------------------------------------------------------------- } procedure TRegExpr.SetInputString(const AInputString: RegExprString); begin ClearMatches; fInputString := AInputString; //UniqueString(fInputString); fInputStart := PRegExprChar(fInputString); fInputEnd := fInputStart + Length(fInputString); fInputContinue := fInputStart; end; procedure TRegExpr.SetInputRange(AStart, AEnd, AContinueAnchor: PRegExprChar); begin ClearMatches; fInputString := ''; fInputStart := AStart; fInputEnd := AEnd; fInputContinue := AContinueAnchor; end; {$IFDEF UseLineSep} procedure TRegExpr.SetLineSeparators(const AStr: RegExprString); begin if AStr <> fLineSeparators then begin fLineSeparators := AStr; InitLineSepArray; InvalidateProgramm; end; end; { of procedure TRegExpr.SetLineSeparators -------------------------------------------------------------- } {$ENDIF} procedure TRegExpr.SetUsePairedBreak(AValue: Boolean); begin if AValue <> fUsePairedBreak then begin fUsePairedBreak := AValue; InvalidateProgramm; end; end; function TRegExpr.Substitute(const ATemplate: RegExprString): RegExprString; // perform substitutions after a regexp match var TemplateBeg, TemplateEnd: PRegExprChar; function ParseVarName(var APtr: PRegExprChar): Integer; // extract name of variable: $1 or ${1} or ${name} // from APtr^, uses TemplateEnd var p: PRegExprChar; Delimited: Boolean; GrpName: RegExprString; begin Result := 0; GrpName := ''; p := APtr; Delimited := (p < TemplateEnd) and (p^ = '{'); if Delimited then Inc(p); // skip left curly brace if (p < TemplateEnd) and (p^ = '&') then Inc(p) // this is '$&' or '${&}' else begin if IsDigitChar(p^) then begin while (p < TemplateEnd) and IsDigitChar(p^) do begin Result := Result * 10 + (Ord(p^) - Ord('0')); Inc(p); end end else if Delimited then begin FindGroupName(p, TemplateEnd, '}', GrpName); Result := GrpNames.MatchIndexFromName(GrpName); Inc(p, Length(GrpName)); end; end; if Delimited then if (p < TemplateEnd) and (p^ = '}') then Inc(p) // skip right curly brace else p := APtr; // isn't properly terminated if p = APtr then Result := -1; // no valid digits found or no right curly brace APtr := p; end; procedure FindSubstGroupIndex(var p: PRegExprChar; var Idx: Integer; var NumberFound: Boolean); begin Idx := ParseVarName(p); NumberFound := Idx >= 0; if NumberFound and (Idx > GrpCount) then Idx := -1; end; type TSubstMode = (smodeNormal, smodeOneUpper, smodeOneLower, smodeAllUpper, smodeAllLower); var Mode: TSubstMode; p, p0, p1, ResultPtr: PRegExprChar; ResultLen, n: Integer; Ch, QuotedChar: REChar; GroupFound: Boolean; begin // Check programm and input string if not IsProgrammOk then Exit; // Note: don't check for empty fInputString, it's valid case, // e.g. user needs to replace regex "\b" to "_", it's zero match length if ATemplate = '' then begin Result := ''; Exit; end; TemplateBeg := PRegExprChar(ATemplate); TemplateEnd := TemplateBeg + Length(ATemplate); // Count result length for speed optimization. ResultLen := 0; p := TemplateBeg; while p < TemplateEnd do begin Ch := p^; Inc(p); n := -1; GroupFound := False; if Ch = SubstituteGroupChar then FindSubstGroupIndex(p, n, GroupFound); if GroupFound then begin if (n >= 0) and (GrpBounds[0].GrpStart[n] <> nil) then Inc(ResultLen, GrpBounds[0].GrpEnd[n] - GrpBounds[0].GrpStart[n]); end else begin if (Ch = EscChar) and (p < TemplateEnd) then begin // quoted or special char followed Ch := p^; Inc(p); case Ch of 'n': Inc(ResultLen, Length(fReplaceLineEnd)); 'u', 'l', 'U', 'L': { nothing } ; 'x': begin Inc(ResultLen); if (p^ = '{') then begin // skip \x{....} while ((p^ <> '}') and (p < TemplateEnd)) do p := p + 1; p := p + 1; end else p := p + 2 // skip \x.. end; else Inc(ResultLen); end; end else Inc(ResultLen); end; end; // Get memory. We do it once and it significant speed up work ! if ResultLen = 0 then begin Result := ''; Exit; end; SetLength(Result, ResultLen); // Fill Result ResultPtr := PRegExprChar(Result); p := TemplateBeg; Mode := smodeNormal; while p < TemplateEnd do begin Ch := p^; p0 := p; Inc(p); p1 := p; n := -1; GroupFound := False; if Ch = SubstituteGroupChar then FindSubstGroupIndex(p, n, GroupFound); if GroupFound then begin if n >= 0 then begin p0 := GrpBounds[0].GrpStart[n]; if p0 = nil then p1 := nil else p1 := GrpBounds[0].GrpEnd[n]; end else p1 := p0; end else begin if (Ch = EscChar) and (p < TemplateEnd) then begin // quoted or special char followed Ch := p^; Inc(p); case Ch of 'n': begin p0 := PRegExprChar(fReplaceLineEnd); p1 := p0 + Length(fReplaceLineEnd); end; 'x', 't', 'r', 'f', 'a', 'e': begin p := p - 1; // UnquoteChar expects the escaped char under the pointer QuotedChar := UnQuoteChar(p, TemplateEnd); p := p + 1; // Skip after last part of the escaped sequence - UnquoteChar stops on the last symbol of it p0 := @QuotedChar; p1 := p0 + 1; end; 'l': begin Mode := smodeOneLower; p1 := p0; end; 'L': begin Mode := smodeAllLower; p1 := p0; end; 'u': begin Mode := smodeOneUpper; p1 := p0; end; 'U': begin Mode := smodeAllUpper; p1 := p0; end; else Inc(p0); Inc(p1); end; end end; if p0 < p1 then begin while p0 < p1 do begin case Mode of smodeOneLower: begin ResultPtr^ := _LowerCase(p0^); Mode := smodeNormal; end; smodeAllLower: begin ResultPtr^ := _LowerCase(p0^); end; smodeOneUpper: begin ResultPtr^ := _UpperCase(p0^); Mode := smodeNormal; end; smodeAllUpper: begin ResultPtr^ := _UpperCase(p0^); end; else ResultPtr^ := p0^; end; Inc(ResultPtr); Inc(p0); end; Mode := smodeNormal; end; end; end; { of function TRegExpr.Substitute -------------------------------------------------------------- } procedure TRegExpr.Split(const AInputStr: RegExprString; APieces: TStrings); var PrevPos: PtrInt; begin PrevPos := 1; if Exec(AInputStr) then repeat APieces.Add(System.Copy(AInputStr, PrevPos, MatchPos[0] - PrevPos)); PrevPos := MatchPos[0] + MatchLen[0]; until not ExecNext; APieces.Add(System.Copy(AInputStr, PrevPos, MaxInt)); // Tail end; { of procedure TRegExpr.Split -------------------------------------------------------------- } function TRegExpr.Replace(const AInputStr: RegExprString; const AReplaceStr: RegExprString; AUseSubstitution: Boolean{$IFDEF DefParam} = False{$ENDIF}): RegExprString; var PrevPos: PtrInt; begin Result := ''; PrevPos := 1; if Exec(AInputStr) then repeat Result := Result + System.Copy(AInputStr, PrevPos, MatchPos[0] - PrevPos); if AUseSubstitution then Result := Result + Substitute(AReplaceStr) else Result := Result + AReplaceStr; PrevPos := MatchPos[0] + MatchLen[0]; until not ExecNext; Result := Result + System.Copy(AInputStr, PrevPos, MaxInt); // Tail end; { of function TRegExpr.Replace -------------------------------------------------------------- } function TRegExpr.ReplaceEx(const AInputStr: RegExprString; AReplaceFunc: TRegExprReplaceFunction): RegExprString; var PrevPos: PtrInt; begin Result := ''; PrevPos := 1; if Exec(AInputStr) then repeat Result := Result + System.Copy(AInputStr, PrevPos, MatchPos[0] - PrevPos) + AReplaceFunc(Self); PrevPos := MatchPos[0] + MatchLen[0]; until not ExecNext; Result := Result + System.Copy(AInputStr, PrevPos, MaxInt); // Tail end; { of function TRegExpr.ReplaceEx -------------------------------------------------------------- } {$IFDEF OverMeth} function TRegExpr.Replace(const AInputStr: RegExprString; AReplaceFunc: TRegExprReplaceFunction): RegExprString; begin Result := ReplaceEx(AInputStr, AReplaceFunc); end; { of function TRegExpr.Replace -------------------------------------------------------------- } {$ENDIF} { ============================================================= } { ====================== Debug section ======================== } { ============================================================= } {$IFDEF UseFirstCharSet} procedure TRegExpr.FillFirstCharSet(prog: PRegExprChar); var scan: PRegExprChar; // Current node. Next: PRegExprChar; // Next node. opnd: PRegExprChar; Oper: TREOp; ch: REChar; min_cnt: Integer; {$IFDEF UseLineSep} i: Integer; {$ENDIF} TempSet, TmpFirstCharSet: TRegExprCharset; begin TempSet := []; scan := prog; while scan <> nil do begin Next := regNextQuick(scan); Oper := PREOp(scan)^; case Oper of OP_BSUBEXP, OP_BSUBEXP_CI: begin // we cannot optimize r.e. if it starts with back reference FirstCharSet := RegExprAllSet; Exit; end; OP_BOL, OP_BOL_ML, OP_CONTINUE_POS, OP_RESET_MATCHPOS: ; // Exit; OP_EOL, OP_EOL2, OP_EOL_ML: begin Include(FirstCharSet, 0); if ModifierM then begin {$IFDEF UseLineSep} for i := 1 to Length(LineSeparators) do Include(FirstCharSet, Byte(LineSeparators[i])); {$ELSE} FirstCharSet := FirstCharSet + RegExprLineSeparatorsSet; {$ENDIF} end; Exit; end; OP_BOUND, OP_NOTBOUND: ; OP_ANY, OP_ANY_ML: begin // we can better define ANYML FirstCharSet := RegExprAllSet; Exit; end; OP_ANYDIGIT: begin FirstCharSet := FirstCharSet + RegExprDigitSet; Exit; end; OP_NOTDIGIT: begin FirstCharSet := FirstCharSet + (RegExprAllSet - RegExprDigitSet); Exit; end; OP_ANYLETTER: begin GetCharSetFromWordChars(TempSet); FirstCharSet := FirstCharSet + TempSet; Exit; end; OP_NOTLETTER: begin GetCharSetFromWordChars(TempSet); FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet); Exit; end; OP_ANYSPACE: begin GetCharSetFromSpaceChars(TempSet); FirstCharSet := FirstCharSet + TempSet; Exit; end; OP_NOTSPACE: begin GetCharSetFromSpaceChars(TempSet); FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet); Exit; end; OP_ANYVERTSEP: begin FirstCharSet := FirstCharSet + RegExprLineSeparatorsSet; Exit; end; OP_NOTVERTSEP: begin FirstCharSet := FirstCharSet + (RegExprAllSet - RegExprLineSeparatorsSet); Exit; end; OP_ANYHORZSEP: begin FirstCharSet := FirstCharSet + RegExprHorzSeparatorsSet; Exit; end; OP_NOTHORZSEP: begin FirstCharSet := FirstCharSet + (RegExprAllSet - RegExprHorzSeparatorsSet); Exit; end; OP_EXACTLY_CI: begin ch := (scan + REOpSz + RENextOffSz + RENumberSz)^; {$IFDEF UnicodeRE} if Ord(ch) <= $FF then {$ENDIF} begin Include(FirstCharSet, Byte(ch)); Include(FirstCharSet, Byte(InvertCase(ch))); end; Exit; end; OP_EXACTLY: begin ch := (scan + REOpSz + RENextOffSz + RENumberSz)^; {$IFDEF UnicodeRE} if Ord(ch) <= $FF then {$ENDIF} Include(FirstCharSet, Byte(ch)); Exit; end; OP_ANYOF: begin GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, False, TempSet); FirstCharSet := FirstCharSet + TempSet; Exit; end; OP_ANYBUT: begin GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, False, TempSet); FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet); Exit; end; OP_ANYOF_CI: begin GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, True, TempSet); FirstCharSet := FirstCharSet + TempSet; Exit; end; OP_ANYBUT_CI: begin GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, True, TempSet); FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet); Exit; end; OP_NOTHING: ; OP_COMMENT: ; OP_BACK: begin // No point to rescan the code again Next := PRegExprChar(AlignToPtr(scan + 1)) + RENextOffSz;; end; OP_OPEN, OP_OPEN_ATOMIC: begin FillFirstCharSet(Next); Exit; end; OP_CLOSE, OP_CLOSE_ATOMIC: begin FillFirstCharSet(Next); Exit; end; OP_LOOKAHEAD: begin opnd := PRegExprChar(AlignToPtr(Next + 1)) + RENextOffSz; Next := regNextQuick(Next); TempSet := FirstCharSet; FirstCharSet := []; FillFirstCharSet(Next); // after the lookahead Next := PRegExprChar(AlignToPtr(scan + 1)) + RENextOffSz; TmpFirstCharSet := FirstCharSet; FirstCharSet := []; FillFirstCharSet(Next); // inside the lookahead if TmpFirstCharSet = [] then FirstCharSet := TempSet + FirstCharSet else if FirstCharSet = [] then FirstCharSet := TempSet + TmpFirstCharSet else FirstCharSet := TempSet + (FirstCharSet * TmpFirstCharSet); exit; end; OP_LOOKAHEAD_NEG, OP_LOOKBEHIND, OP_LOOKBEHIND_NEG: begin Next := PRegExprChar(AlignToPtr(Next + 1)) + RENextOffSz; end; OP_LOOKAHEAD_END, OP_LOOKBEHIND_END: begin Exit; end; OP_BRANCH, OP_GBRANCH, OP_GBRANCH_EX, OP_GBRANCH_EX_CI: begin repeat TmpFirstCharSet := FirstCharSet; FirstCharSet := []; FillFirstCharSet(scan + REOpSz + RENextOffSz + REBranchArgSz); FirstCharSet := FirstCharSet + TmpFirstCharSet; scan := regNextQuick(scan); until (scan = nil) or ( (PREOp(scan)^ <> OP_BRANCH) and (PREOp(Next)^ <> OP_GBRANCH) and (PREOp(scan)^ <> OP_GBRANCH_EX) and (PREOp(scan)^ <> OP_GBRANCH_EX_CI) ); Exit; end; {$IFDEF ComplexBraces} OP_LOOPENTRY: begin min_cnt := PREBracesArg(AlignToPtr(Next + REOpSz + RENextOffSz))^; if min_cnt = 0 then begin opnd := regNext(Next); FillFirstCharSet(opnd); // FirstChar may be after loop end; Next := PRegExprChar(AlignToPtr(scan + 1)) + RENextOffSz; end; OP_LOOP, OP_LOOP_NG, OP_LOOP_POSS: begin min_cnt := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz))^; if min_cnt = 0 then Exit; // zero width loop end; {$ENDIF} OP_STAR, OP_STAR_NG, OP_STAR_POSS: FillFirstCharSet(scan + REOpSz + RENextOffSz); OP_PLUS, OP_PLUS_NG, OP_PLUS_POSS: begin FillFirstCharSet(scan + REOpSz + RENextOffSz); Exit; end; OP_BRACES, OP_BRACES_NG, OP_BRACES_POSS: begin opnd := scan + REOpSz + RENextOffSz + REBracesArgSz * 2; min_cnt := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz))^; // BRACES FillFirstCharSet(opnd); if min_cnt > 0 then Exit; end; OP_EEND: begin FirstCharSet := RegExprAllSet; Exit; end; OP_ANYCATEGORY, OP_NOTCATEGORY: begin FirstCharSet := RegExprAllSet; Exit; end; OP_RECUR, OP_SUBCALL: begin // we cannot optimize // TODO: lookup the called group FirstCharSet := RegExprAllSet; Exit; end; OP_ANYLINEBREAK: begin Include(FirstCharSet, Byte(10)); Include(FirstCharSet, Byte(13)); Include(FirstCharSet, Byte($0B)); Include(FirstCharSet, Byte($0C)); Include(FirstCharSet, Byte($85)); Exit; end; else fLastErrorOpcode := Oper; Error(reeUnknownOpcodeInFillFirst); Exit; end; { of case scan^} scan := Next; end; { of while scan <> nil} end; { of procedure FillFirstCharSet --------------------------------------------------------------} {$ENDIF} procedure TRegExpr.InitCharCheckers; var Cnt: Integer; // function Add(AChecker: TRegExprCharChecker): Byte; begin Inc(Cnt); if Cnt > High(CharCheckers) then Error(reeTooSmallCheckersArray); CharCheckers[Cnt - 1] := AChecker; Result := Cnt - 1; end; // begin Cnt := 0; FillChar(CharCheckers, SizeOf(CharCheckers), 0); CheckerIndex_Word := Add(CharChecker_Word); CheckerIndex_NotWord := Add(CharChecker_NotWord); CheckerIndex_Space := Add(CharChecker_Space); CheckerIndex_NotSpace := Add(CharChecker_NotSpace); CheckerIndex_Digit := Add(CharChecker_Digit); CheckerIndex_NotDigit := Add(CharChecker_NotDigit); CheckerIndex_VertSep := Add(CharChecker_VertSep); CheckerIndex_NotVertSep := Add(CharChecker_NotVertSep); CheckerIndex_HorzSep := Add(CharChecker_HorzSep); CheckerIndex_NotHorzSep := Add(CharChecker_NotHorzSep); //CheckerIndex_AllAZ := Add(CharChecker_AllAZ); CheckerIndex_LowerAZ := Add(CharChecker_LowerAZ); CheckerIndex_UpperAZ := Add(CharChecker_UpperAZ); CheckerIndex_AnyLineBreak := Add(CharChecker_AnyLineBreak); SetLength(CharCheckerInfos, 3); with CharCheckerInfos[0] do begin CharBegin := 'a'; CharEnd:= 'z'; CheckerIndex := CheckerIndex_LowerAZ; end; with CharCheckerInfos[1] do begin CharBegin := 'A'; CharEnd := 'Z'; CheckerIndex := CheckerIndex_UpperAZ; end; with CharCheckerInfos[2] do begin CharBegin := '0'; CharEnd := '9'; CheckerIndex := CheckerIndex_Digit; end; end; function TRegExpr.CharChecker_Word(ch: REChar): Boolean; begin Result := IsWordChar(ch); end; function TRegExpr.CharChecker_NotWord(ch: REChar): Boolean; begin Result := not IsWordChar(ch); end; function TRegExpr.CharChecker_Space(ch: REChar): Boolean; begin Result := IsSpaceChar(ch); end; function TRegExpr.CharChecker_NotSpace(ch: REChar): Boolean; begin Result := not IsSpaceChar(ch); end; function TRegExpr.CharChecker_Digit(ch: REChar): Boolean; begin Result := IsDigitChar(ch); end; function TRegExpr.CharChecker_NotDigit(ch: REChar): Boolean; begin Result := not IsDigitChar(ch); end; function TRegExpr.CharChecker_VertSep(ch: REChar): Boolean; begin Result := IsVertLineSeparator(ch); end; function TRegExpr.CharChecker_NotVertSep(ch: REChar): Boolean; begin Result := not IsVertLineSeparator(ch); end; function TRegExpr.CharChecker_AnyLineBreak(ch: REChar): Boolean; begin Result := IsAnyLineBreak(ch); end; function TRegExpr.CharChecker_HorzSep(ch: REChar): Boolean; begin Result := IsHorzSeparator(ch); end; function TRegExpr.CharChecker_NotHorzSep(ch: REChar): Boolean; begin Result := not IsHorzSeparator(ch); end; function TRegExpr.CharChecker_LowerAZ(ch: REChar): Boolean; begin case ch of 'a' .. 'z': Result := True; else Result := False; end; end; function TRegExpr.CharChecker_UpperAZ(ch: REChar): Boolean; begin case ch of 'A' .. 'Z': Result := True; else Result := False; end; end; {$IFDEF RegExpPCodeDump} function TRegExpr.DumpOp(op: TREOp): RegExprString; // printable representation of opcode begin case op of OP_BOL: Result := 'BOL'; OP_EOL: Result := 'EOL'; OP_EOL2: Result := 'EOL2'; OP_BOL_ML: Result := 'BOL_ML'; OP_CONTINUE_POS: Result := 'CONTINUE_POS'; OP_EOL_ML: Result := 'EOL_ML'; OP_BOUND: Result := 'BOUND'; OP_NOTBOUND: Result := 'NOTBOUND'; OP_ANY: Result := 'ANY'; OP_ANY_ML: Result := 'ANY_ML'; OP_ANYLETTER: Result := 'ANYLETTER'; OP_NOTLETTER: Result := 'NOTLETTER'; OP_ANYDIGIT: Result := 'ANYDIGIT'; OP_NOTDIGIT: Result := 'NOTDIGIT'; OP_ANYSPACE: Result := 'ANYSPACE'; OP_NOTSPACE: Result := 'NOTSPACE'; OP_ANYHORZSEP: Result := 'ANYHORZSEP'; OP_NOTHORZSEP: Result := 'NOTHORZSEP'; OP_ANYVERTSEP: Result := 'ANYVERTSEP'; OP_NOTVERTSEP: Result := 'NOTVERTSEP'; OP_ANYOF: Result := 'ANYOF'; OP_ANYBUT: Result := 'ANYBUT'; OP_ANYOF_CI: Result := 'ANYOF_CI'; OP_ANYBUT_CI: Result := 'ANYBUT_CI'; OP_BRANCH: Result := 'BRANCH'; OP_GBRANCH: Result := 'G_BRANCH'; OP_GBRANCH_EX: Result := 'G_BRANCH_EX'; OP_GBRANCH_EX_CI: Result := 'G_BRANCH_EX_CI'; OP_EXACTLY: Result := 'EXACTLY'; OP_EXACTLY_CI: Result := 'EXACTLY_CI'; OP_NOTHING: Result := 'NOTHING'; OP_COMMENT: Result := 'COMMENT'; OP_BACK: Result := 'BACK'; OP_EEND: Result := 'END'; OP_BSUBEXP: Result := 'BSUBEXP'; OP_BSUBEXP_CI: Result := 'BSUBEXP_CI'; OP_OPEN: Result := 'OPEN'; OP_CLOSE: Result := 'CLOSE'; OP_OPEN_ATOMIC: Result := 'OPEN_ATOMIC'; OP_CLOSE_ATOMIC: Result := 'CLOSE_ATOMIC'; OP_LOOKAHEAD: Result := 'LOOKAHEAD'; OP_LOOKAHEAD_NEG: Result := 'LOOKAHEAD_NEG'; OP_LOOKBEHIND: Result := 'LOOKBEHIND'; OP_LOOKBEHIND_NEG: Result := 'LOOKBEHIND_NEG'; OP_LOOKAHEAD_END: Result := 'LOOKAHEAD_END'; OP_LOOKBEHIND_END: Result := 'LOOKBEHIND_END'; OP_STAR: Result := 'STAR'; OP_PLUS: Result := 'PLUS'; OP_BRACES: Result := 'BRACES'; {$IFDEF ComplexBraces} OP_LOOPENTRY: Result := 'LOOPENTRY'; OP_LOOP: Result := 'LOOP'; OP_LOOP_NG: Result := 'LOOP_NG'; OP_LOOP_POSS: Result := 'LOOP_POSS'; {$ENDIF} OP_STAR_NG: Result := 'STAR_NG'; OP_PLUS_NG: Result := 'PLUS_NG'; OP_BRACES_NG: Result := 'BRACES_NG'; OP_STAR_POSS: Result := 'STAR_POSS'; OP_PLUS_POSS: Result := 'PLUS_POSS'; OP_BRACES_POSS: Result := 'BRACES_POSS'; OP_ANYCATEGORY: Result := 'ANYCATEGORY'; OP_NOTCATEGORY: Result := 'NOTCATEGORY'; OP_RECUR: Result := 'RECURSION'; OP_SUBCALL: Result := 'SUBCALL'; OP_ANYLINEBREAK: Result := 'ANYLINEBREAK'; OP_RESET_MATCHPOS: Result := 'RESET_MATCHPOS'; else Error(reeDumpCorruptedOpcode); end; end; { of function TRegExpr.DumpOp -------------------------------------------------------------- } function TRegExpr.IsCompiled: Boolean; begin Result := programm <> nil; end; function PrintableChar(AChar: REChar): RegExprString; {$IFDEF InlineFuncs}inline;{$ENDIF} begin if AChar < ' ' then Result := '#' + IntToStr(Ord(AChar)) else Result := AChar; end; function TRegExpr.DumpCheckerIndex(N: Byte): RegExprString; begin Result := '?'; if N = CheckerIndex_Word then Result := '\w' else if N = CheckerIndex_NotWord then Result := '\W' else if N = CheckerIndex_Digit then Result := '\d' else if N = CheckerIndex_NotDigit then Result := '\D' else if N = CheckerIndex_Space then Result := '\s' else if N = CheckerIndex_NotSpace then Result := '\S' else if N = CheckerIndex_HorzSep then Result := '\h' else if N = CheckerIndex_NotHorzSep then Result := '\H' else if N = CheckerIndex_VertSep then Result := '\v' else if N = CheckerIndex_NotVertSep then Result := '\V' else if N = CheckerIndex_LowerAZ then Result := 'az' else if N = CheckerIndex_UpperAZ then Result := 'AZ' else if N = CheckerIndex_AnyLineBreak then Result := '\R' ; end; function TRegExpr.DumpCategoryChars(ch, ch2: REChar; Positive: Boolean): RegExprString; const S: array[Boolean] of RegExprString = ('P', 'p'); begin Result := '\' + S[Positive] + '{' + ch; if ch2 <> #0 then Result := Result + ch2; Result := Result + '} '; end; function TRegExpr.Dump(Indent: Integer): RegExprString; // dump a regexp in vaguely comprehensible form var s: PRegExprChar; op: TREOp; // Arbitrary non-END op. next, BranchEnd: PRegExprChar; BranchEndStack: Array of PRegExprChar; i, NLen, CurIndent: Integer; Diff: PtrInt; iByte: Byte; ch, ch2: REChar; begin Result := ''; if not IsProgrammOk then Exit; CurIndent := 0; op := OP_EXACTLY; s := regCodeWork; BranchEnd := nil; while op <> OP_EEND do begin // While that wasn't END last time... op := s^; next := regNext(s); if ((op =OP_CLOSE) or (op = OP_CLOSE_ATOMIC) or (op = OP_LOOP) or (op = OP_LOOP_NG) or (op = OP_LOOP_POSS) or (op = OP_LOOKAHEAD_END) or (op = OP_LOOKBEHIND_END) ) and (CurIndent > 0) then dec(CurIndent, Indent); if s = BranchEnd then begin dec(CurIndent, Indent); BranchEnd := nil; if Length(BranchEndStack) > 0 then begin BranchEnd := BranchEndStack[Length(BranchEndStack)-1]; SetLength(BranchEndStack, Length(BranchEndStack)-1); end; end; Result := Result + Format('%3d:%s %s', [s - programm, StringOfChar(' ', CurIndent), DumpOp(s^)]); if (op = OP_OPEN) or (op = OP_OPEN_ATOMIC) or (op = OP_LOOPENTRY) or (op = OP_LOOKAHEAD) or (op = OP_LOOKAHEAD_NEG) or (op = OP_LOOKBEHIND) or (op = OP_LOOKBEHIND_NEG) then inc(CurIndent, Indent); if (op = OP_BRANCH) or (op = OP_GBRANCH) or (op = OP_GBRANCH_EX) or (op = OP_GBRANCH_EX_CI) then begin inc(CurIndent, Indent); if BranchEnd <> nil then begin SetLength(BranchEndStack, Length(BranchEndStack)+1); BranchEndStack[Length(BranchEndStack)-1] := BranchEnd; end; BranchEnd := next; end; // Where, what. if next = nil // Next ptr. then Result := Result + ' (0)' else begin if next > s // PWideChar subtraction workaround (see comments in Tail method for details) then Diff := next - s else Diff := -(s - next); Result := Result + Format(' (%d) ', [(s - programm) + Diff]); end; Inc(s, REOpSz + RENextOffSz); if (op = OP_ANYOF) or (op = OP_ANYOF_CI) or (op = OP_ANYBUT) or (op = OP_ANYBUT_CI) then begin repeat case s^ of OpKind_End: begin Inc(s); Break; end; OpKind_Range: begin Result := Result + 'Rng('; Inc(s); Result := Result + PrintableChar(s^) + '-'; Inc(s); Result := Result + PrintableChar(s^); Result := Result + ') '; Inc(s); end; OpKind_MetaClass: begin Inc(s); Result := Result + DumpCheckerIndex(Byte(s^)) + ' '; Inc(s); end; OpKind_Char: begin Inc(s); NLen := PLongInt(s)^; Inc(s, RENumberSz); Result := Result + 'Ch('; for i := 1 to NLen do begin Result := Result + PrintableChar(s^); Inc(s); end; Result := Result + ') '; end; OpKind_CategoryYes: begin Inc(s); ch := s^; Inc(s); ch2 := s^; Result := Result + DumpCategoryChars(ch, ch2, True); Inc(s); end; OpKind_CategoryNo: begin Inc(s); ch := s^; Inc(s); ch2 := s^; Result := Result + DumpCategoryChars(ch, ch2, False); Inc(s); end; else Error(reeDumpCorruptedOpcode); end; until false; end; if (op = OP_EXACTLY) or (op = OP_EXACTLY_CI) then begin // Literal string, where present. NLen := PLongInt(s)^; Inc(s, RENumberSz); for i := 1 to NLen do begin Result := Result + PrintableChar(s^); Inc(s); end; end; if (op = OP_BSUBEXP) or (op = OP_BSUBEXP_CI) then begin Result := Result + ' \' + IntToStr(PReGroupIndex(s)^); Inc(s, ReGroupIndexSz); end; if (op = OP_SUBCALL) then begin Result := Result + ' (?' + IntToStr(PReGroupIndex(s)^) + ') @' + IntToStr(GrpOpCodes[PReGroupIndex(s)^]-programm); Inc(s, ReGroupIndexSz); end; if (op = OP_OPEN) or (op = OP_OPEN_ATOMIC) or (op = OP_CLOSE) or (op = OP_CLOSE_ATOMIC) then begin Result := Result + ' [' + IntToStr(PReGroupIndex(s)^) + ']'; Inc(s, ReGroupIndexSz); end; if (op = OP_BRACES) or (op = OP_BRACES_NG) or (op = OP_BRACES_POSS) then begin // show min/max argument of braces operator Result := Result + Format('{%d,%d}', [PREBracesArg(AlignToInt(s))^, PREBracesArg(AlignToInt(s + REBracesArgSz))^]); Inc(s, REBracesArgSz * 2); end; {$IFDEF ComplexBraces} if (op = OP_LOOP) or (op = OP_LOOP_NG) or (op = OP_LOOP_POSS) then begin Result := Result + Format(' -> (%d) {%d,%d}', [(s - programm - (REOpSz + RENextOffSz)) + PRENextOff(AlignToPtr(s + 2 * REBracesArgSz))^, PREBracesArg(AlignToInt(s))^, PREBracesArg(AlignToInt(s + REBracesArgSz))^]); Inc(s, 2 * REBracesArgSz + RENextOffSz); end; {$ENDIF} if (op = OP_ANYCATEGORY) or (op = OP_NOTCATEGORY) then begin ch := s^; Inc(s); ch2 := s^; Inc(s); if ch2<>#0 then Result := Result + '{' + ch + ch2 + '}' else Result := Result + '{' + ch + '}'; end; if (op = OP_LOOKBEHIND) or (op = OP_LOOKBEHIND_NEG) then begin if PReOpLookBehindOptions(s)^.IsGreedy = OPT_LOOKBEHIND_FIXED then Result := Result + ' (fixed)' else if PReOpLookBehindOptions(s)^.IsGreedy = OPT_LOOKBEHIND_NON_GREEDY then Result := Result + ' (not greedy)' else Result := Result + ' (greedy)'; Result := Result + ' Len: ' + IntToStr(PReOpLookBehindOptions(s)^.MatchLenMin) + '..' + IntToStr(PReOpLookBehindOptions(s)^.MatchLenMax); Inc(s, ReOpLookBehindOptionsSz); end else if (op = OP_BRANCH) or (op = OP_GBRANCH) then begin Inc(s, REBranchArgSz); end else if (op = OP_GBRANCH_EX) or (op = OP_GBRANCH_EX_CI) then begin Result := Result + ' ' + s^; if (op = OP_GBRANCH_EX_CI) then Result := Result + (s+1)^; Inc(s, REBranchArgSz); end; Result := Result + #$d#$a; end; { of while } // Header fields of interest. case regAnchored of raBOL: Result := Result + 'Anchored(BOL); '; raEOL: Result := Result + 'Anchored(EOL); '; raContinue: Result := Result + 'Anchored(\G); '; raOnlyOnce: Result := Result + 'Anchored(start); '; end; if regMustString <> '' then Result := Result + 'Must have: "' + regMustString + '"; '; {$IFDEF UseFirstCharSet} Result := Result + #$d#$a'First charset: '; if FirstCharSet = [] then Result := Result + '' else if FirstCharSet = RegExprAllSet then Result := Result + '' else for iByte := 0 to 255 do if iByte in FirstCharSet then Result := Result + PrintableChar(REChar(iByte)); {$ENDIF} Result := Result + #$d#$a; end; { of function TRegExpr.Dump -------------------------------------------------------------- } {$ENDIF} function TRegExpr.IsFixedLength(var op: TREOp; var ALen: Integer): Boolean; var s: PRegExprChar; ADummyMaxLen: integer; begin Result := False; if not IsCompiled then Exit; s := regCodeWork; Result := IsPartFixedLength(s, op, ALen, ADummyMaxLen, OP_EEND, nil, []); end; function TRegExpr.IsFixedLengthEx(var op: TREOp; var AMinLen, AMaxLen: integer ): boolean; var s: PRegExprChar; begin Result := False; if not IsCompiled then Exit; s := regCodeWork; Result := IsPartFixedLength(s, op, AMinLen, AMaxLen, OP_EEND, nil, []); end; function TRegExpr.IsPartFixedLength(var prog: PRegExprChar; var op: TREOp; var AMinLen, AMaxLen: integer; StopAt: TREOp; StopMaxProg: PRegExprChar; Flags: TRegExprFindFixedLengthFlags): boolean; function MultiplyLen(AVal, AFactor: Integer): Integer; begin if AFactor > High(AVal) div AVal then Result := high(AVal) else Result := AVal * AFactor; end; procedure IncMaxLen(var AVal: Integer; AInc: Integer); begin if AInc > High(AVal) - AVal then AVal := high(AVal) else AVal := AVal + AInc; end; function MaxStopOrNext(next: PRegExprChar): PRegExprChar; begin Result := next; if (Result = nil) or ( (StopMaxProg <> nil) and (Result > StopMaxProg) ) then Result := StopMaxProg; end; var s, next: PRegExprChar; N, N2, FndMaxLen, ASubLen, ABranchLen, ABranchMaxLen, ASubMaxLen: integer; NotFixedLen, r, NextIsNil: Boolean; FirstVarLenOp: TREOp; begin Result := False; NotFixedLen := False; AMinLen := 0; AMaxLen := High(AMaxLen); FndMaxLen := 0; next := prog; s := prog; repeat NextIsNil := next = nil; next := regNext(s); prog := s; op := s^; if not NotFixedLen then FirstVarLenOp := op; if (op = StopAt) or ((StopMaxProg <> nil) and (s >= StopMaxProg)) or (NextIsNil and (flfReturnAtNextNil in Flags)) then begin AMaxLen := FndMaxLen; op := FirstVarLenOp; if not NotFixedLen then Result := True; Exit; end; Inc(s, REOpSz + RENextOffSz); case op of OP_EEND, OP_BACK: begin AMaxLen := FndMaxLen; op := FirstVarLenOp; if not NotFixedLen then Result := True; Exit; end; OP_BRANCH, OP_GBRANCH, OP_GBRANCH_EX, OP_GBRANCH_EX_CI: begin s := s + REBranchArgSz; if not IsPartFixedLength(s, op, ABranchLen, ABranchMaxLen, OP_EEND, MaxStopOrNext(next), Flags * [flfReturnAtNextNil, flfSkipLookAround]) then begin if not NotFixedLen then FirstVarLenOp := op; NotFixedLen := True; end; s := next; repeat next := regNext(s); s := s + REBranchArgSz; Inc(s, REOpSz + RENextOffSz); if not IsPartFixedLength(s, op, ASubLen, ASubMaxLen, OP_EEND, MaxStopOrNext(next), Flags * [flfReturnAtNextNil, flfSkipLookAround]) then begin if not NotFixedLen then FirstVarLenOp := op; NotFixedLen := True; end; s := next; if (ASubLen <> ABranchLen) then NotFixedLen := True; if ASubLen < ABranchLen then ABranchLen := ASubLen; if ASubMaxLen > ABranchMaxLen then ABranchMaxLen := ASubMaxLen; until (next^ <> OP_BRANCH) and (next^ <> OP_GBRANCH) and (next^ <> OP_GBRANCH_EX) and (next^ <> OP_GBRANCH_EX_CI); AMinLen := AMinLen + ABranchLen; IncMaxLen(FndMaxLen, ABranchMaxLen); end; OP_OPEN: begin Inc(s, ReGroupIndexSz); if not IsPartFixedLength(s, op, ASubLen, ASubMaxLen, OP_CLOSE, StopMaxProg, Flags * [flfReturnAtNextNil, flfSkipLookAround]) then begin if not NotFixedLen then FirstVarLenOp := op; NotFixedLen := True; end; assert(s^=OP_CLOSE); AMinLen := AMinLen + ASubLen; IncMaxLen(FndMaxLen, ASubMaxLen); Inc(s, REOpSz + RENextOffSz + ReGroupIndexSz); // consume the OP_CLOSE continue; end; OP_OPEN_ATOMIC: begin Inc(s, ReGroupIndexSz); if not IsPartFixedLength(s, op, ASubLen, ASubMaxLen, OP_CLOSE_ATOMIC, StopMaxProg, Flags * [flfReturnAtNextNil, flfSkipLookAround]) then begin if not NotFixedLen then FirstVarLenOp := op; NotFixedLen := True; end; assert(s^=OP_CLOSE_ATOMIC); AMinLen := AMinLen + ASubLen; IncMaxLen(FndMaxLen, ASubMaxLen); Inc(s, REOpSz + RENextOffSz + ReGroupIndexSz); // consume the OP_CLOSE_ATOMIC; continue; end; OP_CLOSE, OP_CLOSE_ATOMIC: begin Inc(s, ReGroupIndexSz); continue; end; OP_LOOKAHEAD, OP_LOOKAHEAD_NEG: begin r := IsPartFixedLength(s, op, ASubLen, ASubMaxLen, OP_LOOKAHEAD_END, MaxStopOrNext(next), [flfSkipLookAround] + Flags * [flfReturnAtNextNil]); s := next; Inc(s, REOpSz + RENextOffSz); // skip the OP_LOOKAHEAD_END if not (flfSkipLookAround in Flags) then begin //if not r then NotFixedLen := True; end; end; OP_LOOKBEHIND, OP_LOOKBEHIND_NEG: begin Inc(s, ReOpLookBehindOptionsSz); r := IsPartFixedLength(s, op, ASubLen, ASubMaxLen, OP_LOOKBEHIND_END, MaxStopOrNext(next), [flfSkipLookAround] + Flags * [flfReturnAtNextNil]); s := next; Inc(s, REOpSz + RENextOffSz); // skip the OP_LOOKBEHIND_END if not (flfSkipLookAround in Flags) then NotFixedLen := True end; OP_LOOKAHEAD_END, OP_LOOKBEHIND_END: if flfSkipLookAround in Flags then begin continue; end; OP_NOTHING, OP_COMMENT, OP_BOUND, OP_NOTBOUND, OP_BOL, OP_BOL_ML, OP_EOL, OP_EOL2, OP_EOL_ML, OP_CONTINUE_POS: Continue; OP_ANY, OP_ANY_ML, OP_ANYDIGIT, OP_NOTDIGIT, OP_ANYLETTER, OP_NOTLETTER, OP_ANYSPACE, OP_NOTSPACE, OP_ANYHORZSEP, OP_NOTHORZSEP, OP_ANYVERTSEP, OP_NOTVERTSEP: begin Inc(AMinLen); IncMaxLen(FndMaxLen, 1); Continue; end; OP_ANYOF, OP_ANYOF_CI, OP_ANYBUT, OP_ANYBUT_CI: begin Inc(AMinLen); IncMaxLen(FndMaxLen, 1); repeat case s^ of OpKind_End: begin Inc(s); Break; end; OpKind_Range: begin Inc(s); Inc(s); Inc(s); end; OpKind_MetaClass: begin Inc(s); Inc(s); end; OpKind_Char: begin Inc(s); Inc(s, RENumberSz + PLongInt(s)^); end; OpKind_CategoryYes, OpKind_CategoryNo: begin Inc(s); Inc(s); Inc(s); end; end; until False; end; OP_EXACTLY, OP_EXACTLY_CI: begin N := PLongInt(s)^; Inc(AMinLen, N); IncMaxLen(FndMaxLen, N); Inc(s, RENumberSz + N); Continue; end; OP_ANYCATEGORY, OP_NOTCATEGORY: begin Inc(AMinLen); IncMaxLen(FndMaxLen, 1); Inc(s, 2); Continue; end; OP_BRACES, OP_BRACES_NG, OP_BRACES_POSS: begin // allow only d{n,n} N := PREBracesArg(AlignToInt(s))^; N2 := PREBracesArg(AlignToInt(s + REBracesArgSz))^; Inc(s, REBracesArgSz * 2); r := IsPartFixedLength(s, op, ASubLen, ASubMaxLen, OP_EEND, MaxStopOrNext(next), [flfSkipLookAround, flfReturnAtNextNil]); if not r then begin if not NotFixedLen then FirstVarLenOp := op; end; Inc(AMinLen, MultiplyLen(ASubLen, N)); IncMaxLen(FndMaxLen, MultiplyLen(ASubMaxLen, N2)); if (not r) or (N <> N2) then NotFixedLen := True; s := next; end; OP_BSUBEXP, OP_BSUBEXP_CI, OP_SUBCALL: begin s := next; NotFixedLen := True; // group may be in look-around. Could be anything FndMaxLen := high(FndMaxLen); end; OP_PLUS, OP_PLUS_NG, OP_PLUS_POSS: begin s := next; Inc(AMinLen, 1); FndMaxLen := high(FndMaxLen); NotFixedLen := True end; else // OP_STAR... begin s := next; FndMaxLen := high(FndMaxLen); NotFixedLen := True end; end; until False; end; procedure TRegExpr.SetInputSubString(const AInputString: RegExprString; AInputStartPos, AInputLen: Integer); begin ClearMatches; if AInputStartPos < 1 then AInputStartPos := 1 else if AInputStartPos > Length(AInputString) then AInputStartPos := Length(AInputString) + 1; if AInputLen < 0 then AInputLen := 0 else if AInputLen > Length(AInputString) + 1 - AInputStartPos then AInputLen := Length(AInputString) + 1 - AInputStartPos; fInputString := AInputString; //UniqueString(fInputString); fInputStart := PRegExprChar(fInputString) + AInputStartPos - 1; fInputEnd := fInputStart + AInputLen; fInputContinue := fInputStart; end; {$IFDEF reRealExceptionAddr} {$OPTIMIZATION ON} // ReturnAddr works correctly only if compiler optimization is ON // I placed this method at very end of unit because there are no // way to restore compiler optimization flag ... {$ENDIF} procedure TRegExpr.Error(AErrorID: Integer); {$IFDEF windows} {$IFDEF reRealExceptionAddr} function ReturnAddr: Pointer; asm mov eax,[ebp+4] end; {$ENDIF} {$ENDIF} var e: ERegExpr; Msg: string; begin fLastError := AErrorID; // dummy stub - useless because will raise exception Msg := ErrorMsg(AErrorID); // compilation error ? if AErrorID < reeFirstRuntimeCode then Msg := Msg + ' (pos ' + IntToStr(CompilerErrorPos) + ')'; e := ERegExpr.Create(Msg); e.ErrorCode := AErrorID; e.CompilerErrorPos := CompilerErrorPos; raise e {$IFDEF windows} {$IFDEF reRealExceptionAddr} at ReturnAddr {$ENDIF} {$ENDIF}; end; { of procedure TRegExpr.Error -------------------------------------------------------------- } {$IFDEF Compat} // APIs needed only for users of old FPC 3.0 function TRegExpr.ExecPos(AOffset: Integer; ATryOnce: Boolean): Boolean; overload; begin // Check that the start position is not negative if AOffset < 1 then begin ClearMatches; Error(reeOffsetMustBePositive); Result := False; Exit; end; if ATryOnce then Result := ExecPrim(AOffset, False, False, AOffset + 1) else Result := ExecPrim(AOffset, False, False, 0); end; function TRegExpr.OldInvertCase(const Ch: REChar): REChar; begin Result := _UpperCase(Ch); if Result = Ch then Result := _LowerCase(Ch); end; class function TRegExpr.InvertCaseFunction(const Ch: REChar): REChar; begin Result := _UpperCase(Ch); if Result = Ch then Result := _LowerCase(Ch); end; function TRegExpr.GetLinePairedSeparator: RegExprString; begin // not supported anymore Result := ''; end; procedure TRegExpr.SetLinePairedSeparator(const AValue: RegExprString); begin // not supported anymore end; procedure TRegExpr.SetUseOsLineEndOnReplace(AValue: Boolean); begin if fUseOsLineEndOnReplace = AValue then Exit; fUseOsLineEndOnReplace := AValue; if fUseOsLineEndOnReplace then fReplaceLineEnd := sLineBreak else fReplaceLineEnd := #10; end; {$ENDIF} end.