|
//------------------------------------------------------------------------------
// <copyright file="RegexCode.cs" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//------------------------------------------------------------------------------
// This RegexCode class is internal to the regular expression package.
// It provides operator constants for use by the Builder and the Machine.
// Implementation notes:
//
// Regexps are built into RegexCodes, which contain an operation array,
// a string table, and some constants.
//
// Each operation is one of the codes below, followed by the integer
// operands specified for each op.
//
// Strings and sets are indices into a string table.
namespace System.Text.RegularExpressions {
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
internal sealed class RegexCode {
// the following primitive operations come directly from the parser
// lef/back operands description
internal const int Onerep = 0; // lef,back char,min,max a {n}
internal const int Notonerep = 1; // lef,back char,min,max .{n}
internal const int Setrep = 2; // lef,back set,min,max [\d]{n}
internal const int Oneloop = 3; // lef,back char,min,max a {,n}
internal const int Notoneloop = 4; // lef,back char,min,max .{,n}
internal const int Setloop = 5; // lef,back set,min,max [\d]{,n}
internal const int Onelazy = 6; // lef,back char,min,max a {,n}?
internal const int Notonelazy = 7; // lef,back char,min,max .{,n}?
internal const int Setlazy = 8; // lef,back set,min,max [\d]{,n}?
internal const int One = 9; // lef char a
internal const int Notone = 10; // lef char [^a]
internal const int Set = 11; // lef set [a-z\s] \w \s \d
internal const int Multi = 12; // lef string abcd
internal const int Ref = 13; // lef group \#
internal const int Bol = 14; // ^
internal const int Eol = 15; // $
internal const int Boundary = 16; // \b
internal const int Nonboundary = 17; // \B
internal const int Beginning = 18; // \A
internal const int Start = 19; // \G
internal const int EndZ = 20; // \Z
internal const int End = 21; // \Z
internal const int Nothing = 22; // Reject!
// primitive control structures
internal const int Lazybranch = 23; // back jump straight first
internal const int Branchmark = 24; // back jump branch first for loop
internal const int Lazybranchmark = 25; // back jump straight first for loop
internal const int Nullcount = 26; // back val set counter, null mark
internal const int Setcount = 27; // back val set counter, make mark
internal const int Branchcount = 28; // back jump,limit branch++ if zero<=c<limit
internal const int Lazybranchcount= 29; // back jump,limit same, but straight first
internal const int Nullmark = 30; // back save position
internal const int Setmark = 31; // back save position
internal const int Capturemark = 32; // back group define group
internal const int Getmark = 33; // back recall position
internal const int Setjump = 34; // back save backtrack state
internal const int Backjump = 35; // zap back to saved state
internal const int Forejump = 36; // zap backtracking state
internal const int Testref = 37; // backtrack if ref undefined
internal const int Goto = 38; // jump just go
internal const int Prune = 39; // prune it baby
internal const int Stop = 40; // done!
internal const int ECMABoundary = 41; // \b
internal const int NonECMABoundary= 42; // \B
// modifiers for alternate modes
internal const int Mask = 63; // Mask to get unmodified ordinary operator
internal const int Rtl = 64; // bit to indicate that we're reverse scanning.
internal const int Back = 128; // bit to indicate that we're backtracking.
internal const int Back2 = 256; // bit to indicate that we're backtracking on a second branch.
internal const int Ci = 512; // bit to indicate that we're case-insensitive.
// the code
internal int[] _codes; // the code
internal String[] _strings; // the string/set table
// not used! internal int[] _sparseIndex; // a list of the groups that are used
internal int _trackcount; // how many instructions use backtracking
#if SILVERLIGHT
internal Dictionary<Int32, Int32> _caps; // mapping of user group numbers -> impl group slots
#else
internal Hashtable _caps; // mapping of user group numbers -> impl group slots
#endif
internal int _capsize; // number of impl group slots
internal RegexPrefix _fcPrefix; // the set of candidate first characters (may be null)
internal RegexBoyerMoore _bmPrefix; // the fixed prefix string as a Boyer-Moore machine (may be null)
internal int _anchors; // the set of zero-length start anchors (RegexFCD.Bol, etc)
internal bool _rightToLeft; // true if right to left
// optimizations
// constructor
internal RegexCode(int [] codes, List<String> stringlist, int trackcount,
#if SILVERLIGHT
Dictionary<Int32, Int32> caps, int capsize,
#else
Hashtable caps, int capsize,
#endif
RegexBoyerMoore bmPrefix, RegexPrefix fcPrefix,
int anchors, bool rightToLeft) {
_codes = codes;
_strings = new String[stringlist.Count];
_trackcount = trackcount;
_caps = caps;
_capsize = capsize;
_bmPrefix = bmPrefix;
_fcPrefix = fcPrefix;
_anchors = anchors;
_rightToLeft = rightToLeft;
stringlist.CopyTo(0, _strings, 0, stringlist.Count);
}
internal static bool OpcodeBacktracks(int Op) {
Op &= Mask;
switch (Op) {
case Oneloop:
case Notoneloop:
case Setloop:
case Onelazy:
case Notonelazy:
case Setlazy:
case Lazybranch:
case Branchmark:
case Lazybranchmark:
case Nullcount:
case Setcount:
case Branchcount:
case Lazybranchcount:
case Setmark:
case Capturemark:
case Getmark:
case Setjump:
case Backjump:
case Forejump:
case Goto:
return true;
default:
return false;
}
}
internal static int OpcodeSize(int Opcode) {
Opcode &= Mask;
switch (Opcode) {
case Nothing:
case Bol:
case Eol:
case Boundary:
case Nonboundary:
case ECMABoundary:
case NonECMABoundary:
case Beginning:
case Start:
case EndZ:
case End:
case Nullmark:
case Setmark:
case Getmark:
case Setjump:
case Backjump:
case Forejump:
case Stop:
return 1;
case One:
case Notone:
case Multi:
case Ref:
case Testref:
case Goto:
case Nullcount:
case Setcount:
case Lazybranch:
case Branchmark:
case Lazybranchmark:
case Prune:
case Set:
return 2;
case Capturemark:
case Branchcount:
case Lazybranchcount:
case Onerep:
case Notonerep:
case Oneloop:
case Notoneloop:
case Onelazy:
case Notonelazy:
case Setlazy:
case Setrep:
case Setloop:
return 3;
default:
throw MakeException(SR.GetString(SR.UnexpectedOpcode, Opcode.ToString(CultureInfo.CurrentCulture)));
}
}
internal static ArgumentException MakeException(String message) {
return new ArgumentException(message);
}
// Debug only code below
#if DBG
internal static String[] CodeStr = new String[]
{
"Onerep", "Notonerep", "Setrep",
"Oneloop", "Notoneloop", "Setloop",
"Onelazy", "Notonelazy", "Setlazy",
"One", "Notone", "Set",
"Multi", "Ref",
"Bol", "Eol", "Boundary", "Nonboundary", "Beginning", "Start", "EndZ", "End",
"Nothing",
"Lazybranch", "Branchmark", "Lazybranchmark",
"Nullcount", "Setcount", "Branchcount", "Lazybranchcount",
"Nullmark", "Setmark", "Capturemark", "Getmark",
"Setjump", "Backjump", "Forejump", "Testref", "Goto",
"Prune", "Stop",
#if ECMA
"ECMABoundary", "NonECMABoundary",
#endif
};
internal static String OperatorDescription(int Opcode) {
bool isCi = ((Opcode & Ci) != 0);
bool isRtl = ((Opcode & Rtl) != 0);
bool isBack = ((Opcode & Back) != 0);
bool isBack2 = ((Opcode & Back2) != 0);
return CodeStr[Opcode & Mask] +
(isCi ? "-Ci" : "") + (isRtl ? "-Rtl" : "") + (isBack ? "-Back" : "") + (isBack2 ? "-Back2" : "");
}
internal String OpcodeDescription(int offset) {
StringBuilder sb = new StringBuilder();
int opcode = _codes[offset];
sb.AppendFormat("{0:D6} ", offset);
sb.Append(OpcodeBacktracks(opcode & Mask) ? '*' : ' ');
sb.Append(OperatorDescription(opcode));
sb.Append('(');
opcode &= Mask;
switch (opcode) {
case One:
case Notone:
case Onerep:
case Notonerep:
case Oneloop:
case Notoneloop:
case Onelazy:
case Notonelazy:
sb.Append("Ch = ");
sb.Append(RegexCharClass.CharDescription((char)_codes[offset+1]));
break;
case Set:
case Setrep:
case Setloop:
case Setlazy:
sb.Append("Set = ");
sb.Append(RegexCharClass.SetDescription(_strings[_codes[offset+1]]));
break;
case Multi:
sb.Append("String = ");
sb.Append(_strings[_codes[offset+1]]);
break;
case Ref:
case Testref:
sb.Append("Index = ");
sb.Append(_codes[offset+1]);
break;
case Capturemark:
sb.Append("Index = ");
sb.Append(_codes[offset+1]);
if (_codes[offset+2] != -1) {
sb.Append(", Unindex = ");
sb.Append(_codes[offset+2]);
}
break;
case Nullcount:
case Setcount:
sb.Append("Value = ");
sb.Append(_codes[offset+1]);
break;
case Goto:
case Lazybranch:
case Branchmark:
case Lazybranchmark:
case Branchcount:
case Lazybranchcount:
sb.Append("Addr = ");
sb.Append(_codes[offset+1]);
break;
}
switch (opcode) {
case Onerep:
case Notonerep:
case Oneloop:
case Notoneloop:
case Onelazy:
case Notonelazy:
case Setrep:
case Setloop:
case Setlazy:
sb.Append(", Rep = ");
if (_codes[offset + 2] == Int32.MaxValue)
sb.Append("inf");
else
sb.Append(_codes[offset + 2]);
break;
case Branchcount:
case Lazybranchcount:
sb.Append(", Limit = ");
if (_codes[offset + 2] == Int32.MaxValue)
sb.Append("inf");
else
sb.Append(_codes[offset + 2]);
break;
}
sb.Append(")");
return sb.ToString();
}
internal void Dump() {
int i;
Debug.WriteLine("Direction: " + (_rightToLeft ? "right-to-left" : "left-to-right"));
Debug.WriteLine("Firstchars: " + (_fcPrefix == null ? "n/a" : RegexCharClass.SetDescription(_fcPrefix.Prefix)));
Debug.WriteLine("Prefix: " + (_bmPrefix == null ? "n/a" : Regex.Escape(_bmPrefix.ToString())));
Debug.WriteLine("Anchors: " + RegexFCD.AnchorDescription(_anchors));
Debug.WriteLine("");
if (_bmPrefix != null) {
Debug.WriteLine("BoyerMoore:");
Debug.WriteLine(_bmPrefix.Dump(" "));
}
for (i = 0; i < _codes.Length;) {
Debug.WriteLine(OpcodeDescription(i));
i += OpcodeSize(_codes[i]);
}
Debug.WriteLine("");
}
#endif
}
}
|