File: regex\system\text\regularexpressions\RegexFCD.cs
Project: ndp\fx\src\System.csproj (System)
//------------------------------------------------------------------------------
// <copyright file="RegexFCD.cs" company="Microsoft">
//     Copyright (c) Microsoft Corporation.  All rights reserved.
// </copyright>                                                                
//------------------------------------------------------------------------------
 
// This RegexFCD class is internal to the Regex package.
// It builds a bunch of FC information (RegexFC) about
// the regex for optimization purposes.
 
// Implementation notes:
// 
// This step is as simple as walking the tree and emitting
// sequences of codes.
 
namespace System.Text.RegularExpressions {
 
    using System.Collections;
    using System.Globalization;
    
    internal sealed class RegexFCD {
        private int[]      _intStack;
        private int        _intDepth;    
        private RegexFC[]  _fcStack;
        private int        _fcDepth;
        private bool    _skipAllChildren;      // don't process any more children at the current level
        private bool    _skipchild;            // don't process the current child. 
        private bool    _failed = false;
        
        private const int BeforeChild = 64;
        private const int AfterChild = 128;
 
        // where the regex can be pegged
 
        internal  const int Beginning  = 0x0001;
        internal  const int Bol        = 0x0002;
        internal  const int Start      = 0x0004;
        internal  const int Eol        = 0x0008;
        internal  const int EndZ       = 0x0010;
        internal  const int End        = 0x0020;
        internal  const int Boundary   = 0x0040;
        internal  const int ECMABoundary = 0x0080;
 
        /*
         * This is the one of the only two functions that should be called from outside.
         * It takes a RegexTree and computes the set of chars that can start it.
         */
        internal static RegexPrefix FirstChars(RegexTree t) {
            RegexFCD s = new RegexFCD();
            RegexFC fc = s.RegexFCFromRegexTree(t);
 
            if (fc == null || fc._nullable)
                return null;
            
            CultureInfo culture = ((t._options & RegexOptions.CultureInvariant) != 0) ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture;
            return new RegexPrefix(fc.GetFirstChars(culture), fc.IsCaseInsensitive());
        }
 
        /*
         * This is a related computation: it takes a RegexTree and computes the
         * leading substring if it see one. It's quite trivial and gives up easily.
         */
        internal static RegexPrefix Prefix(RegexTree tree) {
            RegexNode curNode;
            RegexNode concatNode = null;
            int nextChild = 0;
 
            curNode = tree._root;
 
            for (;;) {
                switch (curNode._type) {
                    case RegexNode.Concatenate:
                        if (curNode.ChildCount() > 0) {
                            concatNode = curNode;
                            nextChild = 0;
                        }
                        break;
 
                    case RegexNode.Greedy:
                    case RegexNode.Capture:
                        curNode = curNode.Child(0);
                        concatNode = null;
                        continue;
 
                    case RegexNode.Oneloop:
                    case RegexNode.Onelazy:
                        if (curNode._m > 0) {
                            string pref = String.Empty.PadRight(curNode._m, curNode._ch);
                            return new RegexPrefix(pref, 0 != (curNode._options & RegexOptions.IgnoreCase));
                        }
                        else
                            return RegexPrefix.Empty;
                        
                    case RegexNode.One:
                        return new RegexPrefix(curNode._ch.ToString(CultureInfo.InvariantCulture), 0 != (curNode._options & RegexOptions.IgnoreCase));
                            
                    case RegexNode.Multi:
                        return new RegexPrefix(curNode._str, 0 != (curNode._options & RegexOptions.IgnoreCase));
 
                    case RegexNode.Bol:
                    case RegexNode.Eol:
                    case RegexNode.Boundary:
                    case RegexNode.ECMABoundary:
                    case RegexNode.Beginning:
                    case RegexNode.Start:
                    case RegexNode.EndZ:
                    case RegexNode.End:
                    case RegexNode.Empty:
                    case RegexNode.Require:
                    case RegexNode.Prevent:
                        break;
 
                    default:
                        return RegexPrefix.Empty;
                }
 
                if (concatNode == null || nextChild >= concatNode.ChildCount())
                    return RegexPrefix.Empty;
 
                curNode = concatNode.Child(nextChild++);
            }
        }
 
        /*
         * Yet another related computation: it takes a RegexTree and computes the
         * leading anchors that it encounters.
         */
        internal static int Anchors(RegexTree tree) {
            RegexNode curNode;
            RegexNode concatNode = null;
            int nextChild = 0;
            int result = 0;
 
            curNode = tree._root;
 
            for (;;) {
                switch (curNode._type) {
                    case RegexNode.Concatenate:
                        if (curNode.ChildCount() > 0) {
                            concatNode = curNode;
                            nextChild = 0;
                        }
                        break;
 
                    case RegexNode.Greedy:
                    case RegexNode.Capture:
                        curNode = curNode.Child(0);
                        concatNode = null;
                        continue;
 
                    case RegexNode.Bol:
                    case RegexNode.Eol:
                    case RegexNode.Boundary:
                    case RegexNode.ECMABoundary:
                    case RegexNode.Beginning:
                    case RegexNode.Start:
                    case RegexNode.EndZ:
                    case RegexNode.End:
                        return result | AnchorFromType(curNode._type);
 
                    case RegexNode.Empty:
                    case RegexNode.Require:
                    case RegexNode.Prevent:
                        break;
 
                    default:
                        return result;
                }
 
                if (concatNode == null || nextChild >= concatNode.ChildCount())
                    return result;
 
                curNode = concatNode.Child(nextChild++);
            }
        }
 
        /*
         * Convert anchor type to anchor bit.
         */
        private static int AnchorFromType(int type) {
            switch (type) {
                case RegexNode.Bol:             return Bol;         
                case RegexNode.Eol:             return Eol;         
                case RegexNode.Boundary:        return Boundary;    
                case RegexNode.ECMABoundary:    return ECMABoundary;
                case RegexNode.Beginning:       return Beginning;   
                case RegexNode.Start:           return Start;       
                case RegexNode.EndZ:            return EndZ;        
                case RegexNode.End:             return End;         
                default:                        return 0;
            }
        }
 
#if DBG
        internal static String AnchorDescription(int anchors) {
            StringBuilder sb = new StringBuilder();
 
            if (0 != (anchors & Beginning))     sb.Append(", Beginning");
            if (0 != (anchors & Start))         sb.Append(", Start");
            if (0 != (anchors & Bol))           sb.Append(", Bol");
            if (0 != (anchors & Boundary))      sb.Append(", Boundary");
            if (0 != (anchors & ECMABoundary))  sb.Append(", ECMABoundary");
            if (0 != (anchors & Eol))           sb.Append(", Eol");
            if (0 != (anchors & End))           sb.Append(", End");
            if (0 != (anchors & EndZ))          sb.Append(", EndZ");
 
            if (sb.Length >= 2)
                return(sb.ToString(2, sb.Length - 2));
 
            return "None";
        }
#endif
 
        /*
         * private constructor; can't be created outside
         */
        private RegexFCD() {
            _fcStack = new RegexFC[32];
            _intStack = new int[32];
        }
 
        /*
         * To avoid recursion, we use a simple integer stack.
         * This is the push.
         */
        private void PushInt(int I) {
            if (_intDepth >= _intStack.Length) {
                int [] expanded = new int[_intDepth * 2];
 
                System.Array.Copy(_intStack, 0, expanded, 0, _intDepth);
 
                _intStack = expanded;
            }
 
            _intStack[_intDepth++] = I;
        }
 
        /*
         * True if the stack is empty.
         */
        private bool IntIsEmpty() {
            return _intDepth == 0;
        }
 
        /*
         * This is the pop.
         */
        private int PopInt() {
            return _intStack[--_intDepth];
        }
 
        /*
          * We also use a stack of RegexFC objects.
          * This is the push.
          */
        private void PushFC(RegexFC fc) {
            if (_fcDepth >= _fcStack.Length) {
                RegexFC[] expanded = new RegexFC[_fcDepth * 2];
 
                System.Array.Copy(_fcStack, 0, expanded, 0, _fcDepth);
                _fcStack = expanded;
            }
 
            _fcStack[_fcDepth++] = fc;
        }
 
        /*
         * True if the stack is empty.
         */
        private bool FCIsEmpty() {
            return _fcDepth == 0;
        }
 
        /*
         * This is the pop.
         */
        private RegexFC PopFC() {
            return _fcStack[--_fcDepth];
        }
 
        /*
         * This is the top.
         */
        private RegexFC TopFC() {
            return _fcStack[_fcDepth - 1];
        }
 
        /*
         * The main FC computation. It does a shortcutted depth-first walk
         * through the tree and calls CalculateFC to emits code before
         * and after each child of an interior node, and at each leaf.
         */
        private RegexFC RegexFCFromRegexTree(RegexTree tree) {
            RegexNode curNode;
            int curChild;
 
            curNode = tree._root;
            curChild = 0;
 
            for (;;) {
                if (curNode._children == null) {
                    // This is a leaf node
                    CalculateFC(curNode._type, curNode, 0);
                }
                else if (curChild < curNode._children.Count && !_skipAllChildren) {
                    // This is an interior node, and we have more children to analyze
                    CalculateFC(curNode._type | BeforeChild, curNode, curChild);
 
                    if (!_skipchild) {
                        curNode = (RegexNode)curNode._children[curChild];
                        // this stack is how we get a depth first walk of the tree. 
                        PushInt(curChild);
                        curChild = 0;
                    }
                    else {
                        curChild++;
                        _skipchild = false;
                    }
                    continue;
                }
                
                // This is an interior node where we've finished analyzing all the children, or
                // the end of a leaf node. 
                _skipAllChildren = false;
 
                if (IntIsEmpty())
                    break;
 
                curChild = PopInt();
                curNode = curNode._next;
 
                CalculateFC(curNode._type | AfterChild, curNode, curChild);
                if (_failed)
                    return null;
                
                curChild++;
            }
 
            if (FCIsEmpty())
                return null; 
 
            return PopFC();
        }
 
        /*
         * Called in Beforechild to prevent further processing of the current child
         */
        private void SkipChild() {
            _skipchild = true;
        }
 
        /*
         * FC computation and shortcut cases for each node type
         */
        private void CalculateFC(int NodeType, RegexNode node, int CurIndex) {
            bool ci = false;
            bool rtl = false;
 
            if (NodeType <= RegexNode.Ref) {
                if ((node._options & RegexOptions.IgnoreCase) != 0)
                    ci = true;
                if ((node._options & RegexOptions.RightToLeft) != 0)
                    rtl = true;
            }
 
            switch (NodeType) {
                case RegexNode.Concatenate | BeforeChild:
                case RegexNode.Alternate | BeforeChild:
                case RegexNode.Testref | BeforeChild:
                case RegexNode.Loop | BeforeChild:
                case RegexNode.Lazyloop | BeforeChild:
                    break;
 
                case RegexNode.Testgroup | BeforeChild:
                    if (CurIndex == 0)
                        SkipChild();
                    break;
 
                case RegexNode.Empty:
                    PushFC(new RegexFC(true));
                    break;
 
                case RegexNode.Concatenate | AfterChild:
                    if (CurIndex != 0) {
                        RegexFC child = PopFC();
                        RegexFC cumul = TopFC();
 
                        _failed = !cumul.AddFC(child, true);
                    }
 
                    if (!TopFC()._nullable)
                        _skipAllChildren = true;
                    break;
 
                case RegexNode.Testgroup | AfterChild:
                    if (CurIndex > 1) {
                        RegexFC child = PopFC();
                        RegexFC cumul = TopFC();
 
                        _failed = !cumul.AddFC(child, false);
                    }
                    break;
 
                case RegexNode.Alternate | AfterChild:
                case RegexNode.Testref | AfterChild:
                    if (CurIndex != 0) {
                        RegexFC child = PopFC();
                        RegexFC cumul = TopFC();
 
                        _failed = !cumul.AddFC(child, false);
                    }
                    break;
 
                case RegexNode.Loop | AfterChild:
                case RegexNode.Lazyloop | AfterChild:
                    if (node._m == 0)
                        TopFC()._nullable = true;
                    break;
 
                case RegexNode.Group | BeforeChild:
                case RegexNode.Group | AfterChild:
                case RegexNode.Capture | BeforeChild:
                case RegexNode.Capture | AfterChild:
                case RegexNode.Greedy | BeforeChild:
                case RegexNode.Greedy | AfterChild:
                    break;
 
                case RegexNode.Require | BeforeChild:
                case RegexNode.Prevent | BeforeChild:
                    SkipChild();
                    PushFC(new RegexFC(true));
                    break;
 
                case RegexNode.Require | AfterChild:
                case RegexNode.Prevent | AfterChild:
                    break;
 
                case RegexNode.One:
                case RegexNode.Notone:
                    PushFC(new RegexFC(node._ch, NodeType == RegexNode.Notone, false, ci));
                    break;
 
                case RegexNode.Oneloop:
                case RegexNode.Onelazy:
                    PushFC(new RegexFC(node._ch, false, node._m == 0, ci));
                    break;
 
                case RegexNode.Notoneloop:
                case RegexNode.Notonelazy:
                    PushFC(new RegexFC(node._ch, true, node._m == 0, ci));
                    break;
 
                case RegexNode.Multi:
                    if (node._str.Length == 0)
                        PushFC(new RegexFC(true));
                    else if (!rtl)
                        PushFC(new RegexFC(node._str[0], false, false, ci));
                    else
                        PushFC(new RegexFC(node._str[node._str.Length - 1], false, false, ci));
                    break;
 
                case RegexNode.Set:
                    PushFC(new RegexFC(node._str, false, ci));
                    break;
 
                case RegexNode.Setloop:
                case RegexNode.Setlazy:
                    PushFC(new RegexFC(node._str, node._m == 0, ci));
                    break;
 
                case RegexNode.Ref:
                    PushFC(new RegexFC(RegexCharClass.AnyClass, true, false));
                    break;
 
                case RegexNode.Nothing:
                case RegexNode.Bol:
                case RegexNode.Eol:
                case RegexNode.Boundary:
                case RegexNode.Nonboundary:
                case RegexNode.ECMABoundary:
                case RegexNode.NonECMABoundary:
                case RegexNode.Beginning:
                case RegexNode.Start:
                case RegexNode.EndZ:
                case RegexNode.End:
                    PushFC(new RegexFC(true));
                    break;
 
                default:
                    throw new ArgumentException(SR.GetString(SR.UnexpectedOpcode, NodeType.ToString(CultureInfo.CurrentCulture)));
            }
        }
    }
 
    internal sealed class RegexFC {
        internal RegexCharClass _cc;
        internal bool _nullable;
        internal bool _caseInsensitive;
 
        internal RegexFC(bool nullable) {
            _cc = new RegexCharClass();
            _nullable = nullable;
        }
 
        internal RegexFC(char ch, bool not, bool nullable, bool caseInsensitive) {
            _cc = new RegexCharClass();
 
            if (not) {
                if (ch > 0)
                    _cc.AddRange('\0', (char)(ch - 1));
                if (ch < 0xFFFF)
                    _cc.AddRange((char)(ch + 1), '\uFFFF');
            }
            else {
                _cc.AddRange(ch, ch);
            }
 
            _caseInsensitive = caseInsensitive;
            _nullable = nullable;
        }
 
        internal RegexFC(String charClass, bool nullable, bool caseInsensitive) {
            _cc = RegexCharClass.Parse(charClass);
 
            _nullable = nullable;
            _caseInsensitive = caseInsensitive;
        }
 
        internal bool AddFC(RegexFC fc, bool concatenate) {
            if (!_cc.CanMerge || !fc._cc.CanMerge) {
                return false;
            }
            
            if (concatenate) {
                if (!_nullable)
                    return true;
 
                if (!fc._nullable)
                    _nullable = false;
            }
            else {
                if (fc._nullable)
                    _nullable = true;
            }
 
            _caseInsensitive |= fc._caseInsensitive;
            _cc.AddCharClass(fc._cc);
            return true;
        }
 
        internal String GetFirstChars(CultureInfo culture) {
            if (_caseInsensitive)
                _cc.AddLowercase(culture);
 
            return _cc.ToStringClass();
        }
        
        internal bool IsCaseInsensitive() {
            return _caseInsensitive;
        }
    }
 
    internal sealed class RegexPrefix {
        internal String _prefix;
        internal bool _caseInsensitive;
 
        internal static RegexPrefix _empty = new RegexPrefix(String.Empty, false);
 
        internal RegexPrefix(String prefix, bool ci) {
            _prefix = prefix;
            _caseInsensitive = ci;
        }
 
        internal String Prefix {
            get {
                return _prefix;
            }
        }
 
        internal bool CaseInsensitive {
            get {
                return _caseInsensitive;
            }
        }
        internal static RegexPrefix Empty {
            get {
                return _empty;
            }
        }
    }
}