// File: System\Xml\Xsl\XPath\XPathScanner.cs
// Project: ndp\fx\src\XmlUtils\System.Data.SqlXml.csproj (System.Data.SqlXml)
//------------------------------------------------------------------------------
// <copyright file="XPathScanner.cs" company="Microsoft">
//     Copyright (c) Microsoft Corporation.  All rights reserved.
// </copyright>
// <owner current="true" primary="true">Microsoft</owner>
// <spec>http://www.w3.org/TR/xpath#exprlex</spec>
//------------------------------------------------------------------------------
 
using System.Diagnostics;
 
namespace System.Xml.Xsl.XPath {
    using Res = System.Xml.Utils.Res;
 
    // Kinds of lexemes produced by XPathScanner. Extends XPathOperator enumeration.
    //
    // The members fall into three groups:
    //   * Unknown .. LastOperator  -- operator kinds; CheckOperator() tests "prevKind <= LastOperator".
    //   * DotDot .. Eof            -- multi-character or value-carrying lexemes, numbered sequentially.
    //   * LParens .. RBrace        -- single-character lexemes whose value is the character itself,
    //                                 so the scanner can produce them with a plain (LexKind) cast.
    // NOTE(review): the order of the first group presumably has to stay aligned with XPathOperator,
    // which is declared elsewhere -- confirm before reordering or inserting members.
    internal enum LexKind {
        Unknown,        // Unknown lexeme
        Or,             // Operator 'or'
        And,            // Operator 'and'
        Eq,             // Operator '='
        Ne,             // Operator '!='
        Lt,             // Operator '<'
        Le,             // Operator '<='
        Gt,             // Operator '>'
        Ge,             // Operator '>='
        Plus,           // Operator '+'
        Minus,          // Operator '-'
        Multiply,       // Operator '*'
        Divide,         // Operator 'div'
        Modulo,         // Operator 'mod'
        UnaryMinus,     // Not used
        Union,          // Operator '|'
        LastOperator    = Union,    // Upper bound of the operator range (inclusive)

        DotDot,         // '..'
        ColonColon,     // '::'
        SlashSlash,     // Operator '//'
        Number,         // Number (numeric literal)
        Axis,           // AxisName

        Name,           // NameTest, NodeType, FunctionName, AxisName, second part of VariableReference
        String,         // Literal (string literal)
        Eof,            // End of the expression

        FirstStringable = Name,     // Smallest kind accepted by CheckToken() and LexKindToString()
        LastNonChar     = Eof,      // Every kind greater than this one is a character code

        LParens     = '(',
        RParens     = ')',
        LBracket    = '[',
        RBracket    = ']',
        Dot         = '.',
        At          = '@',
        Comma       = ',',

        Star        = '*',      // NameTest
        Slash       = '/',      // Operator '/'
        Dollar      = '$',      // First part of VariableReference
        RBrace      = '}',      // Used for AVTs
    };
 
    internal sealed class XPathScanner {
        // Expression being scanned. Assigned once by the constructor and never replaced.
        private readonly string xpathExpr;
        // Index of curChar within xpathExpr; reaches xpathExpr.Length at the end of the input.
        private int     curIndex;
        // Character at curIndex, or '\0' once the end of the expression has been reached.
        private char    curChar;
        // Kind of the current lexeme.
        private LexKind kind;
        // Local name of the current Name lexeme ("*" for a "prefix:*" wildcard).
        private string  name;
        // Prefix of the current Name lexeme; empty when the name is unprefixed.
        private string  prefix;
        // Contents of the current String lexeme, without the surrounding quotes.
        private string  stringValue;
        // True when the current Name lexeme is followed (after optional whitespace) by '('.
        private bool    canBeFunction;
        // Index in xpathExpr at which the current lexeme starts.
        private int     lexStart;
        // Value of curIndex when the most recent NextLex() call began.
        private int     prevLexEnd;
        // Kind of the lexeme that preceded the current one; drives operator disambiguation.
        private LexKind prevKind;
        // Axis recognized for the current lexeme, or XPathAxis.Unknown.
        private XPathAxis axis;

        // Character classifier used for whitespace and NCName tests.
        private XmlCharType xmlCharType = XmlCharType.Instance;
 
        // Creates a scanner over the whole expression, positioned on its first lexeme.
        public XPathScanner(string xpathExpr)
            : this(xpathExpr, 0) {
        }

        // Creates a scanner that starts reading xpathExpr at index startFrom and immediately
        // reads the first lexeme, so Kind is valid as soon as the constructor returns.
        public XPathScanner(string xpathExpr, int startFrom) {
            Debug.Assert(xpathExpr != null);

            // No lexeme has been read yet; NextLex() copies this into prevKind.
            kind = LexKind.Unknown;
            this.xpathExpr = xpathExpr;

            SetSourceIndex(startFrom);
            NextLex();
        }
 
        // The complete expression text handed to the constructor.
        public string Source {
            get { return xpathExpr; }
        }

        // Kind of the current lexeme.
        public LexKind Kind {
            get { return kind; }
        }

        // Index in Source at which the current lexeme starts.
        public int LexStart {
            get { return lexStart; }
        }

        // Number of characters between LexStart and the current scanner position.
        public int LexSize {
            get { return curIndex - lexStart; }
        }

        // Scanner position recorded on entry to the most recent NextLex() call.
        public int PrevLexEnd {
            get { return prevLexEnd; }
        }
 
        // Moves the scanner to the given index and loads the character found there
        // ('\0' when the index is at the end of the expression).
        private void SetSourceIndex(int index) {
            Debug.Assert(0 <= index && index <= xpathExpr.Length);
            curIndex = index;
            curChar = (index < xpathExpr.Length) ? xpathExpr[index] : '\0';
        }
 
        // Advances the scanner by one character. Past the last character of the expression
        // curChar becomes the '\0' sentinel.
        private void NextChar() {
            Debug.Assert(-1 <= curIndex && curIndex < xpathExpr.Length);

            int next = ++curIndex;
            if (next >= xpathExpr.Length) {
                Debug.Assert(next == xpathExpr.Length);
                curChar = '\0';
                return;
            }
            curChar = xpathExpr[next];
        }
 
#if XML10_FIFTH_EDITION
        // Returns the character after curChar without moving the scanner,
        // or '\0' when there is no such character.
        private char PeekNextChar() {
            Debug.Assert(-1 <= curIndex && curIndex <= xpathExpr.Length);
            int peekIndex = curIndex + 1;
            return (peekIndex < xpathExpr.Length) ? xpathExpr[peekIndex] : '\0';
        }
#endif
 
        // Local name of the current lexeme ("*" for a "prefix:*" wildcard).
        // Must only be read while Kind is LexKind.Name.
        public string Name {
            get {
                Debug.Assert(kind == LexKind.Name);
                string localName = this.name;
                Debug.Assert(localName != null);
                return localName;
            }
        }
 
        // Prefix of the current lexeme; the empty string when the name has no prefix.
        // Must only be read while Kind is LexKind.Name.
        public string Prefix {
            get {
                Debug.Assert(kind == LexKind.Name);
                string namePrefix = this.prefix;
                Debug.Assert(namePrefix != null);
                return namePrefix;
            }
        }
 
        // Text of the current lexeme exactly as it appears in the expression. At the end of the
        // input there is no text to show, so the "<eof>" placeholder is returned instead.
        public string RawValue {
            get {
                if (kind != LexKind.Eof) {
                    int length = curIndex - lexStart;
                    return xpathExpr.Substring(lexStart, length);
                }
                return LexKindToString(kind);
            }
        }
 
        // Contents of the current string literal, without the enclosing quotes.
        // Must only be read while Kind is LexKind.String.
        public string StringValue {
            get {
                Debug.Assert(kind == LexKind.String);
                string literal = this.stringValue;
                Debug.Assert(literal != null);
                return literal;
            }
        }
 
        // True when the character following a QName (possibly after intervening
        // ExprWhitespace) is '('. In that case the token must be recognized as a NodeType
        // or a FunctionName unless it is an OperatorName. That distinction cannot be made
        // without knowing the previous lexeme: for example, "or" in "... or (1 != 0)" may
        // be either an OperatorName or a FunctionName.
        // Must only be read while Kind is LexKind.Name.
        public bool CanBeFunction {
            get {
                Debug.Assert(kind == LexKind.Name);
                bool followedByParen = this.canBeFunction;
                return followedByParen;
            }
        }
 
        // Axis named by the current lexeme. Must only be read while Kind is LexKind.Axis.
        public XPathAxis Axis {
            get {
                Debug.Assert(kind == LexKind.Axis);
                XPathAxis current = this.axis;
                Debug.Assert(current != XPathAxis.Unknown);
                return current;
            }
        }
 
        // Advances past any run of whitespace characters starting at the current position.
        private void SkipSpace() {
            for (;;) {
                if (!xmlCharType.IsWhiteSpace(curChar)) {
                    return;
                }
                NextChar();
            }
        }
 
        // True for the ten ASCII digits '0'..'9' only; other Unicode digits are not accepted.
        private static bool IsAsciiDigit(char ch) {
            return '0' <= ch && ch <= '9';
        }
 
        // Advances the scanner to the next lexeme.
        //
        // On return 'kind' describes the lexeme that starts at 'lexStart' and ends just before
        // 'curIndex'. Depending on the kind, name/prefix/canBeFunction (Name), axis (Axis) or
        // stringValue (String) are filled in as well. A character that cannot start any lexeme
        // produces LexKind.Unknown. ScanNumber() and ScanString() may throw the exception
        // built by CreateException() for scientific notation and unterminated literals.
        public void NextLex() {
            // Remember where the previous lexeme left the scanner and what it was;
            // CheckOperator() needs prevKind to disambiguate '*' and operator names.
            prevLexEnd = curIndex;
            prevKind = kind;
            SkipSpace();
            lexStart = curIndex;

            switch (curChar) {
                case '\0':
                    // '\0' is the end-of-input sentinel produced by NextChar(); the position is not advanced.
                    kind = LexKind.Eof;
                    return;
                case '(': case ')': case '[': case ']':
                case '@': case ',': case '$': case '}':
                    // These LexKind members are declared with the character itself as their value.
                    kind = (LexKind)curChar;
                    NextChar();
                    break;
                case '.':
                    // '.', '..' or the beginning of a number such as '.5'
                    NextChar();
                    if (curChar == '.') {
                        kind = LexKind.DotDot;
                        NextChar();
                    } else if (IsAsciiDigit(curChar)) {
                        // Rewind so that ScanNumber() sees the leading '.'
                        SetSourceIndex(lexStart);
                        goto case '0';
                    } else {
                        kind = LexKind.Dot;
                    }
                    break;
                case ':':
                    // '::' is a lexeme; a lone ':' is consumed and reported as Unknown
                    NextChar();
                    if (curChar == ':') {
                        kind = LexKind.ColonColon;
                        NextChar();
                    } else {
                        kind = LexKind.Unknown;
                    }
                    break;
                case '*':
                    // A NameTest wildcard unless CheckOperator() reclassifies it as Multiply
                    kind = LexKind.Star;
                    NextChar();
                    CheckOperator(true);
                    break;
                case '/':
                    NextChar();
                    if (curChar == '/') {
                        kind = LexKind.SlashSlash;
                        NextChar();
                    } else {
                        kind = LexKind.Slash;
                    }
                    break;
                case '|':
                    kind = LexKind.Union;
                    NextChar();
                    break;
                case '+':
                    kind = LexKind.Plus;
                    NextChar();
                    break;
                case '-':
                    kind = LexKind.Minus;
                    NextChar();
                    break;
                case '=':
                    kind = LexKind.Eq;
                    NextChar();
                    break;
                case '!':
                    // Only '!=' is a lexeme; a lone '!' is consumed and reported as Unknown
                    NextChar();
                    if (curChar == '=') {
                        kind = LexKind.Ne;
                        NextChar();
                    } else {
                        kind = LexKind.Unknown;
                    }
                    break;
                case '<':
                    NextChar();
                    if (curChar == '=') {
                        kind = LexKind.Le;
                        NextChar();
                    } else {
                        kind = LexKind.Lt;
                    }
                    break;
                case '>':
                    NextChar();
                    if (curChar == '=') {
                        kind = LexKind.Ge;
                        NextChar();
                    } else {
                        kind = LexKind.Gt;
                    }
                    break;
                case '"':
                case '\'':
                    // ScanString() uses curChar as the closing delimiter
                    kind = LexKind.String;
                    ScanString();
                    break;
                case '0': case '1': case '2': case '3':
                case '4': case '5': case '6': case '7':
                case '8': case '9':
                    // Also reached through "goto case '0'" for numbers that start with '.'
                    kind = LexKind.Number;
                    ScanNumber();
                    break;
                default:
                    if (xmlCharType.IsStartNCNameSingleChar(curChar) 
#if XML10_FIFTH_EDITION
                        || xmlCharType.IsNCNameHighSurrogateChar(curChar)
#endif
                        ) {
                        // Start with an unprefixed name and reset all Name-related state;
                        // the branches below refine it.
                        kind = LexKind.Name;
                        this.name   = ScanNCName();
                        this.prefix = string.Empty;
                        this.canBeFunction = false;
                        this.axis = XPathAxis.Unknown;
                        bool colonColon = false;
                        // Position right after the first NCName; look-ahead is undone by returning here
                        int saveSourceIndex = curIndex;

                        // "foo:bar" or "foo:*" -- one lexeme (no spaces allowed)
                        // "foo::" or "foo ::"  -- two lexemes, reported as one (AxisName)
                        // "foo:?" or "foo :?"  -- lexeme "foo" reported
                        if (curChar == ':') {
                            NextChar();
                            if (curChar == ':') {   // "foo::" -> OperatorName, AxisName
                                NextChar();
                                colonColon = true;
                                // Rewind: the '::' is scanned again as its own ColonColon lexeme
                                SetSourceIndex(saveSourceIndex);
                            } else {                // "foo:bar", "foo:*" or "foo:?"
                                if (curChar == '*') {
                                    NextChar();
                                    this.prefix = this.name;
                                    this.name = "*";
                                } else if (xmlCharType.IsStartNCNameSingleChar(curChar) 
#if XML10_FIFTH_EDITION
                                    || xmlCharType.IsNCNameHighSurrogateChar(curChar)
#endif
                                    ) {
                                    this.prefix = this.name;
                                    this.name = ScanNCName();
                                    // Look ahead for '(' to determine whether QName can be a FunctionName
                                    saveSourceIndex = curIndex;
                                    SkipSpace();
                                    this.canBeFunction = (curChar == '(');
                                    // Undo the look-ahead so the lexeme ends right after the local name
                                    SetSourceIndex(saveSourceIndex);
                                } else {            // "foo:?" -> OperatorName, NameTest
                                    // Return "foo" and leave ":" to be reported later as an unknown lexeme
                                    SetSourceIndex(saveSourceIndex);
                                }
                            }
                        } else {
                            SkipSpace();
                            if (curChar == ':') {   // "foo ::" or "foo :?"
                                NextChar();
                                if (curChar == ':') {
                                    NextChar();
                                    colonColon = true;
                                }
                                // Either way the lexeme ends right after the name
                                SetSourceIndex(saveSourceIndex);
                            } else {
                                // NOTE(review): unlike the prefixed-QName branch above, the position is not
                                // restored here, so whitespace skipped after an unprefixed name stays consumed
                                // and is counted by LexSize/RawValue -- confirm whether callers rely on this.
                                this.canBeFunction = (curChar == '(');
                            }
                        }
                        // An operator name ("or", "and", "div", "mod") takes precedence; otherwise
                        // a name followed by '::' is checked against the axis names.
                        if (!CheckOperator(false) && colonColon) {
                            this.axis = CheckAxis();
                        }
                    } else {
                        // Not the start of any lexeme: consume one character and report Unknown
                        kind = LexKind.Unknown;
                        NextChar();
                    }
                    break;
            }
        }
 
        // Decides whether the lexeme just scanned must be read as an operator and, if so,
        // replaces 'kind' with the operator kind and returns true.
        //
        // star == true  : the lexeme is '*', candidate operator is Multiply.
        // star == false : the lexeme is a Name; only the unprefixed names "or", "and", "div"
        //                 and "mod" are candidates.
        //
        // The candidate is an operator only when the previous lexeme can end an operand.
        // It stays a NameTest/Name at the start of the expression, after an unknown lexeme,
        // after another operator, or after one of '/', '//', '@', '::', '(', '[', ',', '$'.
        private bool CheckOperator(bool star) {
            LexKind candidate = LexKind.Multiply;

            if (!star) {
                // Cheap rejection before comparing strings: operator names have no prefix
                // and are at most three characters long.
                if (prefix.Length != 0 || name.Length > 3) {
                    return false;
                }

                if (name == "or") {
                    candidate = LexKind.Or;
                } else if (name == "and") {
                    candidate = LexKind.And;
                } else if (name == "div") {
                    candidate = LexKind.Divide;
                } else if (name == "mod") {
                    candidate = LexKind.Modulo;
                } else {
                    return false;
                }
            }

            // Unknown (also the initial value, i.e. no preceding token) and every operator
            // kind sort at or below LastOperator.
            bool operandExpected =
                prevKind <= LexKind.LastOperator ||
                prevKind == LexKind.Slash ||
                prevKind == LexKind.SlashSlash ||
                prevKind == LexKind.At ||
                prevKind == LexKind.ColonColon ||
                prevKind == LexKind.LParens ||
                prevKind == LexKind.LBracket ||
                prevKind == LexKind.Comma ||
                prevKind == LexKind.Dollar;

            if (operandExpected) {
                return false;
            }

            this.kind = candidate;
            return true;
        }
 
        // Maps the current name to an XPath axis. When the name is one of the thirteen axis
        // names, 'kind' becomes LexKind.Axis and the axis is returned; otherwise 'kind' is
        // set to LexKind.Name and XPathAxis.Unknown is returned.
        private XPathAxis CheckAxis() {
            XPathAxis matched;

            switch (name) {
                case "ancestor":
                    matched = XPathAxis.Ancestor;
                    break;
                case "ancestor-or-self":
                    matched = XPathAxis.AncestorOrSelf;
                    break;
                case "attribute":
                    matched = XPathAxis.Attribute;
                    break;
                case "child":
                    matched = XPathAxis.Child;
                    break;
                case "descendant":
                    matched = XPathAxis.Descendant;
                    break;
                case "descendant-or-self":
                    matched = XPathAxis.DescendantOrSelf;
                    break;
                case "following":
                    matched = XPathAxis.Following;
                    break;
                case "following-sibling":
                    matched = XPathAxis.FollowingSibling;
                    break;
                case "namespace":
                    matched = XPathAxis.Namespace;
                    break;
                case "parent":
                    matched = XPathAxis.Parent;
                    break;
                case "preceding":
                    matched = XPathAxis.Preceding;
                    break;
                case "preceding-sibling":
                    matched = XPathAxis.PrecedingSibling;
                    break;
                case "self":
                    matched = XPathAxis.Self;
                    break;
                default:
                    // Not an axis name: the lexeme is an ordinary Name
                    this.kind = LexKind.Name;
                    return XPathAxis.Unknown;
            }

            this.kind = LexKind.Axis;
            return matched;
        }
 
        // Consumes a numeric literal: optional digits, an optional '.', and optional further
        // digits. An exponent ('e' or 'E', optional sign, digits) is consumed as well and then
        // rejected with an XPath_ScientificNotation exception.
        private void ScanNumber() {
            Debug.Assert(IsAsciiDigit(curChar) || curChar == '.');

            // Integer part; empty when the literal starts with '.'
            for (; IsAsciiDigit(curChar); NextChar()) {
            }

            // Optional fractional part
            if (curChar == '.') {
                NextChar();
                for (; IsAsciiDigit(curChar); NextChar()) {
                }
            }

            bool exponentFollows = (curChar == 'E' || curChar == 'e');
            if (!exponentFollows) {
                return;
            }

            // Consume the whole exponent before failing, so that the scanner position handed
            // to CreateException() lies past the complete literal.
            NextChar();
            if (curChar == '+' || curChar == '-') {
                NextChar();
            }
            for (; IsAsciiDigit(curChar); NextChar()) {
            }
            throw CreateException(Res.XPath_ScientificNotation);
        }
 
        // Consumes a string literal. On entry curChar is the opening quote; the same character
        // closes the literal. The text between the quotes is stored in stringValue and the
        // scanner is left just past the closing quote. When no closing quote exists, the
        // scanner is moved to the end of the expression and XPath_UnclosedString is thrown.
        private void ScanString() {
            char quote = curChar;
            int contentStart = curIndex + 1;
            int closingQuote = xpathExpr.IndexOf(quote, contentStart);

            if (closingQuote >= 0) {
                this.stringValue = xpathExpr.Substring(contentStart, closingQuote - contentStart);
                SetSourceIndex(closingQuote + 1);
                return;
            }

            SetSourceIndex(xpathExpr.Length);
            throw CreateException(Res.XPath_UnclosedString);
        }
 
        // Consumes the longest run of NCName characters starting at the current position and
        // returns it. The caller guarantees that curChar can start an NCName.
        private string ScanNCName() {
            Debug.Assert(xmlCharType.IsStartNCNameSingleChar(curChar)
#if XML10_FIFTH_EDITION
                || xmlCharType.IsNCNameHighSurrogateChar(curChar)
#endif
                );
            int nameStart = curIndex;
            bool inName = true;
            while (inName) {
                if (xmlCharType.IsNCNameSingleChar(curChar)) {
                    NextChar();
                }
#if XML10_FIFTH_EDITION
                else if (xmlCharType.IsNCNameSurrogateChar(PeekNextChar(), curChar)) {
                    // A surrogate pair occupies two chars
                    NextChar();
                    NextChar();
                }
#endif
                else {
                    inName = false;
                }
            }
            int nameLength = curIndex - nameStart;
            return xpathExpr.Substring(nameStart, nameLength);
        }
 
        // Requires the current lexeme to be of kind t (throwing via CheckToken otherwise)
        // and then moves on to the next lexeme.
        public void PassToken(LexKind t) {
            this.CheckToken(t);
            this.NextLex();
        }
 
        // Verifies that the current lexeme is of kind t without consuming it.
        // Throws XPath_EofExpected when the end of the expression was required, and
        // XPath_TokenExpected (naming the expected token) otherwise; both messages
        // include the text of the lexeme actually found.
        public void CheckToken(LexKind t) {
            Debug.Assert(LexKind.FirstStringable <= t);

            if (kind == t) {
                return;
            }

            if (t == LexKind.Eof) {
                throw CreateException(Res.XPath_EofExpected, RawValue);
            }
            throw CreateException(Res.XPath_TokenExpected, LexKindToString(t), RawValue);
        }
 
        // Produces the display form of a token kind for error messages.
        // May be called for the following tokens: Name, String, Eof, Comma, LParens, RParens,
        // LBracket, RBracket, RBrace. Character-valued kinds are rendered as that single
        // character; Name, String and Eof get a bracketed placeholder.
        private string LexKindToString(LexKind t) {
            Debug.Assert(LexKind.FirstStringable <= t);

            if (t > LexKind.LastNonChar) {
                // Kinds above LastNonChar carry the character code as their value
                char symbol = (char)t;
                Debug.Assert("()[].@,*/$}".IndexOf(symbol) >= 0);
                return new string(symbol, 1);
            }

            if (t == LexKind.Name) {
                return "<name>";
            }
            if (t == LexKind.String) {
                return "<string literal>";
            }
            if (t == LexKind.Eof) {
                return "<eof>";
            }

            Debug.Fail("Unexpected LexKind: " + t.ToString());
            return string.Empty;
        }
 
        // Builds (but does not throw) a compile exception for this expression. The start of
        // the current lexeme and the current scanner position are passed along with the
        // resource id and its format arguments.
        public XPathCompileException CreateException(string resId, params string[] args) {
            XPathCompileException error = new XPathCompileException(xpathExpr, lexStart, curIndex, resId, args);
            return error;
        }
    }
}