// ==++==
//
// Copyright (c) Microsoft Corporation. All rights reserved.
//
// ==--==
/*============================================================
**
** CLASS: Tokenizer.cs
**
** <OWNER>Microsoft</OWNER>
**
**
** PURPOSE: Tokenize "Elementary XML", that is, XML without
** attributes or DTDs, in other words, XML with
** elements only.
**
**
===========================================================*/
namespace System.Security.Util
{
using System.Text;
using System;
using System.IO;
using System.Globalization;
using System.Diagnostics.Contracts;
internal sealed class Tokenizer
{
// There are eight externally knowable token types: bras, kets,
// slashes, cstrs, equals, quests, bangs, and dashes.
internal const byte bra = 0;
internal const byte ket = 1;
internal const byte slash = 2;
internal const byte cstr = 3;
internal const byte equals = 4;
internal const byte quest = 5;
internal const byte bang = 6;
internal const byte dash = 7;
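// for example, the input "<a>" tokenizes as: bra, cstr ("a"), ket
// (see the illustrative usage sketch after GetTokens below)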
// these are the assorted text characters that the tokenizer must
// understand to do its job
internal const int intOpenBracket = (int) '<';
internal const int intCloseBracket = (int) '>';
internal const int intSlash = (int) '/';
internal const int intEquals = (int) '=';
internal const int intQuote = (int) '\"';
internal const int intQuest = (int) '?';
internal const int intBang = (int) '!';
internal const int intDash = (int) '-';
internal const int intTab = (int) '\t';
internal const int intCR = (int) '\r';
internal const int intLF = (int) '\n';
internal const int intSpace = (int) ' ';
// this tells us where we will be getting our input from
// and what the encoding is (if any)
private enum TokenSource
{
UnicodeByteArray, // this is little-endian unicode (there are other encodings)
UTF8ByteArray,
ASCIIByteArray,
CharArray,
String,
NestedStrings,
Other
}
// byte streams can come in 3 different flavors
internal enum ByteTokenEncoding
{
UnicodeTokens, // this is little-endian unicode (there are other encodings)
UTF8Tokens,
ByteTokens
}
public int LineNo;
// these variables represent the input state of the tokenizer
private int _inProcessingTag;
private byte[] _inBytes;
private char[] _inChars;
private String _inString;
private int _inIndex;
private int _inSize;
private int _inSavedCharacter;
private TokenSource _inTokenSource;
private ITokenReader _inTokenReader;
// these variables are used to build and deliver tokenizer output strings
private StringMaker _maker = null;
private String[] _searchStrings;
private String[] _replaceStrings;
private int _inNestedIndex;
private int _inNestedSize;
private String _inNestedString;
//================================================================
// Common initialization shared by all of the constructors
//
internal void BasicInitialization()
{
LineNo = 1;
_inProcessingTag = 0;
_inSavedCharacter = -1;
_inIndex = 0;
_inSize = 0;
_inNestedSize = 0;
_inNestedIndex = 0;
_inTokenSource = TokenSource.Other;
_maker = System.SharedStatics.GetSharedStringMaker();
}
public void Recycle()
{
System.SharedStatics.ReleaseSharedStringMaker(ref _maker); // will set _maker to null
}
internal Tokenizer (String input)
{
BasicInitialization();
_inString = input;
_inSize = input.Length;
_inTokenSource = TokenSource.String;
}
internal Tokenizer (String input, String[] searchStrings, String[] replaceStrings)
{
BasicInitialization();
_inString = input;
_inSize = _inString.Length;
_inTokenSource = TokenSource.NestedStrings;
_searchStrings = searchStrings;
_replaceStrings = replaceStrings;
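// the search strings are "{...}" slugs that get spliced out of the
// input and replaced by the corresponding replace string; e.g. with
// searchStrings = { "{machine}" } and replaceStrings = { "MyMachine" }
// (hypothetical values), "<{machine}/>" tokenizes as "<MyMachine/>"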
#if _DEBUG
Contract.Assert(searchStrings.Length == replaceStrings.Length, "different number of search/replace strings");
Contract.Assert(searchStrings.Length != 0, "no search replace strings, shouldn't be using this ctor");
for (int istr=0; istr<searchStrings.Length; istr++)
{
String str = searchStrings[istr];
Contract.Assert( str != null, "XML Slug null");
Contract.Assert( str.Length >= 3 , "XML Slug too small");
Contract.Assert( str[0] == '{', "XML Slug doesn't start with '{'" );
Contract.Assert( str[str.Length-1] == '}', "XML Slug doesn't end with '}'" );
str = replaceStrings[istr];
Contract.Assert( str != null, "XML Replacement null");
Contract.Assert( str.Length >= 1, "XML Replacement empty");
}
#endif
}
internal Tokenizer (byte[] array, ByteTokenEncoding encoding, int startIndex)
{
BasicInitialization();
_inBytes = array;
_inSize = array.Length;
_inIndex = startIndex;
switch (encoding)
{
case ByteTokenEncoding.UnicodeTokens:
_inTokenSource = TokenSource.UnicodeByteArray;
break;
case ByteTokenEncoding.UTF8Tokens:
_inTokenSource = TokenSource.UTF8ByteArray;
break;
case ByteTokenEncoding.ByteTokens:
_inTokenSource = TokenSource.ASCIIByteArray;
break;
default:
throw new ArgumentException(Environment.GetResourceString("Arg_EnumIllegalVal", (int)encoding));
}
}
internal Tokenizer (char[] array)
{
BasicInitialization();
_inChars = array;
_inSize = array.Length;
_inTokenSource = TokenSource.CharArray;
}
internal Tokenizer (StreamReader input)
{
BasicInitialization();
_inTokenReader = new StreamTokenReader(input);
}
internal void ChangeFormat( System.Text.Encoding encoding )
{
if (encoding == null)
{
return;
}
Contract.Assert( _inSavedCharacter == -1, "There was a lookahead character at the stream change point, which means the parser is changing encodings too late" );
switch (_inTokenSource)
{
case TokenSource.UnicodeByteArray:
case TokenSource.UTF8ByteArray:
case TokenSource.ASCIIByteArray:
// these are the ones we can change on the fly
if (encoding == System.Text.Encoding.Unicode)
{
_inTokenSource = TokenSource.UnicodeByteArray;
return;
}
if (encoding == System.Text.Encoding.UTF8)
{
_inTokenSource = TokenSource.UTF8ByteArray;
return;
}
#if FEATURE_ASCII
if (encoding == System.Text.Encoding.ASCII)
{
_inTokenSource = TokenSource.ASCIIByteArray;
return;
}
#endif
break;
case TokenSource.String:
case TokenSource.CharArray:
case TokenSource.NestedStrings:
// these are already unicode and encoding changes are moot;
// they can't be further decoded
return;
}
// if we're here, it means we don't know how to change to the
// desired encoding with the memory that we have; we'll have to do
// this the hard way -- that means creating a suitable stream from
// what we've got. thankfully this is the rare case, as UTF8 and
// unicode dominate the scene
Stream stream = null;
switch (_inTokenSource)
{
case TokenSource.UnicodeByteArray:
case TokenSource.UTF8ByteArray:
case TokenSource.ASCIIByteArray:
stream = new MemoryStream(_inBytes, _inIndex, _inSize - _inIndex);
break;
case TokenSource.CharArray:
case TokenSource.String:
case TokenSource.NestedStrings:
Contract.Assert(false, "attempting to change encoding on a non-changable source, should have been prevented earlier" );
return;
default:
StreamTokenReader reader = _inTokenReader as StreamTokenReader;
if (reader == null)
{
Contract.Assert(false, "A new input source type has been added to the Tokenizer but it doesn't support encoding changes");
return;
}
stream = reader._in.BaseStream;
Contract.Assert( reader._in.CurrentEncoding != null, "Tokenizer's StreamReader does not have an encoding" );
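// reposition the stream past the characters already consumed by
// encoding a dummy string of the same length; note this assumes
// every consumed character encoded to the same number of bytes
// as ' ' does under the current encoding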
String fakeReadString = new String(' ', reader.NumCharEncountered);
stream.Position = reader._in.CurrentEncoding.GetByteCount( fakeReadString );
break;
}
Contract.Assert(stream != null, "The XML stream with new encoding was not properly initialized for the kind of input we had");
// we now have an initialized memory stream based on whatever source we had before
_inTokenReader = new StreamTokenReader( new StreamReader( stream, encoding ) );
_inTokenSource = TokenSource.Other;
}
internal void GetTokens( TokenizerStream stream, int maxNum, bool endAfterKet )
{
while (maxNum == -1 || stream.GetTokenCount() < maxNum)
{
int i = -1;
byte ch;
int cb = 0;
bool inLiteral = false;
bool inQuotedString = false;
StringMaker m = _maker;
m._outStringBuilder = null;
m._outIndex = 0;
BEGINNING:
if (_inSavedCharacter != -1)
{
i = _inSavedCharacter;
_inSavedCharacter = -1;
}
else switch (_inTokenSource)
{
case TokenSource.UnicodeByteArray:
if (_inIndex + 1 >= _inSize)
{
stream.AddToken( -1 );
return;
}
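// the pair is little-endian: low byte first, e.g. { 0x3C, 0x00 }
// decodes to '<' (U+003C)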
i = (int)((_inBytes[_inIndex+1]<<8) + _inBytes[_inIndex]);
_inIndex += 2;
break;
case TokenSource.UTF8ByteArray:
if (_inIndex >= _inSize)
{
stream.AddToken( -1 );
return;
}
i = (int)(_inBytes[_inIndex++]);
// single-byte case -- early out, as we're done already
if ((i & 0x80) == 0x00)
break;
// to decode the lead byte switch on the high nibble
// shifted down so the switch gets dense integers
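// worked example: the sequence 0xC3 0xA9 hits the 0xc case below,
// leaving i = 0x03; the trail byte then contributes its low six
// bits, so i becomes (0x03 << 6) | 0x29 = 0xE9, i.e. 'é' (U+00E9)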
switch ((i & 0xf0) >>4)
{
case 0x8: // 1000 (together these 4 make 10xxxxxx)
case 0x9: // 1001
case 0xa: // 1010
case 0xb: // 1011
// trail byte is an error
throw new XmlSyntaxException( LineNo );
case 0xc: // 1100 (these two make 110xxxxx)
case 0xd: // 1101
// two byte encoding (1 trail byte)
i &= 0x1f;
cb = 2;
break;
case 0xe: // 1110 (this gets us 1110xxxx)
// three byte encoding (2 trail bytes)
i &= 0x0f;
cb = 3;
break;
case 0xf: // 1111 (and finally 1111xxxx)
// 4 byte encoding is an error
throw new XmlSyntaxException( LineNo );
}
// at least one trail byte, fetch it
if (_inIndex >= _inSize)
throw new XmlSyntaxException (LineNo, Environment.GetResourceString( "XMLSyntax_UnexpectedEndOfFile" ));
ch = _inBytes[_inIndex++];
// must be trail byte encoding
if ((ch & 0xc0) != 0x80)
throw new XmlSyntaxException( LineNo );
i = (i<<6) | (ch & 0x3f);
// done now if 2 byte encoding, otherwise go for 3
if (cb == 2)
break;
if (_inIndex >= _inSize)
throw new XmlSyntaxException (LineNo, Environment.GetResourceString( "XMLSyntax_UnexpectedEndOfFile" ));
ch = _inBytes[_inIndex++];
// must be trail byte encoding
if ((ch & 0xc0) != 0x80)
throw new XmlSyntaxException( LineNo );
i = (i<<6) | (ch & 0x3f);
break;
case TokenSource.ASCIIByteArray:
if (_inIndex >= _inSize)
{
stream.AddToken( -1 );
return;
}
i = (int)(_inBytes[_inIndex++]);
break;
case TokenSource.CharArray:
if (_inIndex >= _inSize)
{
stream.AddToken( -1 );
return;
}
i = (int)(_inChars[_inIndex++]);
break;
case TokenSource.String:
if (_inIndex >= _inSize)
{
stream.AddToken( -1 );
return;
}
i = (int)(_inString[_inIndex++]);
break;
case TokenSource.NestedStrings:
if (_inNestedSize != 0)
{
if (_inNestedIndex < _inNestedSize)
{
i = _inNestedString[_inNestedIndex++];
break;
}
_inNestedSize = 0;
}
if (_inIndex >= _inSize)
{
stream.AddToken( -1 );
return;
}
i = (int)(_inString[_inIndex++]);
if (i != '{')
break;
for (int istr=0; istr < _searchStrings.Length; istr++)
{
if (0==String.Compare(_searchStrings[istr], 0, _inString, _inIndex-1, _searchStrings[istr].Length, StringComparison.Ordinal))
{
_inNestedString = _replaceStrings[istr];
_inNestedSize = _inNestedString.Length;
_inNestedIndex = 1;
i = _inNestedString[0];
_inIndex += _searchStrings[istr].Length - 1;
break;
}
}
break;
default:
i = _inTokenReader.Read();
if (i == -1)
{
stream.AddToken( -1 );
return;
}
break;
}
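// the scanner runs in two modes: outside a literal, structural
// characters become tokens immediately and whitespace is skipped;
// once an ordinary character is seen we flip to literal mode and
// accumulate characters until a delimiter closes the cstr token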
if (!inLiteral)
{
switch (i)
{
// skip whitespace
case intSpace:
case intTab:
case intCR:
goto BEGINNING;
// count linefeeds
case intLF:
LineNo++;
goto BEGINNING;
case intOpenBracket:
_inProcessingTag++;
stream.AddToken( bra );
continue;
case intCloseBracket:
_inProcessingTag--;
stream.AddToken( ket );
if (endAfterKet)
return;
continue;
case intEquals:
stream.AddToken( equals );
continue;
case intSlash:
if (_inProcessingTag != 0)
{
stream.AddToken( slash );
continue;
}
break;
case intQuest:
if (_inProcessingTag != 0)
{
stream.AddToken( quest );
continue;
}
break;
case intBang:
if (_inProcessingTag != 0)
{
stream.AddToken( bang );
continue;
}
break;
case intDash:
if (_inProcessingTag != 0)
{
stream.AddToken( dash );
continue;
}
break;
case intQuote:
inLiteral = true;
inQuotedString = true;
goto BEGINNING;
}
}
else
{
switch (i)
{
case intOpenBracket:
if (!inQuotedString)
{
_inSavedCharacter = i;
stream.AddToken( cstr );
stream.AddString( this.GetStringToken() );
continue;
}
break;
case intCloseBracket:
case intEquals:
case intSlash:
if (!inQuotedString && _inProcessingTag != 0)
{
_inSavedCharacter = i;
stream.AddToken( cstr );
stream.AddString( this.GetStringToken() );
continue;
}
break;
case intQuote:
if (inQuotedString)
{
stream.AddToken( cstr );
stream.AddString( this.GetStringToken() );
continue;
}
break;
case intTab:
case intCR:
case intSpace:
if (!inQuotedString)
{
stream.AddToken( cstr );
stream.AddString( this.GetStringToken() );
continue;
}
break;
// count linefeeds
case intLF:
LineNo++;
if (!inQuotedString)
{
stream.AddToken( cstr );
stream.AddString( this.GetStringToken() );
continue;
}
break;
}
}
inLiteral = true;
// add character to the string
if (m._outIndex < StringMaker.outMaxSize)
{
// easy case
m._outChars[m._outIndex++] = (char)i;
}
else
{
if (m._outStringBuilder == null)
{
// OK, first check if we have to init the StringBuilder
m._outStringBuilder = new StringBuilder();
}
// OK, copy from _outChars to _outStringBuilder
m._outStringBuilder.Append(m._outChars, 0, StringMaker.outMaxSize);
// restart the _outChars buffer with the current character
m._outChars[0] = (char)i;
m._outIndex = 1;
}
goto BEGINNING;
}
}
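// illustrative usage sketch (a hypothetical caller, not part of
// this file):
//
//    Tokenizer tokenizer = new Tokenizer("<IPermission/>");
//    TokenizerStream stream = new TokenizerStream();
//    tokenizer.GetTokens(stream, -1, false);   // -1: no cap, read to end of input
//    stream.Reset();
//    short t = stream.GetNextToken();          // Tokenizer.bra
//    t = stream.GetNextToken();                // Tokenizer.cstr ...
//    String name = stream.GetNextString();     //   ... whose text is "IPermission"
//    t = stream.GetNextToken();                // Tokenizer.slash
//    t = stream.GetNextToken();                // Tokenizer.ket
//    tokenizer.Recycle();                      // return the shared StringMaker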
[Serializable]
internal sealed class StringMaker
{
String[] aStrings;
uint cStringsMax;
uint cStringsUsed;
public StringBuilder _outStringBuilder;
public char[] _outChars;
public int _outIndex;
public const int outMaxSize = 512;
static uint HashString(String str)
{
uint hash = 0;
int l = str.Length;
// rotate in string character
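// (hash << 3) ^ (hash >> 29) is a 3-bit left rotation of the 32-bit
// hash (the two shifted halves occupy disjoint bits), so each step
// computes rotl(hash, 3) ^ character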
for (int i=0; i < l; i++)
{
hash = (hash << 3) ^ (uint)str[i] ^ (hash >> 29);
}
return hash;
}
static uint HashCharArray(char[] a, int l)
{
uint hash = 0;
// rotate in a character
for (int i=0; i < l; i++)
{
hash = (hash << 3) ^ (uint)a[i] ^ (hash >> 29);
}
return hash;
}
public StringMaker()
{
cStringsMax = 2048;
cStringsUsed = 0;
aStrings = new String[cStringsMax];
_outChars = new char[outMaxSize];
}
bool CompareStringAndChars(String str, char [] a, int l)
{
if (str.Length != l)
return false;
for (int i=0; i<l; i++)
if (a[i] != str[i])
return false;
return true;
}
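// MakeString interns the accumulated characters in a simple
// open-addressed hash table with linear probing; the table is
// doubled and rehashed once it passes 3/4 occupancy, so repeated
// tokens (tag names, attribute names) share a single String instance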
public String MakeString()
{
uint hash;
char[] a = _outChars;
int l = _outIndex;
// if we have a stringbuilder then we have to append... slow case
if (_outStringBuilder != null)
{
_outStringBuilder.Append(_outChars, 0, _outIndex);
return _outStringBuilder.ToString();
}
// no stringbuilder, fast case, shareable string
if (cStringsUsed > (cStringsMax / 4) * 3)
{
// we need to rehash
uint cNewMax = cStringsMax * 2;
String [] aStringsNew = new String[cNewMax];
for (int i=0; i < cStringsMax;i++)
{
if (aStrings[i] != null)
{
hash = HashString(aStrings[i]) % cNewMax;
while (aStringsNew[hash] != null)
{
// slot full, skip
if (++hash >= cNewMax)
hash = 0;
}
aStringsNew[hash] = aStrings[i];
}
}
// all done, cutover to the new hash table
cStringsMax = cNewMax;
aStrings = aStringsNew;
}
hash = HashCharArray(a, l) % cStringsMax;
String str;
while ((str = aStrings[hash]) != null)
{
if (CompareStringAndChars(str, a, l))
return str;
if (++hash >= cStringsMax)
hash = 0;
}
str = new String(a,0,l);
aStrings[hash] = str;
cStringsUsed++;
return str;
}
}
//================================================================
// Deliver the accumulated token text as a (possibly shared) String
//
private String GetStringToken ()
{
return _maker.MakeString();
}
internal interface ITokenReader
{
int Read();
}
internal class StreamTokenReader : ITokenReader
{
internal StreamReader _in;
internal int _numCharRead;
internal StreamTokenReader(StreamReader input)
{
_in = input;
_numCharRead = 0;
}
public virtual int Read()
{
int value = _in.Read();
if (value != -1)
_numCharRead++;
return value;
}
internal int NumCharEncountered
{
get
{
return _numCharRead;
}
}
}
}
internal sealed class TokenizerShortBlock
{
internal short[] m_block = new short[16];
internal TokenizerShortBlock m_next = null;
}
internal sealed class TokenizerStringBlock
{
internal String[] m_block = new String[16];
internal TokenizerStringBlock m_next = null;
}
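// TokenizerStream records tokens and their companion strings in two
// chains of fixed-size (16-entry) blocks, growing one block at a
// time; Reset rewinds to the head so the same stream can be replayed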
internal sealed class TokenizerStream
{
private int m_countTokens;
private TokenizerShortBlock m_headTokens;
private TokenizerShortBlock m_lastTokens;
private TokenizerShortBlock m_currentTokens;
private int m_indexTokens;
private TokenizerStringBlock m_headStrings;
private TokenizerStringBlock m_currentStrings;
private int m_indexStrings;
#if _DEBUG
private bool m_bLastWasCStr;
#endif
internal TokenizerStream()
{
m_countTokens = 0;
m_headTokens = new TokenizerShortBlock();
m_headStrings = new TokenizerStringBlock();
Reset();
}
internal void AddToken( short token )
{
if (m_currentTokens.m_block.Length <= m_indexTokens)
{
m_currentTokens.m_next = new TokenizerShortBlock();
m_currentTokens = m_currentTokens.m_next;
m_indexTokens = 0;
}
m_countTokens++;
m_currentTokens.m_block[m_indexTokens++] = token;
}
internal void AddString( String str )
{
if (m_currentStrings.m_block.Length <= m_indexStrings)
{
m_currentStrings.m_next = new TokenizerStringBlock();
m_currentStrings = m_currentStrings.m_next;
m_indexStrings = 0;
}
m_currentStrings.m_block[m_indexStrings++] = str;
}
internal void Reset()
{
m_lastTokens = null;
m_currentTokens = m_headTokens;
m_currentStrings = m_headStrings;
m_indexTokens = 0;
m_indexStrings = 0;
#if _DEBUG
m_bLastWasCStr = false;
#endif
}
internal short GetNextFullToken()
{
if (m_currentTokens.m_block.Length <= m_indexTokens)
{
m_lastTokens = m_currentTokens;
m_currentTokens = m_currentTokens.m_next;
m_indexTokens = 0;
}
return m_currentTokens.m_block[m_indexTokens++];
}
internal short GetNextToken()
{
short retval = (short)(GetNextFullToken() & 0x00FF);
#if _DEBUG
Contract.Assert( !m_bLastWasCStr, "CStr token not followed by GetNextString()" );
m_bLastWasCStr = (retval == Tokenizer.cstr);
#endif
return retval;
}
internal String GetNextString()
{
if (m_currentStrings.m_block.Length <= m_indexStrings)
{
m_currentStrings = m_currentStrings.m_next;
m_indexStrings = 0;
}
#if _DEBUG
m_bLastWasCStr = false;
#endif
return m_currentStrings.m_block[m_indexStrings++];
}
internal void ThrowAwayNextString()
{
GetNextString();
}
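// OR tag bits into the most recently added token (by convention in
// the high byte, which GetNextToken masks off with 0x00FF;
// GetNextFullToken returns the token together with its tags)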
internal void TagLastToken( short tag )
{
if (m_indexTokens == 0)
m_lastTokens.m_block[m_lastTokens.m_block.Length-1] = (short)((ushort)m_lastTokens.m_block[m_lastTokens.m_block.Length-1] | (ushort)tag);
else
m_currentTokens.m_block[m_indexTokens-1] = (short)((ushort)m_currentTokens.m_block[m_indexTokens-1] | (ushort)tag);
}
internal int GetTokenCount()
{
return m_countTokens;
}
internal void GoToPosition( int position )
{
Reset();
for (int count = 0; count < position; ++count)
{
if (GetNextToken() == Tokenizer.cstr)
ThrowAwayNextString();
}
}
}
}