File: services\io\system\io\PatternMatcher.cs
Project: ndp\fx\src\System.csproj (System)
//------------------------------------------------------------------------------
// <copyright file="PatternMatcher.cs" company="Microsoft">
//     Copyright (c) Microsoft Corporation.  All rights reserved.
// </copyright>                                                                
//------------------------------------------------------------------------------
 
/*
 */
namespace System.IO {
    using System.Text;
 
    using System.Diagnostics;
 
    using System;
    using Microsoft.Win32;
    using System.Globalization;
 
    internal static class PatternMatcher {
        /// <devdoc>
        ///     Private constants (directly from C header files)
        /// </devdoc>
        private const int MATCHES_ARRAY_SIZE = 16;
        private const char ANSI_DOS_STAR = '>';
        private const char ANSI_DOS_QM = '<';
        private const char DOS_DOT = '"';
        
        /// <devdoc>
        ///     Tells whether a given name matches the expression given with a strict (i.e. UNIX like)
        ///     semantics.  This code is a port of unmanaged code.  Original code comment follows:
        ///
        ///    Routine Description:
        ///
        ///        This routine compares a Dbcs name and an expression and tells the caller
        ///        if the name is in the language defined by the expression.  The input name
        ///        cannot contain wildcards, while the expression may contain wildcards.
        ///
        ///        Expression wild cards are evaluated as shown in the nondeterministic
        ///        finite automatons below.  Note that ~* and ~? are DOS_STAR and DOS_QM.
        ///
        ///
        ///                 ~* is DOS_STAR, ~? is DOS_QM, and ~. is DOS_DOT
        ///
        ///
        ///                                           S
        ///                                        &lt;-----&lt;
        ///                                     X  |     |  e       Y
        ///                 X * Y ==       (0)-----&gt;-(1)-&gt;-----(2)-----(3)
        ///
        ///
        ///                                          S-.
        ///                                        &lt;-----&lt;
        ///                                     X  |     |  e       Y
        ///                 X ~* Y ==      (0)-----&gt;-(1)-&gt;-----(2)-----(3)
        ///
        ///
        ///
        ///                                    X     S     S     Y
        ///                 X ?? Y ==      (0)---(1)---(2)---(3)---(4)
        ///
        ///
        ///
        ///                                    X     .        .      Y
        ///                 X ~.~. Y ==    (0)---(1)----(2)------(3)---(4)
        ///                                       |      |________|
        ///                                       |           ^   |
        ///                                       |_______________|
        ///                                          ^EOF or .^
        ///
        ///
        ///                                    X     S-.     S-.     Y
        ///                 X ~?~? Y ==    (0)---(1)-----(2)-----(3)---(4)
        ///                                       |      |________|
        ///                                       |           ^   |
        ///                                       |_______________|
        ///                                          ^EOF or .^
        ///
        ///
        ///
        ///             where S is any single character
        ///
        ///                   S-. is any single character except the final .
        ///
        ///                   e is a null character transition
        ///
        ///                   EOF is the end of the name string
        ///
        ///        In words:
        ///
        ///            * matches 0 or more characters.
        ///
        ///            ? matches exactly 1 character.
        ///
        ///            DOS_STAR matches 0 or more characters until encountering and matching
        ///                the final . in the name.
        ///
        ///            DOS_QM matches any single character, or upon encountering a period or
        ///                end of name string, advances the expression to the end of the
        ///                set of contiguous DOS_QMs.
        ///
        ///            DOS_DOT matches either a . or zero characters beyond name string.
        ///
        ///    Arguments:
        ///
        ///        Expression - Supplies the input expression to check against
        ///
        ///        Name - Supplies the input name to check for.
        ///
        ///    Return Value:
        ///
        ///        BOOLEAN - TRUE if Name is an element in the set of strings denoted
        ///            by the input Expression and FALSE otherwise.
        ///
        /// </devdoc>
        public static bool StrictMatchPattern(string expression, string name) {
            int nameOffset;
            int exprOffset;
            int length;
 
            int srcCount;
            int destCount;
            int previousDestCount;
            int matchesCount;
 
            char nameChar = '\0';
            char exprChar = '\0';
 
            int[] previousMatches = new int[MATCHES_ARRAY_SIZE];
            int[] currentMatches = new int[MATCHES_ARRAY_SIZE];
 
            int maxState;
            int currentState;
 
            bool nameFinished = false;
 
            //
            //  The idea behind the algorithm is pretty simple.  We keep track of
            //  all possible locations in the regular expression that are matching
            //  the name.  If when the name has been exhausted one of the locations
            //  in the expression is also just exhausted, the name is in the language
            //  defined by the regular expression.
            //
 
            if (name == null || name.Length == 0 || expression == null || expression.Length == 0) {
                return false;
            }
 
            //
            //  Special case by far the most common wild card search of * or *.*
            //
 
            if (expression.Equals("*") || expression.Equals("*.*")) {
                return true;
            }
 
            // If this class is ever exposed for generic use,
            // we need to make sure that name doesn't contain wildcards. Currently 
            // the only component that calls this method is FileSystemWatcher and
            // it will never pass a name that contains a wildcard.
            
 
            //
            //  Also special case expressions of the form *X.  With this and the prior
            //  case we have covered virtually all normal queries.
            //
            if (expression[0] == '*' && expression.IndexOf('*', 1) == -1) {
                int rightLength = expression.Length - 1;
                // if name is shorter that the stuff to the right of * in expression, we don't
                // need to do the string compare, otherwise we compare rightlength characters
                // and the end of both strings.
                if (name.Length >= rightLength && String.Compare(expression, 1, name, name.Length - rightLength, rightLength, StringComparison.OrdinalIgnoreCase) == 0) {
                    return true;
                }
            }
 
            //
            //  Walk through the name string, picking off characters.  We go one
            //  character beyond the end because some wild cards are able to match
            //  zero characters beyond the end of the string.
            //
            //  With each new name character we determine a new set of states that
            //  match the name so far.  We use two arrays that we swap back and forth
            //  for this purpose.  One array lists the possible expression states for
            //  all name characters up to but not including the current one, and other
            //  array is used to build up the list of states considering the current
            //  name character as well.  The arrays are then switched and the process
            //  repeated.
            //
            //  There is not a one-to-one correspondence between state number and
            //  offset into the expression.  This is evident from the NFAs in the
            //  initial comment to this function.  State numbering is not continuous.
            //  This allows a simple conversion between state number and expression
            //  offset.  Each character in the expression can represent one or two
            //  states.  * and DOS_STAR generate two states: ExprOffset*2 and
            //  ExprOffset*2 + 1.  All other expreesion characters can produce only
            //  a single state.  Thus ExprOffset = State/2.
            //
            //
            //  Here is a short description of the variables involved:
            //
            //  NameOffset  - The offset of the current name char being processed.
            //
            //  ExprOffset  - The offset of the current expression char being processed.
            //
            //  SrcCount    - Prior match being investigated with current name char
            //
            //  DestCount   - Next location to put a matching assuming current name char
            //
            //  NameFinished - Allows one more itteration through the Matches array
            //                 after the name is exhusted (to come *s for example)
            //
            //  PreviousDestCount - This is used to prevent entry duplication, see coment
            //
            //  PreviousMatches   - Holds the previous set of matches (the Src array)
            //
            //  CurrentMatches    - Holds the current set of matches (the Dest array)
            //
            //  AuxBuffer, LocalBuffer - the storage for the Matches arrays
            //
 
            //
            //  Set up the initial variables
            //
 
            previousMatches[0] = 0;
            matchesCount = 1;
 
            nameOffset = 0;
            maxState = expression.Length * 2;
 
            while (! nameFinished) {
                if (nameOffset < name.Length) {
                    nameChar = name[nameOffset];
                    length = 1;
                    nameOffset++;
                }
                else {
                    nameFinished = true;
 
                    //
                    //  if we have already exhasted the expression, C#.  Don't
                    //  continue.
                    //
                    if (previousMatches[matchesCount - 1] == maxState) {
                        break;
                    }
                }
 
                //
                //  Now, for each of the previous stored expression matches, see what
                //  we can do with this name character.
                //
                srcCount = 0;
                destCount = 0;
                previousDestCount = 0;
 
                while (srcCount < matchesCount) {
 
                    //
                    //  We have to carry on our expression analysis as far as possible
                    //  for each character of name, so we loop here until the
                    //  expression stops matching.  A clue here is that expression
                    //  cases that can match zero or more characters end with a
                    //  continue, while those that can accept only a single character
                    //  end with a break.
                    //
                    exprOffset = ((previousMatches[srcCount++] + 1) / 2);
                    length = 0;
 
                    while (true) {
                        if ( exprOffset == expression.Length ) {
                            break;
                        }
 
                        //
                        //  The first time through the loop we don't want
                        //  to increment ExprOffset.
                        //
 
                        exprOffset += length;
 
                        currentState = exprOffset * 2;
 
                        if (exprOffset == expression.Length) {
                            currentMatches[destCount++] = maxState;
                            break;
                        }
 
                        exprChar = expression[exprOffset];
                        length = 1;
 
                        //
                        //  Before we get started, we have to check for something
                        //  really gross.  We may be about to exhaust the local
                        //  space for ExpressionMatches[][], so we have to allocate
                        //  some pool if this is the case.  Yuk!
                        //
 
                        if (destCount >= MATCHES_ARRAY_SIZE - 2) {
                            int newSize = currentMatches.Length * 2;
                            int [] tmp = new int[newSize];
                            Array.Copy(currentMatches, tmp, currentMatches.Length);
                            currentMatches = tmp;
 
                            tmp = new int[newSize];
                            Array.Copy(previousMatches, tmp, previousMatches.Length);
                            previousMatches = tmp;
                        }
 
                        //
                        //  * matches any character zero or more times.
                        //
 
                        if (exprChar == '*') {
                            currentMatches[destCount++] = currentState;
                            currentMatches[destCount++] = (currentState + 1);
                            continue;
                        }
 
                        //
                        //  DOS_STAR matches any character except . zero or more times.
                        //
 
                        if (exprChar == ANSI_DOS_STAR) {
                            bool iCanEatADot = false;
 
                            //
                            //  If we are at a period, determine if we are allowed to
                            //  consume it, ie. make sure it is not the last one.
                            //
                            if (!nameFinished && (nameChar == '.') ) {
                                char tmpChar;
                                int offset;
 
                                int nameLength = name.Length;
                                for (offset = nameOffset; offset < nameLength; offset ++ ) {
                                    tmpChar = name[offset];
                                    length = 1;
 
                                    if (tmpChar == '.') {
                                        iCanEatADot = true;
                                        break;
                                    }
                                }
                            }
 
                            if (nameFinished || (nameChar != '.') || iCanEatADot) {
                                currentMatches[destCount++] = currentState;
                                currentMatches[destCount++] = (currentState + 1);
                                continue;
                            } else {
 
                                //
                                //  We are at a period.  We can only match zero
                                //  characters (ie. the epsilon transition).
                                //
                                currentMatches[destCount++] = (currentState + 1);
                                continue;
                            }
                        }
 
                        //
                        //  The following expreesion characters all match by consuming
                        //  a character, thus force the expression, and thus state
                        //  forward.
                        //
                        currentState += length * 2;
 
                        //
                        //  DOS_QM is the most complicated.  If the name is finished,
                        //  we can match zero characters.  If this name is a '.', we
                        //  don't match, but look at the next expression.  Otherwise
                        //  we match a single character.
                        //
                        if (exprChar == ANSI_DOS_QM) {
 
                            if (nameFinished || (nameChar == '.') ) {
                                continue;
                            }
 
                            currentMatches[destCount++] = currentState;
                            break;
                        }
 
                        //
                        //  A DOS_DOT can match either a period, or zero characters
                        //  beyond the end of name.
                        //
                        if (exprChar == DOS_DOT) {
 
                            if (nameFinished) {
                                continue;
                            }
 
                            if (nameChar == '.') {
                                currentMatches[destCount++] = currentState;
                                break;
                            }
                        }
 
                        //
                        //  From this point on a name character is required to even
                        //  continue, let alone make a match.
                        //
                        if (nameFinished) {
                            break;
                        }
 
                        //
                        //  If this expression was a '?' we can match it once.
                        //
                        if (exprChar == '?') {
                            currentMatches[destCount++] = currentState;
                            break;
                        }
 
                        //
                        //  Finally, check if the expression char matches the name char
                        //
                        if (exprChar == nameChar) {
                            currentMatches[destCount++] = currentState;
                            break;
                        }
 
                        //
                        //  The expression didn't match so go look at the next
                        //  previous match.
                        //
 
                        break;
                    }
 
 
                    //
                    //  Prevent duplication in the destination array.
                    //
                    //  Each of the arrays is montonically increasing and non-
                    //  duplicating, thus we skip over any source element in the src
                    //  array if we just added the same element to the destination
                    //  array.  This guarentees non-duplication in the dest. array.
                    //
 
                    if ((srcCount < matchesCount) && (previousDestCount < destCount) ) {
                        while (previousDestCount < destCount) {
                            int previousLength = previousMatches.Length;
                            while ((srcCount < previousLength) && (previousMatches[srcCount] < currentMatches[previousDestCount])) {
                                srcCount += 1;
                            }
                            previousDestCount += 1;
                        }
                    }
                }
 
                //
                //  If we found no matches in the just finished itteration, it's time
                //  to bail.
                //
 
                if ( destCount == 0 ) {
                    return false;
                }
 
                //
                //  Swap the meaning the two arrays
                //
 
                {
                    int[] tmp;
 
                    tmp = previousMatches;
 
                    previousMatches = currentMatches;
 
                    currentMatches = tmp;
                }
 
                matchesCount = destCount;
            }
 
            currentState = previousMatches[matchesCount - 1];
 
            return currentState == maxState;
        }
    }
}