File: src\Framework\MS\Internal\WindowsRuntime\Windows\Data\Text\WordsSegmenter.cs
Project: wpf\PresentationFramework.csproj (PresentationFramework)
//-----------------------------------------------------------------------------------------------------
//
// File: WordsSegmenter.cs
//
// Description: Provides a class describing the WinRT type 
//                  Windows.Data.Text.WordsSegmenter. In addition to this, it also 
//                  defines a related delegate type used to support methods in this class.
//
//              WordsSegmenter is a class able to segment provided text into words.  
//              
//              The language supplied when these objects are constructed is matched against the 
//                  languages with word breakers on the system, and the best word segmentation rules 
//                  available are used. The language need not be one of the application's supported 
//                  languages. If there are no supported language rules available specifically for 
//                  that language, the language-neutral rules are used (an implementation of Unicode 
//                  Standard Annex #29 Unicode Text Segmentation), and the ResolvedLanguage property is 
//                  set to "und" (undetermined language).
//
//-----------------------------------------------------------------------------------------------------
 
using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Security;
using System.Security.Permissions;
 
 
namespace MS.Internal.WindowsRuntime
{
    namespace Windows.Data.Text
    {
        /// <summary>
        /// Reflection-based wrapper around the WinRT type Windows.Data.Text.WordsSegmenter,
        /// which segments text into words. The WinRT type is resolved by name at type
        /// initialization; when it (or its single-string constructor) cannot be found,
        /// instances cannot be created and the constructor throws
        /// <see cref="PlatformNotSupportedException"/>.
        /// </summary>
        internal class WordsSegmenter
        {
            /// <summary>
            /// Resolves the WinRT type and checks for a constructor taking a single string.
            /// Never throws: any failure simply marks the platform as unsupported.
            /// </summary>
            [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Design", "CA1031:DoNotCatchGeneralExceptionTypes")]
            static WordsSegmenter()
            {
                ConstructorInfo constructor = null;

                // We don't want to throw from a type initializer - so wrap in try..catch
                try
                {
                    s_WinRTType = Type.GetType(s_TypeName);
                    if (s_WinRTType != null)
                    {
                        constructor = s_WinRTType.GetConstructor(new Type[] { typeof(string) });
                    }
                }
                catch
                {
                    s_WinRTType = null;
                }

                s_PlatformSupported = (s_WinRTType != null) && (constructor != null);
            }

            /// <summary>
            /// Creates the underlying WinRT segmenter object for <paramref name="language"/>.
            /// </summary>
            /// <param name="language">Language tag passed to the WinRT constructor</param>
            /// <exception cref="System.ArgumentException">
            /// The WinRT constructor failed with an <see cref="ArgumentException"/>
            /// </exception>
            /// <exception cref="System.NotSupportedException">
            /// No WinRT object was produced for <paramref name="language"/>
            /// </exception>
            /// <exception cref="PlatformNotSupportedException">
            /// The WinRT type or its constructor could not be resolved
            /// </exception>
            private WordsSegmenter(string language)
            {
                if (!s_PlatformSupported)
                {
                    throw new PlatformNotSupportedException();
                }

                try
                {
                    _wordsSegmenter = s_WinRTType.ReflectionNew<string>(language);
                }
                catch (TargetInvocationException tiex) when (tiex.InnerException is ArgumentException)
                {
                    // Surface the reflection-wrapped failure as an argument error on our own parameter
                    throw new ArgumentException(string.Empty, nameof(language), tiex);
                }

                if (_wordsSegmenter == null)
                {
                    throw new NotSupportedException();
                }
            }

            /// <summary>
            /// Creates a <see cref="WordsSegmenter"/> for the given language.
            /// </summary>
            /// <param name="language">Language tag for which a segmenter is requested</param>
            /// <param name="shouldPreferNeutralSegmenter">
            /// When true, the language-neutral segmenter (<see cref="Undetermined"/>) is used
            /// unless the script of <paramref name="language"/> is one of those that require a
            /// dedicated word-breaker.
            /// </param>
            /// <returns>A new segmenter instance</returns>
            /// <exception cref="System.ArgumentException"><paramref name="language"/> is not a well-formed language identifier</exception>
            /// <exception cref="System.NotSupportedException"><paramref name="language"/> is not supported</exception>
            /// <exception cref="PlatformNotSupportedException">The OS platform is not supported</exception>
            public static WordsSegmenter Create(string language, bool shouldPreferNeutralSegmenter = false)
            {
                if (shouldPreferNeutralSegmenter && !ShouldUseDedicatedSegmenter(language))
                {
                    language = Undetermined;
                }

                return new WordsSegmenter(language);
            }

            /// <summary>
            /// Calls the WinRT GetTokenAt method and wraps the returned token.
            /// </summary>
            /// <param name="text">Text to segment</param>
            /// <param name="startIndex">Index into <paramref name="text"/> passed to the WinRT call</param>
            /// <returns>
            /// A <see cref="WordSegment"/> wrapping the WinRT token, or null when the
            /// WinRT call returns no token
            /// </returns>
            /// <exception cref="System.IndexOutOfRangeException">
            /// The WinRT call failed with HRESULT E_BOUNDS
            /// </exception>
            /// <exception cref="ArgumentNullException"><paramref name="text"/> is null</exception>
            public WordSegment GetTokenAt(string text, uint startIndex)
            {
                const int E_BOUNDS = unchecked((int)0x8000000B);

                if (text == null)
                {
                    throw new ArgumentNullException(nameof(text));
                }

                try
                {
                    object token = _wordsSegmenter.ReflectionCall<string, uint>("GetTokenAt", text, startIndex);

                    return (token != null) ? new WordSegment(token) : null;
                }
                catch (TargetInvocationException tex) when (tex.InnerException?.HResult == E_BOUNDS)
                {
                    // E_BOUNDS is not an HRESULT with an exception mapping defined
                    // at https://github.com/dotnet/coreclr/blob/master/src/vm/rexcep.h
                    // This results in a System.Exception being thrown with HRESULT == E_BOUNDS

                    throw new IndexOutOfRangeException(string.Empty, tex);
                }
            }

            /// <summary>
            /// Calls the WinRT GetTokens method and wraps every non-null token returned.
            /// </summary>
            /// <param name="text">Text to segment</param>
            /// <returns>
            /// A read-only list of <see cref="WordSegment"/> wrappers; empty when the
            /// WinRT call yields no tokens
            /// </returns>
            /// <exception cref="ArgumentNullException"><paramref name="text"/> is null</exception>
            public IReadOnlyList<WordSegment> GetTokens(string text)
            {
                if (text == null)
                {
                    throw new ArgumentNullException(nameof(text));
                }

                object tokens = _wordsSegmenter.ReflectionCall<string>("GetTokens", text);

                List<WordSegment> result = new List<WordSegment>();

                // A null result from the reflection call is treated as "no tokens"
                // rather than letting the enumeration fail with a NullReferenceException.
                if (tokens != null)
                {
                    foreach (var item in (IEnumerable)tokens)
                    {
                        if (item != null)
                        {
                            result.Add(new WordSegment(item));
                        }
                    }
                }

                return result.AsReadOnly();
            }

            /// <summary>
            /// Not implemented. 
            /// 
            /// Including a stub definition here for completeness because this interface
            /// exists in Windows.Data.Text.WordsSegmenter. 
            /// 
            /// Implementing this using reflection based calls is non-trivial due to the 
            /// need to pass the WinRT object a delegate. Given this is not used in
            /// spell-checking, we leave this unimplemented for now. 
            /// </summary>
            /// <param name="text"></param>
            /// <param name="startIndex"></param>
            /// <param name="handler"></param>
            /// <exception cref="NotImplementedException">Always throws</exception>
            public void Tokenize(string text, uint startIndex, WordSegmentsTokenizingHandler handler)
            {
                throw new NotImplementedException();
            }

            /// <summary>
            /// Value of the WinRT object's ResolvedLanguage property. It is read through
            /// reflection on first access and cached once a non-null value is obtained.
            /// </summary>
            public string ResolvedLanguage
            {
                get
                {
                    if (_resolvedLanguage == null)
                    {
                        _resolvedLanguage = _wordsSegmenter.ReflectionGetProperty<string>("ResolvedLanguage");
                    }

                    return _resolvedLanguage;
                }
            }

            /// <summary>
            /// The resolved WinRT type, or null when it could not be loaded.
            /// </summary>
            public static Type WinRTType
            {
                get
                {
                    return s_WinRTType;
                }
            }

            /// <summary>
            /// Decides whether <paramref name="languageTag"/> needs a language-specific
            /// word-breaker, based on the script of the language.
            /// </summary>
            /// <param name="languageTag">Language tag to inspect</param>
            /// <returns>
            /// true when the script is listed in <see cref="ScriptCodesRequiringDedicatedSegmenter"/>,
            /// or when the script could not be determined; false otherwise
            /// </returns>
            private static bool ShouldUseDedicatedSegmenter(string languageTag)
            {
                try
                {
                    var language = new Globalization.Language(languageTag);
                    string script = language.Script;

                    // Script codes are identifiers, not linguistic text - compare ordinally.
                    return ScriptCodesRequiringDedicatedSegmenter.Exists(
                        code => string.Equals(code, script, StringComparison.OrdinalIgnoreCase));
                }
                catch (Exception e)
                    when ((e is NotSupportedException)      ||
                          (e is ArgumentException)          ||
                          (e is TargetInvocationException)  ||
                          (e is MissingMemberException))
                {
                    // The script could not be determined - fall back to the default
                    // answer, which is to use the dedicated segmenter.
                    return true;
                }
            }

            /// <summary>
            /// List of ISO-15924 script codes of languages for which we must use 
            /// a dedicated word-breaker. These languages are broken using rules 
            /// different than those for "spaced" languages. 
            /// </summary>
            private static List<string> ScriptCodesRequiringDedicatedSegmenter { get; } = new List<string>
            {
                "Bopo", // Bopomofo
                "Brah", // Brahmi
                "Egyp", // Egyptian hieroglyphs
                "Goth", // Gothic
                "Hang", // Hangul (Hangul, Hangeul)
                "Hani", // Han (Hanzi, Kanji, Hanja)
                "Ital", // Old Italic (Etruscan, Oscan, etc.)
                "Java", // Javanese
                "Kana", // Katakana
                "Khar", // Kharoshthi
                "Laoo", // Lao
                "Lisu", // Lisu (Fraser)
                "Mymr", // Myanmar (Burmese)
                "Talu", // New Tai Lue
                "Thai", // Thai
                "Tibt", // Tibetan
                "Xsux", // Cuneiform, Sumero-Akkadian
                "Yiii", // Yi
            };


            /// <summary>
            /// Language tag used to request the language-neutral segmenter.
            /// </summary>
            public static readonly string Undetermined = "und";

            // Assembly-qualified name used to resolve the WinRT type via Type.GetType
            private static readonly string s_TypeName = "Windows.Data.Text.WordsSegmenter, Windows, ContentType=WindowsRuntime";

            // Both are assigned exactly once, by the static constructor
            private static readonly Type s_WinRTType;
            private static readonly bool s_PlatformSupported;

            // The underlying WinRT object; the constructor guarantees it is non-null
            private readonly object _wordsSegmenter;

            // Cache for ResolvedLanguage
            private string _resolvedLanguage = null;
        }
 
        #region Internal Delegates
 
        /// <summary>
        /// Delegate type accepted by <see cref="WordsSegmenter.Tokenize"/>.
        /// Currently unused: <see cref="WordsSegmenter.Tokenize"/> always throws
        /// <see cref="NotImplementedException"/>, so no handler is ever invoked.
        /// </summary>
        /// <param name="precedingWords">
        /// NOTE(review): presumably the word segments preceding the requested start
        /// position - confirm against the WinRT delegate of the same name.
        /// </param>
        /// <param name="words">
        /// NOTE(review): presumably the word segments at and after the requested start
        /// position - confirm against the WinRT delegate of the same name.
        /// </param>
        internal delegate void WordSegmentsTokenizingHandler(
            IEnumerable<WordSegment> precedingWords,
            IEnumerable<WordSegment> words);
 
        #endregion 
    }
}