|
//-----------------------------------------------------------------------
//
// Microsoft Windows Client Platform
// Copyright (c) Microsoft Corporation. All rights reserved.
//
// File: Classification.cs
//
// Contents: Unicode classification entry point
//
// Created: 7-14-2002 Tarek Mahmoud Sayed (Microsoft)
//
//------------------------------------------------------------------------
using System;
using System.Diagnostics;
using MS.Internal;
using System.Windows;
using System.Security;
using System.Collections;
using System.Runtime.InteropServices;
using System.Windows.Media.TextFormatting;
using MS.Internal.PresentationCore;
namespace MS.Internal
{
/// <summary>
/// This class is used as a level on indirection for classes in managed c++ to be able to utilize methods
/// from the static class Classification.
/// We cannot make MC++ reference PresentationCore.dll since this will result in cirular reference.
/// </summary>
internal class ClassificationUtility : MS.Internal.Text.TextInterface.IClassification
{
// We have restored this list from WPF 3.x.
// The original list can be found under
// $/Dev10/pu/WPF/wpf/src/Core/CSharp/MS/Internal/Shaping/Script.cs
internal static readonly bool[] ScriptCaretInfo = new bool[]
{
/* Default */ false,
/* Arabic */ false,
/* Armenian */ false,
/* Bengali */ true,
/* Bopomofo */ false,
/* Braille */ false,
/* Buginese */ true,
/* Buhid */ false,
/* CanadianSyllabics */ false,
/* Cherokee */ false,
/* CJKIdeographic */ false,
/* Coptic */ false,
/* CypriotSyllabary */ false,
/* Cyrillic */ false,
/* Deseret */ false,
/* Devanagari */ true,
/* Ethiopic */ false,
/* Georgian */ false,
/* Glagolitic */ false,
/* Gothic */ false,
/* Greek */ false,
/* Gujarati */ true,
/* Gurmukhi */ true,
/* Hangul */ true,
/* Hanunoo */ false,
/* Hebrew */ true,
/* Kannada */ true,
/* Kana */ false,
/* Kharoshthi */ true,
/* Khmer */ true,
/* Lao */ true,
/* Latin */ false,
/* Limbu */ true,
/* LinearB */ false,
/* Malayalam */ true,
/* MathematicalAlphanumericSymbols */ false,
/* Mongolian */ true,
/* MusicalSymbols */ false,
/* Myanmar */ true,
/* NewTaiLue */ true,
/* Ogham */ false,
/* OldItalic */ false,
/* OldPersianCuneiform */ false,
/* Oriya */ true,
/* Osmanya */ false,
/* Runic */ false,
/* Shavian */ false,
/* Sinhala */ true,
/* SylotiNagri */ true,
/* Syriac */ false,
/* Tagalog */ false,
/* Tagbanwa */ false,
/* TaiLe */ false,
/* Tamil */ true,
/* Telugu */ true,
/* Thaana */ true,
/* Thai */ true,
/* Tibetan */ true,
/* Tifinagh */ false,
/* UgariticCuneiform */ false,
/* Yi */ false,
/* Digit */ false,
/* Control */ false,
/* Mirror */ false,
};
static private ClassificationUtility _classificationUtilityInstance = new ClassificationUtility();
static internal ClassificationUtility Instance
{
get
{
return _classificationUtilityInstance;
}
}
public void GetCharAttribute(
int unicodeScalar,
out bool isCombining,
out bool needsCaretInfo,
out bool isIndic,
out bool isDigit,
out bool isLatin,
out bool isStrong
)
{
CharacterAttribute charAttribute = Classification.CharAttributeOf((int)Classification.GetUnicodeClass(unicodeScalar));
byte itemClass = charAttribute.ItemClass;
isCombining = (itemClass == (byte)ItemClass.SimpleMarkClass
|| itemClass == (byte)ItemClass.ComplexMarkClass
|| Classification.IsIVS(unicodeScalar));
isStrong = (itemClass == (byte)ItemClass.StrongClass);
int script = charAttribute.Script;
needsCaretInfo = ScriptCaretInfo[script];
ScriptID scriptId = (ScriptID)script;
isDigit = scriptId == ScriptID.Digit;
isLatin = scriptId == ScriptID.Latin;
if (isLatin)
{
isIndic = false;
}
else
{
isIndic = IsScriptIndic(scriptId);
}
}
/// <summary>
/// Returns true if specified script is Indic.
/// </summary>
private static bool IsScriptIndic(ScriptID scriptId)
{
if (scriptId == ScriptID.Bengali
|| scriptId == ScriptID.Devanagari
|| scriptId == ScriptID.Gurmukhi
|| scriptId == ScriptID.Gujarati
|| scriptId == ScriptID.Kannada
|| scriptId == ScriptID.Malayalam
|| scriptId == ScriptID.Oriya
|| scriptId == ScriptID.Tamil
|| scriptId == ScriptID.Telugu)
{
return true;
}
else
{
return false;
}
}
}
/// <summary>
/// Hold the classification table pointers.
/// </summary>
internal static class Classification
{
/// <summary>
/// This structure has a cloned one in the unmanaged side. Doing any change in this
/// structure should have the same change on unmanaged side too.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
internal unsafe struct CombiningMarksClassificationData
{
internal IntPtr CombiningCharsIndexes; // Two dimentional array of base char classes,
internal int CombiningCharsIndexesTableLength;
internal int CombiningCharsIndexesTableSegmentLength;
internal IntPtr CombiningMarkIndexes; // Combining mark classes array, with length = length
internal int CombiningMarkIndexesTableLength;
internal IntPtr CombinationChars; // Two dimentional array of combined characters
internal int CombinationCharsBaseCount;
internal int CombinationCharsMarkCount;
}
/// <summary>
/// This structure has a cloned one in the unmanaged side. doing any change in that
/// structure should have same change in the unmanaged side too.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
internal unsafe struct RawClassificationTables
{
internal IntPtr UnicodeClasses;
internal IntPtr CharacterAttributes;
internal IntPtr Mirroring;
internal CombiningMarksClassificationData CombiningMarksClassification;
};
///<SecurityNote>
/// Critical - as this code performs an elevation.
///</SecurityNote>
[SecurityCritical]
[SuppressUnmanagedCodeSecurity]
[DllImport(DllImport.PresentationNative, EntryPoint="MILGetClassificationTables")]
internal static extern void MILGetClassificationTables(out RawClassificationTables ct);
/// <SecurityNote>
/// Critical: This accesses unsafe code and retrieves pointers that it stores locally
/// The pointers retrieved are not validated for correctness and they are later dereferenced.
/// TreatAsSafe: The constructor is safe since it simply stores these pointers. The risk here
/// in the future is not of these pointers being spoofed since they are not settable from outside.
/// </SecurityNote>
[SecurityCritical,SecurityTreatAsSafe]
static Classification()
{
unsafe
{
RawClassificationTables ct = new RawClassificationTables();
MILGetClassificationTables(out ct);
_unicodeClassTable = new SecurityCriticalData<IntPtr>(ct.UnicodeClasses);
_charAttributeTable = new SecurityCriticalData<IntPtr>(ct.CharacterAttributes);
_mirroredCharTable = new SecurityCriticalData<IntPtr>(ct.Mirroring);
_combiningMarksClassification = new SecurityCriticalData<CombiningMarksClassificationData>(ct.CombiningMarksClassification);
}
}
/// <summary>
/// Lookup Unicode character class for a Unicode UTF16 value
/// </summary>
/// <SecurityNote>
/// Critical: This accesses unsafe code and dereferences a location in
/// a prepopulated Array. The risk is you might derefence a bogus memory
/// location.
/// TreatAsSafe: This code is ok since it reduces codepoint to one of 256 possible
/// values and will always succeed. Also this information is ok to expose.
/// </SecurityNote>
[SecurityCritical, SecurityTreatAsSafe]
static public short GetUnicodeClassUTF16(char codepoint)
{
unsafe
{
short **plane0 = UnicodeClassTable[0];
Invariant.Assert((long)plane0 >= (long)UnicodeClass.Max);
short* pcc = plane0[codepoint >> 8];
return ((long) pcc < (long) UnicodeClass.Max ?
(short)pcc : pcc[codepoint & 0xFF]);
}
}
/// <summary>
/// Lookup Unicode character class for a Unicode scalar value
/// </summary>
/// <SecurityNote>
/// Critical: This accesses unsafe code and derefences a pointer retrieved from unmanaged code
/// TreatAsSafe: There is bounds checking in place and this dereferences a valid structure which
/// is guaranteed to be populated
/// </SecurityNote>
[SecurityCritical,SecurityTreatAsSafe]
static public short GetUnicodeClass(int unicodeScalar)
{
unsafe
{
Invariant.Assert(unicodeScalar >= 0 && unicodeScalar <= 0x10FFFF);
short **ppcc = UnicodeClassTable[((unicodeScalar >> 16) & 0xFF) % 17];
if ((long)ppcc < (long)UnicodeClass.Max)
return (short)ppcc;
short *pcc = ppcc[(unicodeScalar & 0xFFFF) >> 8];
if ((long)pcc < (long)UnicodeClass.Max)
return (short)pcc;
return pcc[unicodeScalar & 0xFF];
}
}
/// <summary>
/// Compute Unicode scalar value from unicode codepoint stream
/// </summary>
static internal int UnicodeScalar(
CharacterBufferRange unicodeString,
out int sizeofChar
)
{
Invariant.Assert(unicodeString.CharacterBuffer != null && unicodeString.Length > 0);
int ch = unicodeString[0];
sizeofChar = 1;
if ( unicodeString.Length >= 2
&& (ch & 0xFC00) == 0xD800
&& (unicodeString[1] & 0xFC00) == 0xDC00
)
{
ch = (((ch & 0x03FF) << 10) | (unicodeString[1] & 0x3FF)) + 0x10000;
sizeofChar++;
}
return ch;
}
/// <summary>
/// Check whether the character is combining mark
/// </summary>
/// <SecurityNote>
/// Critical: This code acceses a function call that returns a pointer (get_CharAttributeTable).
/// It trusts the value passed in to derfence the table with no implicit bounds or validity checks.
/// TreatAsSafe: This information is safe to expose at the same time the unicodeScalar passed in
/// is validated for bounds
/// </SecurityNote>
[SecurityCritical,SecurityTreatAsSafe]
static public bool IsCombining(int unicodeScalar)
{
unsafe
{
byte itemClass = Classification.CharAttributeTable[GetUnicodeClass(unicodeScalar)].ItemClass;
return itemClass == (byte)ItemClass.SimpleMarkClass
|| itemClass == (byte)ItemClass.ComplexMarkClass
|| IsIVS(unicodeScalar);
}
}
/// <summary>
/// Check whether the character is a joiner character
/// </summary>
/// <SecurityNote>
/// Critical: This code acceses a function call that returns a pointer (get_CharAttributeTable).
/// It trusts the value passed in to derfence the table with no implicit bounds or validity checks.
/// TreatAsSafe: This information is safe to expose at the same time the unicodeScalar passed in
/// is validated for bounds
/// </SecurityNote>
[SecurityCritical,SecurityTreatAsSafe]
static public bool IsJoiner(int unicodeScalar)
{
unsafe
{
byte itemClass = Classification.CharAttributeTable[GetUnicodeClass(unicodeScalar)].ItemClass;
return itemClass == (byte) ItemClass.JoinerClass;
}
}
/// <summary>
/// Check whether the character is an IVS selector character
/// </summary>
static public bool IsIVS(int unicodeScalar)
{
// An Ideographic Variation Sequence (IVS) is a sequence of two
// coded characters, the first being a character with the
// Unified_Ideograph property, the second being a variation
// selector character in the range U+E0100 to U+E01EF.
return unicodeScalar >= 0xE0100 && unicodeScalar <= 0xE01EF;
}
/// <summary>
/// Scan UTF16 character string until a character with specified attributes is found
/// </summary>
/// <returns>character index of first character matching the attribute.</returns>
/// <SecurityNote>
/// Critical: This code acceses a function call that returns a pointer (get_CharAttributeTable).
/// It keeps accesing a buffer with no validation in terms of the variables passed in.
/// TreatAsSafe: This information is safe to expose, as in the worst case it tells you information
/// of where the next UTF16 character is. Also the constructor for characterbuffer can be one of three
/// a string, a char array or an unmanaged char*. The third case is critical and tightly controlled
/// so the risk of bogus length is significantly mitigated.
/// </SecurityNote>
[SecurityCritical,SecurityTreatAsSafe]
static public int AdvanceUntilUTF16(
CharacterBuffer charBuffer,
int offsetToFirstChar,
int stringLength,
ushort mask,
out ushort charFlags
)
{
int i = offsetToFirstChar;
int limit = offsetToFirstChar + stringLength;
charFlags = 0;
while (i < limit)
{
unsafe
{
ushort flags = (ushort)Classification.CharAttributeTable[(int)GetUnicodeClassUTF16(charBuffer[i])].Flags;
if((flags & mask) != 0)
break;
charFlags |= flags;
}
i++;
}
return i - offsetToFirstChar;
}
/// <summary>
/// Scan character string until a character that is not the specified ItemClass is found
/// </summary>
/// <returns>character index of first character that is not the specified ItemClass</returns>
/// <SecurityNote>
/// Critical: This code acceses a function call that returns a pointer (get_CharAttributeTable). It acceses
/// elements in an array with no type checking.
/// TreatAsSafe: This code exposes the index of the next non UTF16 character in a run. This is ok to expose
/// Also the calls to CharBuffer and CahrAttribute do the requisite bounds checking.
/// </SecurityNote>
[SecurityCritical,SecurityTreatAsSafe]
static public int AdvanceWhile(
CharacterBufferRange unicodeString,
ItemClass itemClass
)
{
int i = 0;
int limit = unicodeString.Length;
int sizeofChar = 0;
while (i < limit)
{
int ch = Classification.UnicodeScalar(
new CharacterBufferRange(unicodeString, i, limit - i),
out sizeofChar
);
unsafe
{
byte currentClass = (byte) Classification.CharAttributeTable[(int)GetUnicodeClass(ch)].ItemClass;
if (currentClass != (byte) itemClass)
break;
}
i += sizeofChar;
}
return i;
}
/// <SecurityNote>
/// Critical: This accesses unsafe code and returns a pointer
/// </SecurityNote>
private static unsafe short*** UnicodeClassTable
{
[SecurityCritical]
get { return (short***)_unicodeClassTable.Value; }
}
/// <SecurityNote>
/// Critical: This accesses unsafe code and returns a pointer
/// </SecurityNote>
private static unsafe CharacterAttribute* CharAttributeTable
{
[SecurityCritical]
get { return (CharacterAttribute*)_charAttributeTable.Value; }
}
/// <SecurityNote>
/// Critical: This accesses unsafe code and indexes into an array
/// Safe : This method does bound check on the input char class.
/// </SecurityNote>
[SecurityCritical, SecurityTreatAsSafe]
internal static CharacterAttribute CharAttributeOf(int charClass)
{
unsafe
{
Invariant.Assert(charClass >= 0 && charClass < (int) UnicodeClass.Max);
return CharAttributeTable[charClass];
}
}
static private readonly SecurityCriticalData<IntPtr> _unicodeClassTable;
static private readonly SecurityCriticalData<IntPtr> _charAttributeTable;
static private readonly SecurityCriticalData<IntPtr> _mirroredCharTable;
static private readonly SecurityCriticalData<CombiningMarksClassificationData> _combiningMarksClassification;
}
}
|