|
// ==++==
//
// Copyright (c) Microsoft Corporation. All rights reserved.
//
// ==--==
// The worker functions in this file was optimized for performance. If you make changes
// you should use care to consider all of the interesting cases.
// The code of all worker functions in this file is written twice: Once as as a slow loop, and the
// second time as a fast loop. The slow loops handles all special cases, throws exceptions, etc.
// The fast loops attempts to blaze through as fast as possible with optimistic range checks,
// processing multiple characters at a time, and falling back to the slow loop for all special cases.
// This define can be used to turn off the fast loops. Useful for finding whether
// the problem is fastloop-specific.
#define FASTLOOP
namespace System.Text
{
using System;
using System.Globalization;
using System.Runtime.Serialization;
using System.Security.Permissions;
using System.Diagnostics.Contracts;
// Encodes text into and out of UTF-8. UTF-8 is a way of writing
// Unicode characters with variable numbers of bytes per character,
// optimized for the lower 127 ASCII characters. It's an efficient way
// of encoding US English in an internationalizable way.
//
// Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused.
//
// The UTF-8 byte order mark is simply the Unicode byte order mark
// (0xFEFF) written in UTF-8 (0xEF 0xBB 0xBF). The byte order mark is
// used mostly to distinguish UTF-8 text from other encodings, and doesn't
// switch the byte orderings.
[Serializable]
[System.Runtime.InteropServices.ComVisible(true)]
public class UTF8Encoding : Encoding
{
/*
bytes bits UTF-8 representation
----- ---- -----------------------------------
1 7 0vvvvvvv
2 11 110vvvvv 10vvvvvv
3 16 1110vvvv 10vvvvvv 10vvvvvv
4 21 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
----- ---- -----------------------------------
Surrogate:
Real Unicode value = (HighSurrogate - 0xD800) * 0x400 + (LowSurrogate - 0xDC00) + 0x10000
*/
private const int UTF8_CODEPAGE=65001;
// Allow for de-virtualization
internal sealed class UTF8EncodingSealed : UTF8Encoding
{
public UTF8EncodingSealed(bool encoderShouldEmitUTF8Identifier) : base(encoderShouldEmitUTF8Identifier) { }
}
// Yes, the idea of emitting U+FEFF as a UTF-8 identifier has made it into
// the standard.
private bool emitUTF8Identifier = false;
private bool isThrowException = false;
public UTF8Encoding(): this(false)
{
}
public UTF8Encoding(bool encoderShouldEmitUTF8Identifier):
this(encoderShouldEmitUTF8Identifier, false)
{
}
public UTF8Encoding(bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes):
base(UTF8_CODEPAGE)
{
this.emitUTF8Identifier = encoderShouldEmitUTF8Identifier;
this.isThrowException = throwOnInvalidBytes;
// Encoding's constructor already did this, but it'll be wrong if we're throwing exceptions
if (this.isThrowException)
SetDefaultFallbacks();
}
internal override void SetDefaultFallbacks()
{
// For UTF-X encodings, we use a replacement fallback with an empty string
if (this.isThrowException)
{
this.encoderFallback = EncoderFallback.ExceptionFallback;
this.decoderFallback = DecoderFallback.ExceptionFallback;
}
else
{
this.encoderFallback = new EncoderReplacementFallback("\xFFFD");
this.decoderFallback = new DecoderReplacementFallback("\xFFFD");
}
}
//
// WARNING: GetByteCount(string chars)
// WARNING: has different variable names than EncodingNLS.cs, so this can't just be cut & pasted,
// WARNING: otherwise it'll break VB's way of declaring these.
//
// The following methods are copied from EncodingNLS.cs.
// Unfortunately EncodingNLS.cs is internal and we're public, so we have to reimpliment them here.
// These should be kept in sync for the following classes:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
//
// Returns the number of bytes required to encode a range of characters in
// a character array.
//
// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
// So if you fix this, fix the others. Currently those include:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// parent method is safe
[System.Security.SecuritySafeCritical] // auto-generated
public override unsafe int GetByteCount(char[] chars, int index, int count)
{
// Validate input parameters
if (chars == null)
throw new ArgumentNullException("chars",
Environment.GetResourceString("ArgumentNull_Array"));
if (index < 0 || count < 0)
throw new ArgumentOutOfRangeException((index<0 ? "index" : "count"),
Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
if (chars.Length - index < count)
throw new ArgumentOutOfRangeException("chars",
Environment.GetResourceString("ArgumentOutOfRange_IndexCountBuffer"));
Contract.EndContractBlock();
// If no input, return 0, avoid fixed empty array problem
if (chars.Length == 0)
return 0;
// Just call the pointer version
fixed (char* pChars = chars)
return GetByteCount(pChars + index, count, null);
}
// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
// So if you fix this, fix the others. Currently those include:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// parent method is safe
[System.Security.SecuritySafeCritical] // auto-generated
public override unsafe int GetByteCount(String chars)
{
// Validate input
if (chars==null)
throw new ArgumentNullException("s");
Contract.EndContractBlock();
fixed (char* pChars = chars)
return GetByteCount(pChars, chars.Length, null);
}
// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
// So if you fix this, fix the others. Currently those include:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
[System.Security.SecurityCritical] // auto-generated
[CLSCompliant(false)]
[System.Runtime.InteropServices.ComVisible(false)]
public override unsafe int GetByteCount(char* chars, int count)
{
// Validate Parameters
if (chars == null)
throw new ArgumentNullException("chars",
Environment.GetResourceString("ArgumentNull_Array"));
if (count < 0)
throw new ArgumentOutOfRangeException("count",
Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
Contract.EndContractBlock();
// Call it with empty encoder
return GetByteCount(chars, count, null);
}
// Parent method is safe.
// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
// So if you fix this, fix the others. Currently those include:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
[System.Security.SecuritySafeCritical] // auto-generated
public override unsafe int GetBytes(String s, int charIndex, int charCount,
byte[] bytes, int byteIndex)
{
if (s == null || bytes == null)
throw new ArgumentNullException((s == null ? "s" : "bytes"),
Environment.GetResourceString("ArgumentNull_Array"));
if (charIndex < 0 || charCount < 0)
throw new ArgumentOutOfRangeException((charIndex<0 ? "charIndex" : "charCount"),
Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
if (s.Length - charIndex < charCount)
throw new ArgumentOutOfRangeException("s",
Environment.GetResourceString("ArgumentOutOfRange_IndexCount"));
if (byteIndex < 0 || byteIndex > bytes.Length)
throw new ArgumentOutOfRangeException("byteIndex",
Environment.GetResourceString("ArgumentOutOfRange_Index"));
Contract.EndContractBlock();
int byteCount = bytes.Length - byteIndex;
// Fixed doesn't like 0 length arrays.
if (bytes.Length == 0)
bytes = new byte[1];
fixed (char* pChars = s)
fixed ( byte* pBytes = bytes)
return GetBytes(pChars + charIndex, charCount,
pBytes + byteIndex, byteCount, null);
}
// Encodes a range of characters in a character array into a range of bytes
// in a byte array. An exception occurs if the byte array is not large
// enough to hold the complete encoding of the characters. The
// GetByteCount method can be used to determine the exact number of
// bytes that will be produced for a given range of characters.
// Alternatively, the GetMaxByteCount method can be used to
// determine the maximum number of bytes that will be produced for a given
// number of characters, regardless of the actual character values.
//
// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
// So if you fix this, fix the others. Currently those include:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// parent method is safe
[System.Security.SecuritySafeCritical] // auto-generated
public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
byte[] bytes, int byteIndex)
{
// Validate parameters
if (chars == null || bytes == null)
throw new ArgumentNullException((chars == null ? "chars" : "bytes"),
Environment.GetResourceString("ArgumentNull_Array"));
if (charIndex < 0 || charCount < 0)
throw new ArgumentOutOfRangeException((charIndex<0 ? "charIndex" : "charCount"),
Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
if (chars.Length - charIndex < charCount)
throw new ArgumentOutOfRangeException("chars",
Environment.GetResourceString("ArgumentOutOfRange_IndexCountBuffer"));
if (byteIndex < 0 || byteIndex > bytes.Length)
throw new ArgumentOutOfRangeException("byteIndex",
Environment.GetResourceString("ArgumentOutOfRange_Index"));
Contract.EndContractBlock();
// If nothing to encode return 0, avoid fixed problem
if (chars.Length == 0)
return 0;
// Just call pointer version
int byteCount = bytes.Length - byteIndex;
// Fixed doesn't like 0 length arrays.
if (bytes.Length == 0)
bytes = new byte[1];
fixed (char* pChars = chars)
fixed (byte* pBytes = bytes)
// Remember that byteCount is # to decode, not size of array.
return GetBytes(pChars + charIndex, charCount,
pBytes + byteIndex, byteCount, null);
}
// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
// So if you fix this, fix the others. Currently those include:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
[System.Security.SecurityCritical] // auto-generated
[CLSCompliant(false)]
[System.Runtime.InteropServices.ComVisible(false)]
public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
{
// Validate Parameters
if (bytes == null || chars == null)
throw new ArgumentNullException(bytes == null ? "bytes" : "chars",
Environment.GetResourceString("ArgumentNull_Array"));
if (charCount < 0 || byteCount < 0)
throw new ArgumentOutOfRangeException((charCount<0 ? "charCount" : "byteCount"),
Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
Contract.EndContractBlock();
return GetBytes(chars, charCount, bytes, byteCount, null);
}
// Returns the number of characters produced by decoding a range of bytes
// in a byte array.
//
// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
// So if you fix this, fix the others. Currently those include:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// parent method is safe
[System.Security.SecuritySafeCritical] // auto-generated
public override unsafe int GetCharCount(byte[] bytes, int index, int count)
{
// Validate Parameters
if (bytes == null)
throw new ArgumentNullException("bytes",
Environment.GetResourceString("ArgumentNull_Array"));
if (index < 0 || count < 0)
throw new ArgumentOutOfRangeException((index<0 ? "index" : "count"),
Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
if (bytes.Length - index < count)
throw new ArgumentOutOfRangeException("bytes",
Environment.GetResourceString("ArgumentOutOfRange_IndexCountBuffer"));
Contract.EndContractBlock();
// If no input just return 0, fixed doesn't like 0 length arrays.
if (bytes.Length == 0)
return 0;
// Just call pointer version
fixed (byte* pBytes = bytes)
return GetCharCount(pBytes + index, count, null);
}
// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
// So if you fix this, fix the others. Currently those include:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
[System.Security.SecurityCritical] // auto-generated
[CLSCompliant(false)]
[System.Runtime.InteropServices.ComVisible(false)]
public override unsafe int GetCharCount(byte* bytes, int count)
{
// Validate Parameters
if (bytes == null)
throw new ArgumentNullException("bytes",
Environment.GetResourceString("ArgumentNull_Array"));
if (count < 0)
throw new ArgumentOutOfRangeException("count",
Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
Contract.EndContractBlock();
return GetCharCount(bytes, count, null);
}
// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
// So if you fix this, fix the others. Currently those include:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// parent method is safe
[System.Security.SecuritySafeCritical] // auto-generated
public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
char[] chars, int charIndex)
{
// Validate Parameters
if (bytes == null || chars == null)
throw new ArgumentNullException(bytes == null ? "bytes" : "chars",
Environment.GetResourceString("ArgumentNull_Array"));
if (byteIndex < 0 || byteCount < 0)
throw new ArgumentOutOfRangeException((byteIndex<0 ? "byteIndex" : "byteCount"),
Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
if ( bytes.Length - byteIndex < byteCount)
throw new ArgumentOutOfRangeException("bytes",
Environment.GetResourceString("ArgumentOutOfRange_IndexCountBuffer"));
if (charIndex < 0 || charIndex > chars.Length)
throw new ArgumentOutOfRangeException("charIndex",
Environment.GetResourceString("ArgumentOutOfRange_Index"));
Contract.EndContractBlock();
// If no input, return 0 & avoid fixed problem
if (bytes.Length == 0)
return 0;
// Just call pointer version
int charCount = chars.Length - charIndex;
// Fixed doesn't like 0 length arrays.
if (chars.Length == 0)
chars = new char[1];
fixed (byte* pBytes = bytes)
fixed (char* pChars = chars)
// Remember that charCount is # to decode, not size of array
return GetChars(pBytes + byteIndex, byteCount,
pChars + charIndex, charCount, null);
}
// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
// So if you fix this, fix the others. Currently those include:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
[System.Security.SecurityCritical] // auto-generated
[CLSCompliant(false)]
[System.Runtime.InteropServices.ComVisible(false)]
public unsafe override int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
{
// Validate Parameters
if (bytes == null || chars == null)
throw new ArgumentNullException(bytes == null ? "bytes" : "chars",
Environment.GetResourceString("ArgumentNull_Array"));
if (charCount < 0 || byteCount < 0)
throw new ArgumentOutOfRangeException((charCount<0 ? "charCount" : "byteCount"),
Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
Contract.EndContractBlock();
return GetChars(bytes, byteCount, chars, charCount, null);
}
// Returns a string containing the decoded representation of a range of
// bytes in a byte array.
//
// All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
// So if you fix this, fix the others. Currently those include:
// EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// parent method is safe
[System.Security.SecuritySafeCritical] // auto-generated
[System.Runtime.InteropServices.ComVisible(false)]
public override unsafe String GetString(byte[] bytes, int index, int count)
{
// Validate Parameters
if (bytes == null)
throw new ArgumentNullException("bytes",
Environment.GetResourceString("ArgumentNull_Array"));
if (index < 0 || count < 0)
throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"),
Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
if (bytes.Length - index < count)
throw new ArgumentOutOfRangeException("bytes",
Environment.GetResourceString("ArgumentOutOfRange_IndexCountBuffer"));
Contract.EndContractBlock();
// Avoid problems with empty input buffer
if (bytes.Length == 0) return String.Empty;
fixed (byte* pBytes = bytes)
return String.CreateStringFromEncoding(
pBytes + index, count, this);
}
//
// End of standard methods copied from EncodingNLS.cs
//
// To simplify maintenance, the structure of GetByteCount and GetBytes should be
// kept the same as much as possible
[System.Security.SecurityCritical] // auto-generated
internal override unsafe int GetByteCount(char *chars, int count, EncoderNLS baseEncoder)
{
// For fallback we may need a fallback buffer.
// We wait to initialize it though in case we don't have any broken input unicode
EncoderFallbackBuffer fallbackBuffer = null;
char *pSrc = chars;
char *pEnd = pSrc+count;
// Start by assuming we have as many as count
int byteCount = count;
int ch = 0;
if (baseEncoder != null) {
UTF8Encoder encoder = (UTF8Encoder)baseEncoder;
ch = encoder.surrogateChar;
// We mustn't have left over fallback data when counting
if (encoder.InternalHasFallbackBuffer)
{
fallbackBuffer = encoder.FallbackBuffer;
if (fallbackBuffer.Remaining > 0)
throw new ArgumentException(Environment.GetResourceString("Argument_EncoderFallbackNotEmpty",
this.EncodingName, encoder.Fallback.GetType()));
// Set our internal fallback interesting things.
fallbackBuffer.InternalInitialize(chars, pEnd, encoder, false);
}
}
for (;;) {
// SLOWLOOP: does all range checks, handles all special cases, but it is slow
if (pSrc >= pEnd) {
if (ch == 0) {
// Unroll any fallback that happens at the end
ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
if (ch > 0) {
byteCount++;
goto ProcessChar;
}
} else {
// Case of surrogates in the fallback.
if (fallbackBuffer != null && fallbackBuffer.bFallingBack) {
Contract.Assert(ch >= 0xD800 && ch <= 0xDBFF,
"[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
ch = fallbackBuffer.InternalGetNextChar();
byteCount++;
if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) {
ch = 0xfffd;
byteCount++;
goto EncodeChar;
} else if (ch > 0){
goto ProcessChar;
} else {
byteCount--; // ignore last one.
break;
}
}
}
if (ch <= 0) {
break;
}
if (baseEncoder != null && !baseEncoder.MustFlush) {
break;
}
// attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1.
byteCount++;
goto EncodeChar;
}
if (ch > 0) {
Contract.Assert(ch >= 0xD800 && ch <= 0xDBFF,
"[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
// use separate helper variables for local contexts so that the jit optimizations
// won't get confused about the variable lifetimes
int cha = *pSrc;
// count the pending surrogate
byteCount++;
// In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
// if (IsLowSurrogate(cha)) {
if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) {
// Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do.
ch = 0xfffd;
// ch = cha + (ch << 10) +
// (0x10000
// - CharUnicodeInfo.LOW_SURROGATE_START
// - (CharUnicodeInfo.HIGH_SURROGATE_START << 10) );
// Use this next char
pSrc++;
}
// else ch is still high surrogate and encoding will fail (so don't add count)
// attempt to encode the surrogate or partial surrogate
goto EncodeChar;
}
// If we've used a fallback, then we have to check for it
if (fallbackBuffer != null)
{
ch = fallbackBuffer.InternalGetNextChar();
if (ch > 0)
{
// We have an extra byte we weren't expecting.
byteCount++;
goto ProcessChar;
}
}
// read next char. The JIT optimization seems to be getting confused when
// compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
ch = *pSrc;
pSrc++;
ProcessChar:
// if (IsHighSurrogate(ch)) {
if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END)) {
// we will count this surrogate next time around
byteCount--;
continue;
}
// either good char or partial surrogate
EncodeChar:
// throw exception on partial surrogate if necessary
// if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
{
// Lone surrogates aren't allowed
// Have to make a fallback buffer if we don't have one
if (fallbackBuffer == null)
{
// wait on fallbacks if we can
// For fallback we may need a fallback buffer
if (baseEncoder == null)
fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
else
fallbackBuffer = baseEncoder.FallbackBuffer;
// Set our internal fallback interesting things.
fallbackBuffer.InternalInitialize(chars, chars + count, baseEncoder, false);
}
// Do our fallback. Actually we already know its a mixed up surrogate,
// so the ref pSrc isn't gonna do anything.
fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrc);
// Ignore it if we don't throw (we had preallocated this ch)
byteCount--;
ch = 0;
continue;
}
// Count them
if (ch > 0x7F) {
if (ch > 0x7FF) {
// the extra surrogate byte was compensated by the second surrogate character
// (2 surrogates make 4 bytes. We've already counted 2 bytes, 1 per char)
byteCount++;
}
byteCount++;
}
#if WIN64
// check for overflow
if (byteCount < 0) {
break;
}
#endif
#if FASTLOOP
// If still have fallback don't do fast loop
if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
{
// We're reserving 1 byte for each char by default
byteCount++;
goto ProcessChar;
}
int availableChars = PtrDiff(pEnd, pSrc);
// don't fall into the fast decoding loop if we don't have enough characters
if (availableChars <= 13) {
// try to get over the remainder of the ascii characters fast though
char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
while (pSrc < pLocalEnd) {
ch = *pSrc;
pSrc++;
if (ch > 0x7F)
goto ProcessChar;
}
// we are done
break;
}
#if WIN64
// make sure that we won't get a silent overflow inside the fast loop
// (Fall out to slow loop if we have this many characters)
availableChars &= 0x0FFFFFFF;
#endif
// To compute the upper bound, assume that all characters are ASCII characters at this point,
// the boundary will be decreased for every non-ASCII character we encounter
// Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
char *pStop = pSrc + availableChars - (3 + 4);
while (pSrc < pStop) {
ch = *pSrc;
pSrc++;
if (ch > 0x7F) // Not ASCII
{
if (ch > 0x7FF) // Not 2 Byte
{
if ((ch & 0xF800) == 0xD800) // See if its a Surrogate
goto LongCode;
byteCount++;
}
byteCount ++;
}
// get pSrc aligned
if ((unchecked((int)pSrc) & 0x2) != 0) {
ch = *pSrc;
pSrc++;
if (ch > 0x7F) // Not ASCII
{
if (ch > 0x7FF) // Not 2 Byte
{
if ((ch & 0xF800) == 0xD800) // See if its a Surrogate
goto LongCode;
byteCount++;
}
byteCount ++;
}
}
// Run 2 * 4 characters at a time!
while (pSrc < pStop) {
ch = *(int*)pSrc;
int chc = *(int*)(pSrc+2);
if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0) // See if not ASCII
{
if (((ch | chc) & unchecked((int)0xF800F800)) != 0) // See if not 2 Byte
{
goto LongCodeWithMask;
}
if ((ch & unchecked((int)0xFF800000)) != 0) // Actually 0x07800780 is all we care about (4 bits)
byteCount++;
if ((ch & unchecked((int)0xFF80)) != 0)
byteCount++;
if ((chc & unchecked((int)0xFF800000)) != 0)
byteCount++;
if ((chc & unchecked((int)0xFF80)) != 0)
byteCount++;
}
pSrc += 4;
ch = *(int*)pSrc;
chc = *(int*)(pSrc+2);
if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0) // See if not ASCII
{
if (((ch | chc) & unchecked((int)0xF800F800)) != 0) // See if not 2 Byte
{
goto LongCodeWithMask;
}
if ((ch & unchecked((int)0xFF800000)) != 0)
byteCount++;
if ((ch & unchecked((int)0xFF80)) != 0)
byteCount++;
if ((chc & unchecked((int)0xFF800000)) != 0)
byteCount++;
if ((chc & unchecked((int)0xFF80)) != 0)
byteCount++;
}
pSrc += 4;
}
break;
LongCodeWithMask:
#if BIGENDIAN
// be careful about the sign extension
ch = (int)(((uint)ch) >> 16);
#else // BIGENDIAN
ch = (char)ch;
#endif // BIGENDIAN
pSrc++;
if (ch <= 0x7F) {
continue;
}
LongCode:
// use separate helper variables for slow and fast loop so that the jit optimizations
// won't get confused about the variable lifetimes
if (ch > 0x7FF) {
// if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) {
// 4 byte encoding - high surrogate + low surrogate
int chd = *pSrc;
if (
// !IsHighSurrogate(ch) // low without high -> bad
ch > CharUnicodeInfo.HIGH_SURROGATE_END ||
// !IsLowSurrogate(chd) // high not followed by low -> bad
!InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END) )
{
// Back up and drop out to slow loop to figure out error
pSrc--;
break;
}
pSrc++;
// byteCount - this byte is compensated by the second surrogate character
}
byteCount++;
}
byteCount++;
// byteCount - the last byte is already included
}
#endif // FASTLOOP
// no pending char at this point
ch = 0;
}
#if WIN64
// check for overflow
if (byteCount < 0) {
throw new ArgumentException(
Environment.GetResourceString("Argument_ConversionOverflow"));
}
#endif
Contract.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
"[UTF8Encoding.GetByteCount]Expected Empty fallback buffer");
return byteCount;
}
// diffs two char pointers using unsigned arithmetic. The unsigned arithmetic
// is good enough for us, and it tends to generate better code than the signed
// arithmetic generated by default
[System.Security.SecurityCritical] // auto-generated
unsafe private static int PtrDiff(char *a, char* b)
{
return (int)(((uint)((byte*)a - (byte*)b)) >> 1);
}
// byte* flavor just for parity
[System.Security.SecurityCritical] // auto-generated
unsafe private static int PtrDiff(byte* a, byte* b)
{
return (int)(a - b);
}
private static bool InRange(int ch, int start, int end)
{
return (uint)(ch - start) <= (uint)(end - start);
}
// Our workhorse
// Note: We ignore mismatched surrogates, unless the exception flag is set in which case we throw
[System.Security.SecurityCritical] // auto-generated
internal override unsafe int GetBytes(char* chars, int charCount,
byte* bytes, int byteCount, EncoderNLS baseEncoder)
{
Contract.Assert(chars!=null, "[UTF8Encoding.GetBytes]chars!=null");
Contract.Assert(byteCount >=0, "[UTF8Encoding.GetBytes]byteCount >=0");
Contract.Assert(charCount >=0, "[UTF8Encoding.GetBytes]charCount >=0");
Contract.Assert(bytes!=null, "[UTF8Encoding.GetBytes]bytes!=null");
UTF8Encoder encoder = null;
// For fallback we may need a fallback buffer.
// We wait to initialize it though in case we don't have any broken input unicode
EncoderFallbackBuffer fallbackBuffer = null;
char *pSrc = chars;
byte *pTarget = bytes;
char *pEnd = pSrc+charCount;
byte *pAllocatedBufferEnd = pTarget+byteCount;
int ch = 0;
// assume that JIT will enregister pSrc, pTarget and ch
if (baseEncoder != null) {
encoder = (UTF8Encoder)baseEncoder;
ch = encoder.surrogateChar;
// We mustn't have left over fallback data when counting
if (encoder.InternalHasFallbackBuffer)
{
// We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary
fallbackBuffer = encoder.FallbackBuffer;
if (fallbackBuffer.Remaining > 0 && encoder.m_throwOnOverflow)
throw new ArgumentException(Environment.GetResourceString("Argument_EncoderFallbackNotEmpty",
this.EncodingName, encoder.Fallback.GetType()));
// Set our internal fallback interesting things.
fallbackBuffer.InternalInitialize(chars, pEnd, encoder, true);
}
}
for (;;) {
// SLOWLOOP: does all range checks, handles all special cases, but it is slow
if (pSrc >= pEnd) {
if (ch == 0) {
// Check if there's anthing left to get out of the fallback buffer
ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
if (ch > 0) {
goto ProcessChar;
}
} else {
// Case of leftover surrogates in the fallback buffer
if (fallbackBuffer != null && fallbackBuffer.bFallingBack) {
Contract.Assert(ch >= 0xD800 && ch <= 0xDBFF,
"[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
int cha = ch;
ch = fallbackBuffer.InternalGetNextChar();
if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) {
ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo.LOW_SURROGATE_START - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
goto EncodeChar;
} else if (ch > 0){
goto ProcessChar;
} else {
break;
}
}
}
// attempt to encode the partial surrogate (will fail or ignore)
if (ch > 0 && (encoder == null || encoder.MustFlush))
goto EncodeChar;
// We're done
break;
}
if (ch > 0) {
// We have a high surrogate left over from a previous loop.
Contract.Assert(ch >= 0xD800 && ch <= 0xDBFF,
"[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
// use separate helper variables for local contexts so that the jit optimizations
// won't get confused about the variable lifetimes
int cha = *pSrc;
// In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
// if (IsLowSurrogate(cha)) {
if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) {
ch = cha + (ch << 10) +
(0x10000
- CharUnicodeInfo.LOW_SURROGATE_START
- (CharUnicodeInfo.HIGH_SURROGATE_START << 10) );
pSrc++;
}
// else ch is still high surrogate and encoding will fail
// attempt to encode the surrogate or partial surrogate
goto EncodeChar;
}
// If we've used a fallback, then we have to check for it
if (fallbackBuffer != null)
{
ch = fallbackBuffer.InternalGetNextChar();
if (ch > 0) goto ProcessChar;
}
// read next char. The JIT optimization seems to be getting confused when
// compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
ch = *pSrc;
pSrc++;
ProcessChar:
// if (IsHighSurrogate(ch)) {
if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END)) {
continue;
}
// either good char or partial surrogate
EncodeChar:
// throw exception on partial surrogate if necessary
// if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
{
// Lone surrogates aren't allowed, we have to do fallback for them
// Have to make a fallback buffer if we don't have one
if (fallbackBuffer == null)
{
// wait on fallbacks if we can
// For fallback we may need a fallback buffer
if (baseEncoder == null)
fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
else
fallbackBuffer = baseEncoder.FallbackBuffer;
// Set our internal fallback interesting things.
fallbackBuffer.InternalInitialize(chars, pEnd, baseEncoder, true);
}
// Do our fallback. Actually we already know its a mixed up surrogate,
// so the ref pSrc isn't gonna do anything.
fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrc);
// Ignore it if we don't throw
ch = 0;
continue;
}
// Count bytes needed
int bytesNeeded = 1;
if (ch > 0x7F) {
if (ch > 0x7FF) {
if (ch > 0xFFFF) {
bytesNeeded++; // 4 bytes (surrogate pair)
}
bytesNeeded++; // 3 bytes (800-FFFF)
}
bytesNeeded++; // 2 bytes (80-7FF)
}
if (pTarget > pAllocatedBufferEnd - bytesNeeded) {
// Left over surrogate from last time will cause pSrc == chars, so we'll throw
if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
{
fallbackBuffer.MovePrevious(); // Didn't use this fallback char
if (ch > 0xFFFF)
fallbackBuffer.MovePrevious(); // Was surrogate, didn't use 2nd part either
}
else
{
pSrc--; // Didn't use this char
if (ch > 0xFFFF)
pSrc--; // Was surrogate, didn't use 2nd part either
}
Contract.Assert(pSrc >= chars || pTarget == bytes,
"[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room.");
ThrowBytesOverflow(encoder, pTarget == bytes); // Throw if we must
ch = 0; // Nothing left over (we backed up to start of pair if supplimentary)
break;
}
if (ch <= 0x7F) {
*pTarget = (byte)ch;
}
else {
// use separate helper variables for local contexts so that the jit optimizations
// won't get confused about the variable lifetimes
int chb;
if (ch <= 0x7FF) {
// 2 byte encoding
chb = (byte)(unchecked((sbyte)0xC0) | (ch >> 6));
}
else
{
if (ch <= 0xFFFF) {
chb = (byte)(unchecked((sbyte)0xE0) | (ch >> 12));
}
else
{
*pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));
pTarget++;
chb = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;
}
*pTarget = (byte)chb;
pTarget++;
chb = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
}
*pTarget = (byte)chb;
pTarget++;
*pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
}
pTarget++;
#if FASTLOOP
// If still have fallback don't do fast loop
if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
goto ProcessChar;
int availableChars = PtrDiff(pEnd, pSrc);
int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget);
// don't fall into the fast decoding loop if we don't have enough characters
// Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop.
if (availableChars <= 13) {
// we are hoping for 1 byte per char
if (availableBytes < availableChars) {
// not enough output room. no pending bits at this point
ch = 0;
continue;
}
// try to get over the remainder of the ascii characters fast though
char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
while (pSrc < pLocalEnd) {
ch = *pSrc;
pSrc++;
// Not ASCII, need more than 1 byte per char
if (ch > 0x7F)
goto ProcessChar;
*pTarget = (byte)ch;
pTarget++;
}
// we are done, let ch be 0 to clear encoder
ch = 0;
break;
}
// we need at least 1 byte per character, but Convert might allow us to convert
// only part of the input, so try as much as we can. Reduce charCount if necessary
if (availableBytes < availableChars)
{
availableChars = availableBytes;
}
// FASTLOOP:
// - optimistic range checks
// - fallbacks to the slow loop for all special cases, exception throwing, etc.
// To compute the upper bound, assume that all characters are ASCII characters at this point,
// the boundary will be decreased for every non-ASCII character we encounter
// Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
// If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop.
char *pStop = pSrc + availableChars - 5;
while (pSrc < pStop) {
ch = *pSrc;
pSrc++;
if (ch > 0x7F) {
goto LongCode;
}
*pTarget = (byte)ch;
pTarget++;
// get pSrc aligned
if ((unchecked((int)pSrc) & 0x2) != 0) {
ch = *pSrc;
pSrc++;
if (ch > 0x7F) {
goto LongCode;
}
*pTarget = (byte)ch;
pTarget++;
}
// Run 4 characters at a time!
while (pSrc < pStop) {
ch = *(int*)pSrc;
int chc = *(int*)(pSrc+2);
if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0) {
goto LongCodeWithMask;
}
// Unfortunately, this is endianess sensitive
#if BIGENDIAN
*pTarget = (byte)(ch>>16);
*(pTarget+1) = (byte)ch;
pSrc += 4;
*(pTarget+2) = (byte)(chc>>16);
*(pTarget+3) = (byte)chc;
pTarget += 4;
#else // BIGENDIAN
*pTarget = (byte)ch;
*(pTarget+1) = (byte)(ch>>16);
pSrc += 4;
*(pTarget+2) = (byte)chc;
*(pTarget+3) = (byte)(chc>>16);
pTarget += 4;
#endif // BIGENDIAN
}
continue;
LongCodeWithMask:
#if BIGENDIAN
// be careful about the sign extension
ch = (int)(((uint)ch) >> 16);
#else // BIGENDIAN
ch = (char)ch;
#endif // BIGENDIAN
pSrc++;
if (ch > 0x7F) {
goto LongCode;
}
*pTarget = (byte)ch;
pTarget++;
continue;
LongCode:
// use separate helper variables for slow and fast loop so that the jit optimizations
// won't get confused about the variable lifetimes
int chd;
if (ch <= 0x7FF) {
// 2 byte encoding
chd = unchecked((sbyte)0xC0) | (ch >> 6);
}
else {
// if (!IsLowSurrogate(ch) && !IsHighSurrogate(ch))
if (!InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) {
// 3 byte encoding
chd = unchecked((sbyte)0xE0) | (ch >> 12);
}
else
{
// 4 byte encoding - high surrogate + low surrogate
// if (!IsHighSurrogate(ch))
if (ch > CharUnicodeInfo.HIGH_SURROGATE_END) {
// low without high -> bad, try again in slow loop
pSrc -= 1;
break;
}
chd = *pSrc;
pSrc++;
// if (!IsLowSurrogate(chd)) {
if (!InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) {
// high not followed by low -> bad, try again in slow loop
pSrc -= 2;
break;
}
ch = chd + (ch << 10) +
(0x10000
- CharUnicodeInfo.LOW_SURROGATE_START
- (CharUnicodeInfo.HIGH_SURROGATE_START << 10) );
*pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));
// pStop - this byte is compensated by the second surrogate character
// 2 input chars require 4 output bytes. 2 have been anticipated already
// and 2 more will be accounted for by the 2 pStop-- calls below.
pTarget++;
chd = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;
}
*pTarget = (byte)chd;
pStop--; // 3 byte sequence for 1 char, so need pStop-- and the one below too.
pTarget++;
chd = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
}
*pTarget = (byte)chd;
pStop--; // 2 byte sequence for 1 char so need pStop--.
pTarget++;
*pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
// pStop - this byte is already included
pTarget++;
}
Contract.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd");
#endif // FASTLOOP
// no pending char at this point
ch = 0;
}
// Do we have to set the encoder bytes?
if (encoder != null)
{
Contract.Assert(!encoder.MustFlush || ch == 0,
"[UTF8Encoding.GetBytes] Expected no mustflush or 0 leftover ch " + ch.ToString("X2", CultureInfo.InvariantCulture));
encoder.surrogateChar = ch;
encoder.m_charsUsed = (int)(pSrc - chars);
}
Contract.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0 ||
baseEncoder == null || !baseEncoder.m_throwOnOverflow,
"[UTF8Encoding.GetBytes]Expected empty fallback buffer if not converting");
return (int)(pTarget - bytes);
}
// These are bitmasks used to maintain the state in the decoder. They occupy the higher bits
// while the actual character is being built in the lower bits. They are shifted together
// with the actual bits of the character.
// bits 30 & 31 are used for pending bits fixup
private const int FinalByte = 1 << 29;
private const int SupplimentarySeq = 1 << 28;
private const int ThreeByteSeq = 1 << 27;
// Note: We throw exceptions on individually encoded surrogates and other non-shortest forms.
// If exceptions aren't turned on, then we drop all non-shortest &individual surrogates.
//
// To simplify maintenance, the structure of GetCharCount and GetChars should be
// kept the same as much as possible
[System.Security.SecurityCritical] // auto-generated
internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder)
{
Contract.Assert(count >=0, "[UTF8Encoding.GetCharCount]count >=0");
Contract.Assert(bytes!=null, "[UTF8Encoding.GetCharCount]bytes!=null");
// Initialize stuff
byte *pSrc = bytes;
byte *pEnd = pSrc+count;
// Start by assuming we have as many as count, charCount always includes the adjustment
// for the character being decoded
int charCount = count;
int ch = 0;
DecoderFallbackBuffer fallback = null;
if (baseDecoder != null) {
UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
ch = decoder.bits;
charCount -= (ch >> 30); // Adjust char count for # of expected bytes and expected output chars.
// Shouldn't have anything in fallback buffer for GetCharCount
// (don't have to check m_throwOnOverflow for count)
Contract.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
"[UTF8Encoding.GetCharCount]Expected empty fallback buffer at start");
}
for (;;)
{
// SLOWLOOP: does all range checks, handles all special cases, but it is slow
if (pSrc >= pEnd) {
break;
}
if (ch == 0) {
// no pending bits
goto ReadChar;
}
// read next byte. The JIT optimization seems to be getting confused when
// compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
int cha = *pSrc;
pSrc++;
// we are expecting to see trailing bytes like 10vvvvvv
if ((cha & unchecked((sbyte)0xC0)) != 0x80) {
// This can be a valid starting byte for another UTF8 byte sequence, so let's put
// the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
pSrc--;
charCount += (ch >> 30);
goto InvalidByteSequence;
}
// fold in the new byte
ch = (ch << 6) | (cha & 0x3F);
if ((ch & FinalByte) == 0) {
Contract.Assert( (ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
"[UTF8Encoding.GetChars]Invariant volation");
if ((ch & SupplimentarySeq) != 0) {
if ((ch & (FinalByte >> 6)) != 0) {
// this is 3rd byte (of 4 byte supplimentary) - nothing to do
continue;
}
// 2nd byte, check for non-shortest form of supplimentary char and the valid
// supplimentary characters in range 0x010000 - 0x10FFFF at the same time
if (!InRange(ch & 0x1F0, 0x10, 0x100)) {
goto InvalidByteSequence;
}
}
else {
// Must be 2nd byte of a 3-byte sequence
// check for non-shortest form of 3 byte seq
if ((ch & (0x1F << 5)) == 0 || // non-shortest form
(ch & (0xF800 >> 6) ) == (0xD800 >> 6)) // illegal individually encoded surrogate
{
goto InvalidByteSequence;
}
}
continue;
}
// ready to punch
// adjust for surrogates in non-shortest form
if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq) {
charCount--;
}
goto EncodeChar;
InvalidByteSequence:
// this code fragment should be close to the gotos referencing it
// Have to do fallback for invalid bytes
if (fallback == null)
{
if (baseDecoder == null)
fallback = this.decoderFallback.CreateFallbackBuffer();
else
fallback = baseDecoder.FallbackBuffer;
fallback.InternalInitialize(bytes, null);
}
charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
ch = 0;
continue;
ReadChar:
ch = *pSrc;
pSrc++;
ProcessChar:
if (ch > 0x7F) {
// If its > 0x7F, its start of a new multi-byte sequence
// Long sequence, so unreserve our char.
charCount--;
// bit 6 has to be non-zero for start of multibyte chars.
if ((ch & 0x40) == 0) {
// Unexpected trail byte
goto InvalidByteSequence;
}
// start a new long code
if ((ch & 0x20) != 0) {
if ((ch & 0x10) != 0) {
// 4 byte encoding - supplimentary character (2 surrogates)
ch &= 0x0F;
// check that bit 4 is zero and the valid supplimentary character
// range 0x000000 - 0x10FFFF at the same time
if (ch > 0x04) {
ch |= 0xf0;
goto InvalidByteSequence;
}
// Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
// Final byte flag, count fix if we don't make final byte & supplimentary sequence flag.
ch |= (FinalByte >> 3*6) | // Final byte is 3 more bytes from now
(1 << 30) | // If it dies on next byte we'll need an extra char
(3 << (30-2*6)) | // If it dies on last byte we'll need to subtract a char
(SupplimentarySeq) | (SupplimentarySeq >> 6) |
(SupplimentarySeq >> 2*6) | (SupplimentarySeq >> 3*6);
// Our character count will be 2 characters for these 4 bytes, so subtract another char
charCount--;
}
else {
// 3 byte encoding
// Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
ch = (ch & 0x0F) | ( (FinalByte >> 2*6) | (1 << 30) |
(ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2*6) );
// We'll expect 1 character for these 3 bytes, so subtract another char.
charCount--;
}
}
else {
// 2 byte encoding
ch &= 0x1F;
// check for non-shortest form
if (ch <= 1) {
ch |= 0xc0;
goto InvalidByteSequence;
}
// Add bit flags so we'll be flagged correctly
ch |= (FinalByte >> 6);
}
continue;
}
EncodeChar:
#if FASTLOOP
int availableBytes = PtrDiff(pEnd, pSrc);
// don't fall into the fast decoding loop if we don't have enough bytes
if (availableBytes <= 13) {
// try to get over the remainder of the ascii characters fast though
byte* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
while (pSrc < pLocalEnd) {
ch = *pSrc;
pSrc++;
if (ch > 0x7F)
goto ProcessChar;
}
// we are done
ch = 0;
break;
}
// To compute the upper bound, assume that all characters are ASCII characters at this point,
// the boundary will be decreased for every non-ASCII character we encounter
// Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
byte *pStop = pSrc + availableBytes - 7;
while (pSrc < pStop) {
ch = *pSrc;
pSrc++;
if (ch > 0x7F) {
goto LongCode;
}
// get pSrc 2-byte aligned
if ((unchecked((int)pSrc) & 0x1) != 0) {
ch = *pSrc;
pSrc++;
if (ch > 0x7F) {
goto LongCode;
}
}
// get pSrc 4-byte aligned
if ((unchecked((int)pSrc) & 0x2) != 0) {
ch = *(ushort*)pSrc;
if ((ch & 0x8080) != 0) {
goto LongCodeWithMask16;
}
pSrc += 2;
}
// Run 8 + 8 characters at a time!
while (pSrc < pStop) {
ch = *(int*)pSrc;
int chb = *(int*)(pSrc+4);
if (((ch | chb) & unchecked((int)0x80808080)) != 0) {
goto LongCodeWithMask32;
}
pSrc += 8;
// This is a really small loop - unroll it
if (pSrc >= pStop)
break;
ch = *(int*)pSrc;
chb = *(int*)(pSrc+4);
if (((ch | chb) & unchecked((int)0x80808080)) != 0) {
goto LongCodeWithMask32;
}
pSrc += 8;
}
break;
#if BIGENDIAN
LongCodeWithMask32:
// be careful about the sign extension
ch = (int)(((uint)ch) >> 16);
LongCodeWithMask16:
ch = (int)(((uint)ch) >> 8);
#else // BIGENDIAN
LongCodeWithMask32:
LongCodeWithMask16:
ch &= 0xFF;
#endif // BIGENDIAN
pSrc++;
if (ch <= 0x7F) {
continue;
}
LongCode:
int chc = *pSrc;
pSrc++;
if (
// bit 6 has to be zero
(ch & 0x40) == 0 ||
// we are expecting to see trailing bytes like 10vvvvvv
(chc & unchecked((sbyte)0xC0)) != 0x80)
{
goto BadLongCode;
}
chc &= 0x3F;
// start a new long code
if ((ch & 0x20) != 0) {
// fold the first two bytes together
chc |= (ch & 0x0F) << 6;
if ((ch & 0x10) != 0) {
// 4 byte encoding - surrogate
ch = *pSrc;
if (
// check that bit 4 is zero, the non-shortest form of surrogate
// and the valid surrogate range 0x000000 - 0x10FFFF at the same time
!InRange(chc >> 4, 0x01, 0x10) ||
// we are expecting to see trailing bytes like 10vvvvvv
(ch & unchecked((sbyte)0xC0)) != 0x80 )
{
goto BadLongCode;
}
chc = (chc << 6) | (ch & 0x3F);
ch = *(pSrc+1);
// we are expecting to see trailing bytes like 10vvvvvv
if ((ch & unchecked((sbyte)0xC0)) != 0x80) {
goto BadLongCode;
}
pSrc += 2;
// extra byte
charCount--;
}
else {
// 3 byte encoding
ch = *pSrc;
if (
// check for non-shortest form of 3 byte seq
(chc & (0x1F << 5)) == 0 ||
// Can't have surrogates here.
(chc & (0xF800 >> 6) ) == (0xD800 >> 6) ||
// we are expecting to see trailing bytes like 10vvvvvv
(ch & unchecked((sbyte)0xC0)) != 0x80 )
{
goto BadLongCode;
}
pSrc++;
// extra byte
charCount--;
}
}
else {
// 2 byte encoding
// check for non-shortest form
if ((ch & 0x1E) == 0) {
goto BadLongCode;
}
}
// extra byte
charCount--;
}
#endif // FASTLOOP
// no pending bits at this point
ch = 0;
continue;
BadLongCode:
pSrc -= 2;
ch = 0;
continue;
}
// May have a problem if we have to flush
if (ch != 0)
{
// We were already adjusting for these, so need to unadjust
charCount += (ch >> 30);
if (baseDecoder == null || baseDecoder.MustFlush)
{
// Have to do fallback for invalid bytes
if (fallback == null)
{
if (baseDecoder == null)
fallback = this.decoderFallback.CreateFallbackBuffer();
else
fallback = baseDecoder.FallbackBuffer;
fallback.InternalInitialize(bytes, null);
}
charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
}
}
// Shouldn't have anything in fallback buffer for GetCharCount
// (don't have to check m_throwOnOverflow for count)
Contract.Assert(fallback == null || fallback.Remaining == 0,
"[UTF8Encoding.GetCharCount]Expected empty fallback buffer at end");
return charCount;
}
// WARNING: If we throw an error, then System.Resources.ResourceReader calls this method.
// So if we're really broken, then that could also throw an error... recursively.
// So try to make sure GetChars can at least process all uses by
// System.Resources.ResourceReader!
//
// Note: We throw exceptions on individually encoded surrogates and other non-shortest forms.
// If exceptions aren't turned on, then we drop all non-shortest &individual surrogates.
//
// To simplify maintenance, the structure of GetCharCount and GetChars should be
// kept the same as much as possible
[System.Security.SecurityCritical] // auto-generated
internal override unsafe int GetChars(byte* bytes, int byteCount,
char* chars, int charCount, DecoderNLS baseDecoder)
{
Contract.Assert(chars!=null, "[UTF8Encoding.GetChars]chars!=null");
Contract.Assert(byteCount >=0, "[UTF8Encoding.GetChars]count >=0");
Contract.Assert(charCount >=0, "[UTF8Encoding.GetChars]charCount >=0");
Contract.Assert(bytes!=null, "[UTF8Encoding.GetChars]bytes!=null");
byte *pSrc = bytes;
char *pTarget = chars;
byte *pEnd = pSrc+byteCount;
char *pAllocatedBufferEnd = pTarget+charCount;
int ch = 0;
DecoderFallbackBuffer fallback = null;
if (baseDecoder != null) {
UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
ch = decoder.bits;
// Shouldn't have anything in fallback buffer for GetChars
// (don't have to check m_throwOnOverflow for chars, we always use all or none so always should be empty)
Contract.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
"[UTF8Encoding.GetChars]Expected empty fallback buffer at start");
}
for (;;)
{
// SLOWLOOP: does all range checks, handles all special cases, but it is slow
if (pSrc >= pEnd) {
break;
}
if (ch == 0) {
// no pending bits
goto ReadChar;
}
// read next byte. The JIT optimization seems to be getting confused when
// compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
int cha = *pSrc;
pSrc++;
// we are expecting to see trailing bytes like 10vvvvvv
if ((cha & unchecked((sbyte)0xC0)) != 0x80) {
// This can be a valid starting byte for another UTF8 byte sequence, so let's put
// the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
pSrc--;
goto InvalidByteSequence;
}
// fold in the new byte
ch = (ch << 6) | (cha & 0x3F);
if ((ch & FinalByte) == 0) {
// Not at last byte yet
Contract.Assert( (ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
"[UTF8Encoding.GetChars]Invariant volation");
if ((ch & SupplimentarySeq) != 0) {
// Its a 4-byte supplimentary sequence
if ((ch & (FinalByte >> 6)) != 0) {
// this is 3rd byte of 4 byte sequence - nothing to do
continue;
}
// 2nd byte of 4 bytes
// check for non-shortest form of surrogate and the valid surrogate
// range 0x000000 - 0x10FFFF at the same time
if (!InRange(ch & 0x1F0, 0x10, 0x100)) {
goto InvalidByteSequence;
}
}
else {
// Must be 2nd byte of a 3-byte sequence
// check for non-shortest form of 3 byte seq
if ((ch & (0x1F << 5)) == 0 || // non-shortest form
(ch & (0xF800 >> 6) ) == (0xD800 >> 6)) // illegal individually encoded surrogate
{
goto InvalidByteSequence;
}
}
continue;
}
// ready to punch
// surrogate in shortest form?
// Might be possible to get rid of this? Already did non-shortest check for 4-byte sequence when reading 2nd byte?
if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq) {
// let the range check for the second char throw the exception
if (pTarget < pAllocatedBufferEnd) {
*pTarget = (char)( ((ch >> 10) & 0x7FF) +
unchecked((short)((CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10)))) );
pTarget++;
ch = (ch & 0x3FF) +
unchecked((int)(CharUnicodeInfo.LOW_SURROGATE_START));
}
}
goto EncodeChar;
InvalidByteSequence:
// this code fragment should be close to the gotos referencing it
// Have to do fallback for invalid bytes
if (fallback == null)
{
if (baseDecoder == null)
fallback = this.decoderFallback.CreateFallbackBuffer();
else
fallback = baseDecoder.FallbackBuffer;
fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
}
// This'll back us up the appropriate # of bytes if we didn't get anywhere
if (!FallbackInvalidByteSequence(ref pSrc, ch, fallback, ref pTarget))
{
// Ran out of buffer space
// Need to throw an exception?
Contract.Assert(pSrc >= bytes || pTarget == chars,
"[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback");
fallback.InternalReset();
ThrowCharsOverflow(baseDecoder, pTarget == chars);
ch = 0;
break;
}
Contract.Assert(pSrc >= bytes,
"[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array");
ch = 0;
continue;
ReadChar:
ch = *pSrc;
pSrc++;
ProcessChar:
if (ch > 0x7F) {
// If its > 0x7F, its start of a new multi-byte sequence
// bit 6 has to be non-zero
if ((ch & 0x40) == 0) {
goto InvalidByteSequence;
}
// start a new long code
if ((ch & 0x20) != 0) {
if ((ch & 0x10) != 0) {
// 4 byte encoding - supplimentary character (2 surrogates)
ch &= 0x0F;
// check that bit 4 is zero and the valid supplimentary character
// range 0x000000 - 0x10FFFF at the same time
if (ch > 0x04) {
ch |= 0xf0;
goto InvalidByteSequence;
}
ch |= (FinalByte >> 3*6) | (1 << 30) | (3 << (30-2*6)) |
(SupplimentarySeq) | (SupplimentarySeq >> 6) |
(SupplimentarySeq >> 2*6) | (SupplimentarySeq >> 3*6);
}
else {
// 3 byte encoding
ch = (ch & 0x0F) | ( (FinalByte >> 2*6) | (1 << 30) |
(ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2*6) );
}
}
else {
// 2 byte encoding
ch &= 0x1F;
// check for non-shortest form
if (ch <= 1) {
ch |= 0xc0;
goto InvalidByteSequence;
}
ch |= (FinalByte >> 6);
}
continue;
}
EncodeChar:
// write the pending character
if (pTarget >= pAllocatedBufferEnd)
{
// Fix chars so we make sure to throw if we didn't output anything
ch &= 0x1fffff;
if (ch > 0x7f)
{
if (ch > 0x7ff)
{
if (ch >= CharUnicodeInfo.LOW_SURROGATE_START &&
ch <= CharUnicodeInfo.LOW_SURROGATE_END)
{
pSrc--; // It was 4 bytes
pTarget--; // 1 was stored already, but we can't remember 1/2, so back up
}
else if (ch > 0xffff)
{
pSrc--; // It was 4 bytes, nothing was stored
}
pSrc--; // It was at least 3 bytes
}
pSrc--; // It was at least 2 bytes
}
pSrc--;
// Throw that we don't have enough room (pSrc could be < chars if we had started to process
// a 4 byte sequence alredy)
Contract.Assert(pSrc >= bytes || pTarget == chars,
"[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]");
ThrowCharsOverflow(baseDecoder, pTarget == chars);
// Don't store ch in decoder, we already backed up to its start
ch = 0;
// Didn't throw, just use this buffer size.
break;
}
*pTarget = (char)ch;
pTarget++;
#if FASTLOOP
int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget);
int availableBytes = PtrDiff(pEnd, pSrc);
// don't fall into the fast decoding loop if we don't have enough bytes
// Test for availableChars is done because pStop would be <= pTarget.
if (availableBytes <= 13) {
// we may need as many as 1 character per byte
if (availableChars < availableBytes) {
// not enough output room. no pending bits at this point
ch = 0;
continue;
}
// try to get over the remainder of the ascii characters fast though
byte* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
while (pSrc < pLocalEnd) {
ch = *pSrc;
pSrc++;
if (ch > 0x7F)
goto ProcessChar;
*pTarget = (char)ch;
pTarget++;
}
// we are done
ch = 0;
break;
}
// we may need as many as 1 character per byte, so reduce the byte count if necessary.
// If availableChars is too small, pStop will be before pTarget and we won't do fast loop.
if (availableChars < availableBytes) {
availableBytes = availableChars;
}
// To compute the upper bound, assume that all characters are ASCII characters at this point,
// the boundary will be decreased for every non-ASCII character we encounter
// Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
char *pStop = pTarget + availableBytes - 7;
while (pTarget < pStop) {
ch = *pSrc;
pSrc++;
if (ch > 0x7F) {
goto LongCode;
}
*pTarget = (char)ch;
pTarget++;
// get pSrc to be 2-byte aligned
if ((unchecked((int)pSrc) & 0x1) != 0) {
ch = *pSrc;
pSrc++;
if (ch > 0x7F) {
goto LongCode;
}
*pTarget = (char)ch;
pTarget++;
}
// get pSrc to be 4-byte aligned
if ((unchecked((int)pSrc) & 0x2) != 0) {
ch = *(ushort*)pSrc;
if ((ch & 0x8080) != 0) {
goto LongCodeWithMask16;
}
// Unfortunately, this is endianess sensitive
#if BIGENDIAN
*pTarget = (char)((ch >> 8) & 0x7F);
pSrc += 2;
*(pTarget+1) = (char)(ch & 0x7F);
pTarget += 2;
#else // BIGENDIAN
*pTarget = (char)(ch & 0x7F);
pSrc += 2;
*(pTarget+1) = (char)((ch >> 8) & 0x7F);
pTarget += 2;
#endif // BIGENDIAN
}
// Run 8 characters at a time!
while (pTarget < pStop) {
ch = *(int*)pSrc;
int chb = *(int*)(pSrc+4);
if (((ch | chb) & unchecked((int)0x80808080)) != 0) {
goto LongCodeWithMask32;
}
// Unfortunately, this is endianess sensitive
#if BIGENDIAN
*pTarget = (char)((ch >> 24) & 0x7F);
*(pTarget+1) = (char)((ch >> 16) & 0x7F);
*(pTarget+2) = (char)((ch >> 8) & 0x7F);
*(pTarget+3) = (char)(ch & 0x7F);
pSrc += 8;
*(pTarget+4) = (char)((chb >> 24) & 0x7F);
*(pTarget+5) = (char)((chb >> 16) & 0x7F);
*(pTarget+6) = (char)((chb >> 8) & 0x7F);
*(pTarget+7) = (char)(chb & 0x7F);
pTarget += 8;
#else // BIGENDIAN
*pTarget = (char)(ch & 0x7F);
*(pTarget+1) = (char)((ch >> 8) & 0x7F);
*(pTarget+2) = (char)((ch >> 16) & 0x7F);
*(pTarget+3) = (char)((ch >> 24) & 0x7F);
pSrc += 8;
*(pTarget+4) = (char)(chb & 0x7F);
*(pTarget+5) = (char)((chb >> 8) & 0x7F);
*(pTarget+6) = (char)((chb >> 16) & 0x7F);
*(pTarget+7) = (char)((chb >> 24) & 0x7F);
pTarget += 8;
#endif // BIGENDIAN
}
break;
#if BIGENDIAN
LongCodeWithMask32:
// be careful about the sign extension
ch = (int)(((uint)ch) >> 16);
LongCodeWithMask16:
ch = (int)(((uint)ch) >> 8);
#else // BIGENDIAN
LongCodeWithMask32:
LongCodeWithMask16:
ch &= 0xFF;
#endif // BIGENDIAN
pSrc++;
if (ch <= 0x7F) {
*pTarget = (char)ch;
pTarget++;
continue;
}
LongCode:
int chc = *pSrc;
pSrc++;
if (
// bit 6 has to be zero
(ch & 0x40) == 0 ||
// we are expecting to see trailing bytes like 10vvvvvv
(chc & unchecked((sbyte)0xC0)) != 0x80)
{
goto BadLongCode;
}
chc &= 0x3F;
// start a new long code
if ((ch & 0x20) != 0) {
// fold the first two bytes together
chc |= (ch & 0x0F) << 6;
if ((ch & 0x10) != 0) {
// 4 byte encoding - surrogate
ch = *pSrc;
if (
// check that bit 4 is zero, the non-shortest form of surrogate
// and the valid surrogate range 0x000000 - 0x10FFFF at the same time
!InRange(chc >> 4, 0x01, 0x10) ||
// we are expecting to see trailing bytes like 10vvvvvv
(ch & unchecked((sbyte)0xC0)) != 0x80 )
{
goto BadLongCode;
}
chc = (chc << 6) | (ch & 0x3F);
ch = *(pSrc+1);
// we are expecting to see trailing bytes like 10vvvvvv
if ((ch & unchecked((sbyte)0xC0)) != 0x80) {
goto BadLongCode;
}
pSrc += 2;
ch = (chc << 6) | (ch & 0x3F);
*pTarget = (char)( ((ch >> 10) & 0x7FF) +
unchecked((short)(CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10))) );
pTarget++;
ch = (ch & 0x3FF) +
unchecked((short)(CharUnicodeInfo.LOW_SURROGATE_START));
// extra byte, we're already planning 2 chars for 2 of these bytes,
// but the big loop is testing the target against pStop, so we need
// to subtract 2 more or we risk overrunning the input. Subtract
// one here and one below.
pStop--;
}
else {
// 3 byte encoding
ch = *pSrc;
if (
// check for non-shortest form of 3 byte seq
(chc & (0x1F << 5)) == 0 ||
// Can't have surrogates here.
(chc & (0xF800 >> 6) ) == (0xD800 >> 6) ||
// we are expecting to see trailing bytes like 10vvvvvv
(ch & unchecked((sbyte)0xC0)) != 0x80 )
{
goto BadLongCode;
}
pSrc++;
ch = (chc << 6) | (ch & 0x3F);
// extra byte, we're only expecting 1 char for each of these 3 bytes,
// but the loop is testing the target (not source) against pStop, so
// we need to subtract 2 more or we risk overrunning the input.
// Subtract 1 here and one more below
pStop--;
}
}
else {
// 2 byte encoding
ch &= 0x1F;
// check for non-shortest form
if (ch <= 1) {
goto BadLongCode;
}
ch = (ch << 6) | chc;
}
*pTarget = (char)ch;
pTarget++;
// extra byte, we're only expecting 1 char for each of these 2 bytes,
// but the loop is testing the target (not source) against pStop.
// subtract an extra count from pStop so that we don't overrun the input.
pStop--;
}
#endif // FASTLOOP
Contract.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd");
// no pending bits at this point
ch = 0;
continue;
BadLongCode:
pSrc -= 2;
ch = 0;
continue;
}
if (ch != 0 && (baseDecoder == null || baseDecoder.MustFlush))
{
// Have to do fallback for invalid bytes
if (fallback == null)
{
if (baseDecoder == null)
fallback = this.decoderFallback.CreateFallbackBuffer();
else
fallback = baseDecoder.FallbackBuffer;
fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
}
// This'll back us up the appropriate # of bytes if we didn't get anywhere
if (!FallbackInvalidByteSequence(ref pSrc, ch, fallback, ref pTarget))
{
Contract.Assert(pSrc >= bytes || pTarget == chars,
"[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer while flushing");
// Ran out of buffer space
// Need to throw an exception?
fallback.InternalReset();
ThrowCharsOverflow(baseDecoder, pTarget == chars);
}
Contract.Assert(pSrc >= bytes,
"[UTF8Encoding.GetChars]Expected flushing invalid byte sequence to have remained within the byte array");
ch = 0;
}
if (baseDecoder != null)
{
UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
// If we're storing flush data we expect all bits to be used or else
// we're stuck in the middle of a conversion
Contract.Assert(!baseDecoder.MustFlush || ch == 0 || !baseDecoder.m_throwOnOverflow,
"[UTF8Encoding.GetChars]Expected no must flush or no left over bits or no throw on overflow.");
// Remember our leftover bits.
decoder.bits = ch;
baseDecoder.m_bytesUsed = (int)(pSrc - bytes);
}
// Shouldn't have anything in fallback buffer for GetChars
// (don't have to check m_throwOnOverflow for chars)
Contract.Assert(fallback == null || fallback.Remaining == 0,
"[UTF8Encoding.GetChars]Expected empty fallback buffer at end");
return PtrDiff(pTarget, chars);
}
// During GetChars we had an invalid byte sequence
// pSrc is backed up to the start of the bad sequence if we didn't have room to
// fall it back. Otherwise pSrc remains wher it is.
[System.Security.SecurityCritical] // auto-generated
private unsafe bool FallbackInvalidByteSequence(
ref byte* pSrc, int ch, DecoderFallbackBuffer fallback, ref char* pTarget)
{
// Get our byte[]
byte *pStart = pSrc;
byte[] bytesUnknown = GetBytesUnknown(ref pStart, ch);
// Do the actual fallback
if (!fallback.InternalFallback(bytesUnknown, pSrc, ref pTarget))
{
// Oops, it failed, back up to pStart
pSrc = pStart;
return false;
}
// It worked
return true;
}
// During GetCharCount we had an invalid byte sequence
// pSrc is used to find the index that points to the invalid bytes,
// however the byte[] contains the fallback bytes (in case the index is -1)
[System.Security.SecurityCritical] // auto-generated
private unsafe int FallbackInvalidByteSequence(
byte* pSrc, int ch, DecoderFallbackBuffer fallback)
{
// Get our byte[]
byte[] bytesUnknown = GetBytesUnknown(ref pSrc, ch);
// Do the actual fallback
int count = fallback.InternalFallback(bytesUnknown, pSrc);
// # of fallback chars expected.
// Note that we only get here for "long" sequences, and have already unreserved
// the count that we prereserved for the input bytes
return count;
}
// Note that some of these bytes may have come from a previous fallback, so we cannot
// just decrement the pointer and use the values we read. In those cases we have
// to regenerate the original values.
[System.Security.SecurityCritical] // auto-generated
private unsafe byte[] GetBytesUnknown(ref byte* pSrc, int ch)
{
// Get our byte[]
byte[] bytesUnknown = null;
// See if it was a plain char
// (have to check >= 0 because we have all sorts of wierd bit flags)
if (ch < 0x100 && ch >= 0)
{
pSrc--;
bytesUnknown = new byte[] { unchecked((byte)ch) };
}
// See if its an unfinished 2 byte sequence
else if ((ch & (SupplimentarySeq | ThreeByteSeq)) == 0)
{
pSrc--;
bytesUnknown = new byte[] { unchecked((byte)((ch & 0x1F )| 0xc0)) };
}
// So now we're either 2nd byte of 3 or 4 byte sequence or
// we hit a non-trail byte or we ran out of space for 3rd byte of 4 byte sequence
// 1st check if its a 4 byte sequence
else if ((ch & SupplimentarySeq) != 0)
{
// 3rd byte of 4 byte sequence?
if ((ch & (FinalByte >> 6)) != 0)
{
// 3rd byte of 4 byte sequence
pSrc-=3;
bytesUnknown = new byte[] {
unchecked((byte)(((ch >> 12) & 0x07) | 0xF0)),
unchecked((byte)(((ch >> 6) & 0x3F) | 0x80)),
unchecked((byte)(((ch) & 0x3F) | 0x80)) };
}
else if ((ch & (FinalByte >> 12)) != 0)
{
// 2nd byte of a 4 byte sequence
pSrc-=2;
bytesUnknown = new byte[] {
unchecked((byte)(((ch >> 6) & 0x07) | 0xF0)),
unchecked((byte)(((ch) & 0x3F) | 0x80)) };
}
else
{
// 4th byte of a 4 byte sequence
pSrc--;
bytesUnknown = new byte[] { unchecked((byte)(((ch) & 0x07) | 0xF0))};
}
}
else
{
// 2nd byte of 3 byte sequence?
if ((ch & (FinalByte >> 6)) != 0)
{
// So its 2nd byte of a 3 byte sequence
pSrc-=2;
bytesUnknown = new byte[] {
unchecked((byte)(((ch >> 6) & 0x0F) | 0xE0)), unchecked ((byte)(((ch) & 0x3F) | 0x80)) };
}
else
{
// 1st byte of a 3 byte sequence
pSrc--;
bytesUnknown = new byte[] { unchecked((byte)(((ch) & 0x0F) | 0xE0))};
}
}
return bytesUnknown;
}
public override Decoder GetDecoder() {
return new UTF8Decoder(this);
}
public override Encoder GetEncoder() {
return new UTF8Encoder(this);
}
public override int GetMaxByteCount(int charCount)
{
if (charCount < 0)
throw new ArgumentOutOfRangeException("charCount",
Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
Contract.EndContractBlock();
// Characters would be # of characters + 1 in case left over high surrogate is ? * max fallback
long byteCount = (long)charCount + 1;
if (EncoderFallback.MaxCharCount > 1)
byteCount *= EncoderFallback.MaxCharCount;
// Max 3 bytes per char. (4 bytes per 2 chars for surrogates)
byteCount *= 3;
if (byteCount > 0x7fffffff)
throw new ArgumentOutOfRangeException("charCount", Environment.GetResourceString("ArgumentOutOfRange_GetByteCountOverflow"));
return (int)byteCount;
}
public override int GetMaxCharCount(int byteCount)
{
if (byteCount < 0)
throw new ArgumentOutOfRangeException("byteCount",
Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
Contract.EndContractBlock();
// Figure out our length, 1 char per input byte + 1 char if 1st byte is last byte of 4 byte surrogate pair
long charCount = ((long)byteCount + 1);
// Non-shortest form would fall back, so get max count from fallback.
// So would 11... followed by 11..., so you could fall back every byte
if (DecoderFallback.MaxCharCount > 1)
{
charCount *= DecoderFallback.MaxCharCount;
}
if (charCount > 0x7fffffff)
throw new ArgumentOutOfRangeException("byteCount", Environment.GetResourceString("ArgumentOutOfRange_GetCharCountOverflow"));
return (int)charCount;
}
public override byte[] GetPreamble()
{
if (emitUTF8Identifier) {
// Allocate new array to prevent users from modifying it.
return new byte[3] { 0xEF, 0xBB, 0xBF };
}
else
return EmptyArray<Byte>.Value;
}
public override bool Equals(Object value) {
UTF8Encoding that = value as UTF8Encoding;
if (that != null) {
return (emitUTF8Identifier == that.emitUTF8Identifier) &&
// (isThrowException == that.isThrowException) && // Same as encoder/decoderfallbacks being exception
(EncoderFallback.Equals(that.EncoderFallback)) &&
(DecoderFallback.Equals(that.DecoderFallback));
}
return (false);
}
public override int GetHashCode() {
//Not great distribution, but this is relatively unlikely to be used as the key in a hashtable.
return this.EncoderFallback.GetHashCode() + this.DecoderFallback.GetHashCode() +
UTF8_CODEPAGE + (emitUTF8Identifier?1:0);
}
[Serializable]
internal class UTF8Encoder : EncoderNLS, ISerializable
{
// We must save a high surrogate value until the next call, looking
// for a low surrogate value.
internal int surrogateChar;
public UTF8Encoder(UTF8Encoding encoding) : base(encoding)
{
// base calls reset
}
// Constructor called by serialization, have to handle deserializing from Everett
internal UTF8Encoder(SerializationInfo info, StreamingContext context)
{
// Any info?
if (info==null) throw new ArgumentNullException("info");
Contract.EndContractBlock();
// Get common info
this.m_encoding = (Encoding)info.GetValue("encoding", typeof(Encoding));
// SurrogateChar happens to mean the same thing
this.surrogateChar = (int)info.GetValue("surrogateChar", typeof(int));
try
{
this.m_fallback = (EncoderFallback) info.GetValue("m_fallback", typeof(EncoderFallback));
}
catch (SerializationException)
{
this.m_fallback = null;
}
}
#if FEATURE_SERIALIZATION
// ISerializable implementation, get data for this object
[System.Security.SecurityCritical] // auto-generated_required
void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context)
{
// Any info?
if (info==null) throw new ArgumentNullException("info");
Contract.EndContractBlock();
// Save Whidbey data
// Just need Everett maxCharSize (BaseCodePageEncoding) or m_maxByteSize (MLangBaseCodePageEncoding)
info.AddValue("encoding", this.m_encoding);
info.AddValue("surrogateChar", this.surrogateChar);
info.AddValue("m_fallback", this.m_fallback);
// Extra stuff for Everett that Whidbey doesn't use
info.AddValue("storedSurrogate", this.surrogateChar > 0 ? true : false);
info.AddValue("mustFlush", false); // Everett doesn't actually use this either, but it accidently serialized it!
}
#endif
public override void Reset()
{
this.surrogateChar = 0;
if (m_fallbackBuffer != null)
m_fallbackBuffer.Reset();
}
// Anything left in our encoder?
internal override bool HasState
{
get
{
return (this.surrogateChar != 0);
}
}
}
[Serializable]
internal class UTF8Decoder : DecoderNLS, ISerializable
{
// We'll need to remember the previous information. See the comments around definition
// of FinalByte for details.
internal int bits;
public UTF8Decoder(UTF8Encoding encoding) : base(encoding)
{
// base calls reset
}
// Constructor called by serialization, have to handle deserializing from Everett
internal UTF8Decoder(SerializationInfo info, StreamingContext context)
{
// Any info?
if (info==null) throw new ArgumentNullException("info");
Contract.EndContractBlock();
// Get common info
this.m_encoding = (Encoding)info.GetValue("encoding", typeof(Encoding));
try
{
// Get whidbey version of bits
this.bits = (int)info.GetValue("wbits", typeof(int));
this.m_fallback = (DecoderFallback) info.GetValue("m_fallback", typeof(DecoderFallback));
}
catch (SerializationException)
{
// Everett calls bits bits instead of wbits, so this is Everett
this.bits = 0;
this.m_fallback = null;
}
}
#if FEATURE_SERIALIZATION
// ISerializable implementation, get data for this object
[System.Security.SecurityCritical] // auto-generated_required
void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context)
{
// Any info?
if (info==null) throw new ArgumentNullException("info");
Contract.EndContractBlock();
// Save new Whidbey data
info.AddValue("encoding", this.m_encoding);
info.AddValue("wbits", this.bits); // Special whidbey bits name
info.AddValue("m_fallback", this.m_fallback);
// Everett has extra stuff, we set it all to 0 in case this deserializes in Everett
info.AddValue("bits", (int)0);
info.AddValue("trailCount", (int)0);
info.AddValue("isSurrogate", false);
info.AddValue("byteSequence", (int)0);
}
#endif
public override void Reset()
{
this.bits = 0;
if (m_fallbackBuffer != null)
m_fallbackBuffer.Reset();
}
// Anything left in our decoder?
internal override bool HasState
{
get
{
return (this.bits != 0);
}
}
}
}
}
|