Given an input string and an encoding, I want to process each character in the input string as follows:
If the codepoint can be encoded, then encode it;
If not, output (the encoding of) the string
&#xUUUU;
where UUUU is the hex value of the Unicode codepoint.
I've read through the .NET documentation for Encoder and EncoderFallback, and I can see how to get notified when an unencodable character is found, but I can't see any way to output something that actually depends on the particular character in question.
Any ideas?
Looking a bit deeper (thanks @JosefZ), I see that the description of the EncoderFallback class says it supports three mechanisms, including:
Best-fit fallback, which maps valid Unicode characters that cannot be encoded to an approximate equivalent. For example, a best-fit fallback handler for the ASCIIEncoding class might map Æ (U+00C6) to AE (U+0041 U+0045). A best-fit fallback handler might also be implemented to transliterate one alphabet (such as Cyrillic) to another (such as Latin or Roman). The .NET Framework does not provide any public best-fit fallback implementations.
which would appear to be the one I am after. So do I have to work out how to write my own implementation of EncoderFallback?
CodePudding user response:
You can use the following EncoderFallback and EncoderFallbackBuffer implementations to do what you want:
/// <summary>
/// Encoder fallback that substitutes a hexadecimal numeric character reference
/// (<c>&amp;#xUUUU;</c>) for every character the target encoding cannot represent.
/// </summary>
public class HexFallback : EncoderFallback
{
    /// <summary>
    /// Upper bound on the number of characters a single fallback can produce.
    /// </summary>
    /// <remarks>
    /// One reference is at most 8 characters per input char, and a surrogate pair is
    /// two input chars, so 16 is a safe bound. This must be a realistic finite value:
    /// encodings multiply by it when sizing buffers (e.g. in GetMaxByteCount), and
    /// int.MaxValue makes that computation overflow and throw.
    /// </remarks>
    public override int MaxCharCount { get { return 16; } }

    /// <summary>Creates a fresh buffer that generates the replacement characters.</summary>
    /// <returns>A new <see cref="HexFallbackBuffer"/>.</returns>
    public override EncoderFallbackBuffer CreateFallbackBuffer() { return new HexFallbackBuffer(); }
}
/// <summary>
/// Fallback buffer that replaces an unencodable character with a hexadecimal
/// numeric character reference of the form <c>&amp;#xUUUU;</c>, where UUUU is the
/// Unicode code point in upper-case hex (at least four digits).
/// </summary>
public class HexFallbackBuffer : EncoderFallbackBuffer
{
    // Replacement text for the character currently being handled; empty when idle.
    string _replacement = string.Empty;

    // Index into _replacement of the next character GetNextChar will hand out.
    int _position;

    /// <summary>Prepares the replacement for a single unencodable UTF-16 code unit.</summary>
    /// <param name="charUnknown">The character that could not be encoded.</param>
    /// <param name="index">Position of the character in the input (unused).</param>
    /// <returns>Always <c>true</c>: a replacement is always available.</returns>
    public override bool Fallback(char charUnknown, int index)
    {
        SetReplacement(charUnknown);
        return true;
    }

    /// <summary>Prepares the replacement for an unencodable surrogate pair.</summary>
    /// <param name="charUnknownHigh">High surrogate of the pair.</param>
    /// <param name="charUnknownLow">Low surrogate of the pair.</param>
    /// <param name="index">Position of the pair in the input (unused).</param>
    /// <returns>Always <c>true</c>: a replacement is always available.</returns>
    public override bool Fallback(char charUnknownHigh, char charUnknownLow, int index)
    {
        // Combine the pair into the real code point so that a single reference such as
        // &#x1F600; is produced, rather than two references to the individual surrogates.
        SetReplacement(char.ConvertToUtf32(charUnknownHigh, charUnknownLow));
        return true;
    }

    /// <summary>Number of replacement characters not yet returned; 0 when idle or exhausted.</summary>
    public override int Remaining { get { return _replacement.Length - _position; } }

    /// <summary>Discards any pending replacement and returns the buffer to its idle state.</summary>
    public override void Reset()
    {
        _replacement = string.Empty;
        _position = 0;
    }

    /// <summary>Steps back one character in the current replacement, if possible.</summary>
    /// <returns><c>false</c> when already at the start; otherwise <c>true</c>.</returns>
    public override bool MovePrevious()
    {
        if (_position == 0)
        {
            return false;
        }

        _position--;
        return true;
    }

    /// <summary>Returns the next replacement character.</summary>
    /// <returns>The next character, or <c>'\0'</c> once the replacement is exhausted.</returns>
    public override char GetNextChar()
    {
        if (_position >= _replacement.Length)
        {
            return (char)0; // signals "nothing left" to the encoder
        }

        return _replacement[_position++];
    }

    // Builds the "&#x....;" text for the given code point and rewinds to its start.
    void SetReplacement(int codePoint)
    {
        _replacement = "&#x"
            + codePoint.ToString("X4", System.Globalization.CultureInfo.InvariantCulture)
            + ";";
        _position = 0;
    }
}
You use it like this
// Sample input containing a character that ASCII cannot represent.
string text = "Æ";
char[] inputChars = text.ToCharArray();

// Scratch space for the encoded bytes (ample for this short sample).
byte[] encoded = new byte[1000];

// ASCII encoder whose unencodable characters are routed through HexFallback.
Encoder asciiEncoder = Encoding.ASCII.GetEncoder();
asciiEncoder.Fallback = new HexFallback();

// Encode the whole input in one call, flushing the encoder state at the end.
int byteCount = asciiEncoder.GetBytes(inputChars, 0, inputChars.Length, encoded, 0, true);

// Decode the produced bytes back to text and print the result.
string output = Encoding.ASCII.GetString(encoded, 0, byteCount);
Console.WriteLine(output);