In .NET, I'm trying to use the Encoding.UTF8.GetString method, which takes a byte array and converts it to a string.
It looks like this method does not recognize the BOM (Byte Order Mark), which can legitimately be part of the binary representation of a UTF-8 string, and instead decodes it as a character.
I know I can use a TextReader to consume the BOM as needed, but I thought the GetString method was meant to be a kind of shortcut that makes our code shorter.
Am I missing something? Is this behavior intentional?
Here's code that reproduces the behavior:
/// <summary>
/// Reproduction: encodes "abc" as UTF-8 with and without a BOM, then decodes both
/// byte arrays with <c>Encoding.UTF8.GetString</c> and prints the results.
/// The BOM-prefixed array decodes to a 4-character string, i.e. the BOM bytes
/// come back as a leading character instead of being skipped.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    string s1 = "abc";

    // Same text serialized twice; the only difference is whether the encoder emits a BOM.
    byte[] abcWithBom = EncodeWithStreamWriter(s1, true);
    Console.WriteLine(FormatArray(abcWithBom)); // ef, bb, bf, 61, 62, 63

    byte[] abcWithoutBom = EncodeWithStreamWriter(s1, false);
    Console.WriteLine(FormatArray(abcWithoutBom)); // 61, 62, 63

    var restore1 = Encoding.UTF8.GetString(abcWithoutBom);
    Console.WriteLine(restore1.Length); // 3
    Console.WriteLine(restore1); // abc

    // This is the surprising case: the three BOM bytes become one extra character.
    var restore2 = Encoding.UTF8.GetString(abcWithBom);
    Console.WriteLine(restore2.Length); // 4 (!)
    Console.WriteLine(restore2); // ?abc
}

/// <summary>
/// Writes <paramref name="text"/> through a <c>StreamWriter</c> into an in-memory stream
/// and returns the bytes that were produced.
/// </summary>
/// <param name="text">The text to encode.</param>
/// <param name="emitBom">Passed to the <c>UTF8Encoding</c> constructor to request a BOM or not.</param>
/// <returns>The bytes written to the stream.</returns>
private static byte[] EncodeWithStreamWriter(string text, bool emitBom)
{
    using (var ms = new MemoryStream())
    using (var sw = new StreamWriter(ms, new UTF8Encoding(emitBom)))
    {
        sw.Write(text);
        // Flush before snapshotting so the writer's buffered bytes reach the stream.
        sw.Flush();
        return ms.ToArray();
    }
}

/// <summary>
/// Formats each byte as lowercase hexadecimal and joins them with ", ".
/// </summary>
/// <param name="bytes1">The bytes to format.</param>
/// <returns>A comma-separated hex listing such as "ef, bb, bf".</returns>
private static string FormatArray(byte[] bytes1)
{
    return string.Join(", ", bytes1.Select(b => b.ToString("x")));
}