Skip to main content
added 1 character in body
Source Link
/**
 * Prints the UTF-8 encoding of a null-terminated UTF-16 string to std::cout.
 *
 * Each code point produces one line of the form "(NBytes) XX-YY-..." with the
 * bytes written in uppercase hexadecimal ("(1Byte) XX" for a single byte).
 *
 * Ill-formed input (a high surrogate that is not followed by a low surrogate,
 * or a low surrogate on its own) is reported as U+FFFD REPLACEMENT CHARACTER
 * and consumes exactly one code unit, so the loop always makes progress and
 * never reads beyond the terminator.
 *
 * @param str  Null-terminated UTF-16 text; a null pointer prints nothing.
 */
void ToUTF8(char16_t *str) {
    if (str == nullptr) {
        return;
    }

    // std::hex and std::uppercase are sticky, so remember the caller's
    // formatting and put it back before returning.
    const std::ios_base::fmtflags savedFlags = std::cout.flags();
    std::cout << std::hex << std::uppercase;

    while (*str) {
        //-------(1) UTF-16 to codepoint -------
        unsigned int codepoint = *str;
        ++str;

        if (codepoint >= 0xD800 && codepoint <= 0xDBFF) {
            // High surrogate: only combine when a low surrogate really follows.
            // `next` may be the terminator, in which case it is left unconsumed.
            const unsigned int next = *str;
            if (next >= 0xDC00 && next <= 0xDFFF) {
                // Done in unsigned int: the high part can be as large as 0xFFC00,
                // which does not fit in 16 bits.
                codepoint = 0x10000u + ((codepoint - 0xD800u) << 10) + (next - 0xDC00u);
                ++str;
            } else {
                codepoint = 0xFFFD;
            }
        } else if (codepoint >= 0xDC00 && codepoint <= 0xDFFF) {
            // Low surrogate without a preceding high surrogate.
            codepoint = 0xFFFD;
        }

        //-------(2) Codepoint to UTF-8 -------
        unsigned char bytes[4] = { 0 };
        int length = 0;
        const char *label = "";

        if (codepoint <= 0x007F) {
            bytes[0] = static_cast<unsigned char>(codepoint);
            length = 1;
            label = "(1Byte) ";
        } else if (codepoint <= 0x07FF) {
            bytes[0] = static_cast<unsigned char>(((codepoint >> 6) & 0x1F) | 0xC0);
            bytes[1] = static_cast<unsigned char>((codepoint & 0x3F) | 0x80);
            length = 2;
            label = "(2Bytes) ";
        } else if (codepoint <= 0xFFFF) {
            bytes[0] = static_cast<unsigned char>(((codepoint >> 12) & 0x0F) | 0xE0);
            bytes[1] = static_cast<unsigned char>(((codepoint >> 6) & 0x3F) | 0x80);
            bytes[2] = static_cast<unsigned char>((codepoint & 0x3F) | 0x80);
            length = 3;
            label = "(3Bytes) ";
        } else {
            // A surrogate pair can never decode to more than 0x10FFFF.
            bytes[0] = static_cast<unsigned char>(((codepoint >> 18) & 0x07) | 0xF0);
            bytes[1] = static_cast<unsigned char>(((codepoint >> 12) & 0x3F) | 0x80);
            bytes[2] = static_cast<unsigned char>(((codepoint >> 6) & 0x3F) | 0x80);
            bytes[3] = static_cast<unsigned char>((codepoint & 0x3F) | 0x80);
            length = 4;
            label = "(4Bytes) ";
        }

        std::cout << label;
        for (int i = 0; i < length; ++i) {
            if (i != 0) {
                std::cout << '-';
            }
            // Widen so the byte is printed as a number, not as a character.
            std::cout << static_cast<unsigned int>(bytes[i]);
        }
        std::cout << '\n';
    }

    std::cout.flags(savedFlags);
}
/**
 * Prints the UTF-8 encoding of a null-terminated UTF-16 string to std::cout.
 *
 * Each code point produces one line of the form "(NBytes) XX-YY-..." with the
 * bytes written in uppercase hexadecimal ("(1Byte) XX" for a single byte).
 *
 * Ill-formed input (a high surrogate that is not followed by a low surrogate,
 * or a low surrogate on its own) is reported as U+FFFD REPLACEMENT CHARACTER
 * and consumes exactly one code unit, so the loop always makes progress and
 * never reads beyond the terminator.
 *
 * @param str  Null-terminated UTF-16 text; a null pointer prints nothing.
 */
void ToUTF8(char16_t *str) {
    if (str == nullptr) {
        return;
    }

    // std::hex and std::uppercase are sticky, so remember the caller's
    // formatting and put it back before returning.
    const std::ios_base::fmtflags savedFlags = std::cout.flags();
    std::cout << std::hex << std::uppercase;

    while (*str) {
        //-------(1) UTF-16 to codepoint -------
        unsigned int codepoint = *str;
        ++str;

        if (codepoint >= 0xD800 && codepoint <= 0xDBFF) {
            // High surrogate: only combine when a low surrogate really follows.
            // `next` may be the terminator, in which case it is left unconsumed.
            const unsigned int next = *str;
            if (next >= 0xDC00 && next <= 0xDFFF) {
                // Done in unsigned int: the high part can be as large as 0xFFC00,
                // which does not fit in 16 bits.
                codepoint = 0x10000u + ((codepoint - 0xD800u) << 10) + (next - 0xDC00u);
                ++str;
            } else {
                codepoint = 0xFFFD;
            }
        } else if (codepoint >= 0xDC00 && codepoint <= 0xDFFF) {
            // Low surrogate without a preceding high surrogate.
            codepoint = 0xFFFD;
        }

        //-------(2) Codepoint to UTF-8 -------
        unsigned char bytes[4] = { 0 };
        int length = 0;
        const char *label = "";

        if (codepoint <= 0x007F) {
            bytes[0] = static_cast<unsigned char>(codepoint);
            length = 1;
            label = "(1Byte) ";
        } else if (codepoint <= 0x07FF) {
            bytes[0] = static_cast<unsigned char>(((codepoint >> 6) & 0x1F) | 0xC0);
            bytes[1] = static_cast<unsigned char>((codepoint & 0x3F) | 0x80);
            length = 2;
            label = "(2Bytes) ";
        } else if (codepoint <= 0xFFFF) {
            bytes[0] = static_cast<unsigned char>(((codepoint >> 12) & 0x0F) | 0xE0);
            bytes[1] = static_cast<unsigned char>(((codepoint >> 6) & 0x3F) | 0x80);
            bytes[2] = static_cast<unsigned char>((codepoint & 0x3F) | 0x80);
            length = 3;
            label = "(3Bytes) ";
        } else {
            // A surrogate pair can never decode to more than 0x10FFFF.
            bytes[0] = static_cast<unsigned char>(((codepoint >> 18) & 0x07) | 0xF0);
            bytes[1] = static_cast<unsigned char>(((codepoint >> 12) & 0x3F) | 0x80);
            bytes[2] = static_cast<unsigned char>(((codepoint >> 6) & 0x3F) | 0x80);
            bytes[3] = static_cast<unsigned char>((codepoint & 0x3F) | 0x80);
            length = 4;
            label = "(4Bytes) ";
        }

        std::cout << label;
        for (int i = 0; i < length; ++i) {
            if (i != 0) {
                std::cout << '-';
            }
            // Widen so the byte is printed as a number, not as a character.
            std::cout << static_cast<unsigned int>(bytes[i]);
        }
        std::cout << '\n';
    }

    std::cout.flags(savedFlags);
}
/**
 * Prints the UTF-8 encoding of a null-terminated UTF-16 string to std::cout.
 *
 * Each code point produces one line of the form "(NBytes) XX-YY-..." with the
 * bytes written in uppercase hexadecimal ("(1Byte) XX" for a single byte).
 *
 * Ill-formed input (a high surrogate that is not followed by a low surrogate,
 * or a low surrogate on its own) is reported as U+FFFD REPLACEMENT CHARACTER
 * and consumes exactly one code unit, so the loop always makes progress and
 * never reads beyond the terminator.
 *
 * @param str  Null-terminated UTF-16 text; a null pointer prints nothing.
 */
void ToUTF8(char16_t *str) {
    if (str == nullptr) {
        return;
    }

    // std::hex and std::uppercase are sticky, so remember the caller's
    // formatting and put it back before returning.
    const std::ios_base::fmtflags savedFlags = std::cout.flags();
    std::cout << std::hex << std::uppercase;

    while (*str) {
        //-------(1) UTF-16 to codepoint -------
        unsigned int codepoint = *str;
        ++str;

        if (codepoint >= 0xD800 && codepoint <= 0xDBFF) {
            // High surrogate: only combine when a low surrogate really follows.
            // `next` may be the terminator, in which case it is left unconsumed.
            const unsigned int next = *str;
            if (next >= 0xDC00 && next <= 0xDFFF) {
                // Done in unsigned int: the high part can be as large as 0xFFC00,
                // which does not fit in 16 bits.
                codepoint = 0x10000u + ((codepoint - 0xD800u) << 10) + (next - 0xDC00u);
                ++str;
            } else {
                codepoint = 0xFFFD;
            }
        } else if (codepoint >= 0xDC00 && codepoint <= 0xDFFF) {
            // Low surrogate without a preceding high surrogate.
            codepoint = 0xFFFD;
        }

        //-------(2) Codepoint to UTF-8 -------
        unsigned char bytes[4] = { 0 };
        int length = 0;
        const char *label = "";

        if (codepoint <= 0x007F) {
            bytes[0] = static_cast<unsigned char>(codepoint);
            length = 1;
            label = "(1Byte) ";
        } else if (codepoint <= 0x07FF) {
            bytes[0] = static_cast<unsigned char>(((codepoint >> 6) & 0x1F) | 0xC0);
            bytes[1] = static_cast<unsigned char>((codepoint & 0x3F) | 0x80);
            length = 2;
            label = "(2Bytes) ";
        } else if (codepoint <= 0xFFFF) {
            bytes[0] = static_cast<unsigned char>(((codepoint >> 12) & 0x0F) | 0xE0);
            bytes[1] = static_cast<unsigned char>(((codepoint >> 6) & 0x3F) | 0x80);
            bytes[2] = static_cast<unsigned char>((codepoint & 0x3F) | 0x80);
            length = 3;
            label = "(3Bytes) ";
        } else {
            // A surrogate pair can never decode to more than 0x10FFFF.
            bytes[0] = static_cast<unsigned char>(((codepoint >> 18) & 0x07) | 0xF0);
            bytes[1] = static_cast<unsigned char>(((codepoint >> 12) & 0x3F) | 0x80);
            bytes[2] = static_cast<unsigned char>(((codepoint >> 6) & 0x3F) | 0x80);
            bytes[3] = static_cast<unsigned char>((codepoint & 0x3F) | 0x80);
            length = 4;
            label = "(4Bytes) ";
        }

        std::cout << label;
        for (int i = 0; i < length; ++i) {
            if (i != 0) {
                std::cout << '-';
            }
            // Widen so the byte is printed as a number, not as a character.
            std::cout << static_cast<unsigned int>(bytes[i]);
        }
        std::cout << '\n';
    }

    std::cout.flags(savedFlags);
}
Improve title
Link
slepic
  • 5.7k
  • 2
  • 10
  • 27

The conversion from UTF-16 to UTF-8 manually

Source Link

The conversion from UTF-16 to UTF-8 manually

I have created a function that converts from UTF-16 to UTF-8.
The function first decodes each UTF-16 sequence into a code point, and then encodes that code point as UTF-8.

/**
 * Prints the UTF-8 encoding of a null-terminated UTF-16 string to std::cout.
 *
 * Each code point produces one line of the form "(NBytes) XX-YY-..." with the
 * bytes written in uppercase hexadecimal ("(1Byte) XX" for a single byte).
 *
 * Ill-formed input (a high surrogate that is not followed by a low surrogate,
 * or a low surrogate on its own) is reported as U+FFFD REPLACEMENT CHARACTER
 * and consumes exactly one code unit, so the loop always makes progress and
 * never reads beyond the terminator.
 *
 * @param str  Null-terminated UTF-16 text; a null pointer prints nothing.
 */
void ToUTF8(char16_t *str) {
    if (str == nullptr) {
        return;
    }

    // std::hex and std::uppercase are sticky, so remember the caller's
    // formatting and put it back before returning.
    const std::ios_base::fmtflags savedFlags = std::cout.flags();
    std::cout << std::hex << std::uppercase;

    while (*str) {
        //-------(1) UTF-16 to codepoint -------
        unsigned int codepoint = *str;
        ++str;

        if (codepoint >= 0xD800 && codepoint <= 0xDBFF) {
            // High surrogate: only combine when a low surrogate really follows.
            // `next` may be the terminator, in which case it is left unconsumed.
            const unsigned int next = *str;
            if (next >= 0xDC00 && next <= 0xDFFF) {
                // Done in unsigned int: the high part can be as large as 0xFFC00,
                // which does not fit in 16 bits.
                codepoint = 0x10000u + ((codepoint - 0xD800u) << 10) + (next - 0xDC00u);
                ++str;
            } else {
                codepoint = 0xFFFD;
            }
        } else if (codepoint >= 0xDC00 && codepoint <= 0xDFFF) {
            // Low surrogate without a preceding high surrogate.
            codepoint = 0xFFFD;
        }

        //-------(2) Codepoint to UTF-8 -------
        unsigned char bytes[4] = { 0 };
        int length = 0;
        const char *label = "";

        if (codepoint <= 0x007F) {
            bytes[0] = static_cast<unsigned char>(codepoint);
            length = 1;
            label = "(1Byte) ";
        } else if (codepoint <= 0x07FF) {
            bytes[0] = static_cast<unsigned char>(((codepoint >> 6) & 0x1F) | 0xC0);
            bytes[1] = static_cast<unsigned char>((codepoint & 0x3F) | 0x80);
            length = 2;
            label = "(2Bytes) ";
        } else if (codepoint <= 0xFFFF) {
            bytes[0] = static_cast<unsigned char>(((codepoint >> 12) & 0x0F) | 0xE0);
            bytes[1] = static_cast<unsigned char>(((codepoint >> 6) & 0x3F) | 0x80);
            bytes[2] = static_cast<unsigned char>((codepoint & 0x3F) | 0x80);
            length = 3;
            label = "(3Bytes) ";
        } else {
            // A surrogate pair can never decode to more than 0x10FFFF.
            bytes[0] = static_cast<unsigned char>(((codepoint >> 18) & 0x07) | 0xF0);
            bytes[1] = static_cast<unsigned char>(((codepoint >> 12) & 0x3F) | 0x80);
            bytes[2] = static_cast<unsigned char>(((codepoint >> 6) & 0x3F) | 0x80);
            bytes[3] = static_cast<unsigned char>((codepoint & 0x3F) | 0x80);
            length = 4;
            label = "(4Bytes) ";
        }

        std::cout << label;
        for (int i = 0; i < length; ++i) {
            if (i != 0) {
                std::cout << '-';
            }
            // Widen so the byte is printed as a number, not as a character.
            std::cout << static_cast<unsigned int>(bytes[i]);
        }
        std::cout << '\n';
    }

    std::cout.flags(savedFlags);
}

Also, you can compile and test the code from here

What do you think of this function in terms of performance and ease of use?