A small function that handles both combining diacritics and multi-byte characters. Unicode strings can have:
- Combining diacritics such as b̃, which is composed of the b character and a following ~ diacritic generated by the unicode escape sequence \u0303;
- Multi-byte characters such as 🎥, which is generated by the multi-byte unicode escape sequence \uD83C\uDFA5; and
- Multiple characters combined together with a zero-width joiner character (given by the unicode escape sequence \u200D). For example, the character 👨‍👩‍👦 can be composed using the individual (multi-byte) emojis 👨 then a zero-width joiner then 👩 then another zero-width joiner then 👦, such that the entire 3-person character is 8 UTF-16 code units (\uD83D\uDC68\u200D\uD83D\uDC69\u200D\uD83D\uDC66).
This will handle reversing all 3 cases and keeping the bytes in the correct order such that the characters are reversed (rather than naively reversing the bytes of the string):
(function () {
  // Inclusive UTF-16 code-unit ranges treated as combining diacritics.
  const COMBINING_RANGES = [
    [0x0300, 0x036f], // Comb. Diacritical Marks
    [0x1ab0, 0x1aff], // Comb. Diacritical Marks Extended
    [0x1dc0, 0x1dff], // Comb. Diacritical Marks Supplement
    [0x20d0, 0x20ff], // Comb. Diacritical Marks for Symbols
    [0xfe20, 0xfe2f], // Comb. Half Marks
  ];
  const ZERO_WIDTH_JOINER = 0x200d;

  /** True when `code` falls inside one of the combining-diacritic ranges. */
  const isCombiningDiacritic = (code) =>
    COMBINING_RANGES.some(([low, high]) => low <= code && code <= high);

  /** True when `code` is the first (high) half of a surrogate pair. */
  const isHighSurrogate = (code) => 0xd800 <= code && code <= 0xdbff;

  /** True when `code` is the second (low) half of a surrogate pair. */
  const isLowSurrogate = (code) => 0xdc00 <= code && code <= 0xdfff;

  /**
   * Reverses the string character by character rather than code unit by
   * code unit. Three kinds of sequence are kept intact and in their original
   * internal order: a base character followed by combining diacritics, a
   * surrogate pair, and characters glued together by zero-width joiners.
   *
   * @returns {string} The reversed string.
   */
  String.prototype.reverse = function () {
    const source = String(this);
    let output = "";
    let i = source.length;

    // Walk backwards; each pass peels one whole cluster off the end.
    while (i > 0) {
      const end = i;

      // Trailing combining marks belong to the base character before them.
      while (i > 0 && isCombiningDiacritic(source.charCodeAt(i - 1))) {
        i -= 1;
      }

      // If the marks reached the start of the string there is no base
      // character to consume; stepping further would make the index negative.
      if (i > 0) {
        let hasZeroWidthJoiner = false;
        do {
          i -= 1;
          // Keep a low surrogate together with its preceding high surrogate.
          if (
            i > 0 &&
            isLowSurrogate(source.charCodeAt(i)) &&
            isHighSurrogate(source.charCodeAt(i - 1))
          ) {
            i -= 1;
          }
          // A joiner in front means the previous character is part of this cluster.
          hasZeroWidthJoiner =
            i > 0 && source.charCodeAt(i - 1) === ZERO_WIDTH_JOINER;
          if (hasZeroWidthJoiner) {
            i -= 1;
          }
        } while (i > 0 && hasZeroWidthJoiner);
      }

      output += source.slice(i, end);
    }
    return output;
  };
})();

// Tests
[
  'abcdefg',
  'ab\u0303c',
  'a\uD83C\uDFA5b',
  'a\uD83C\uDFA5b\uD83C\uDFA6c',
  'a\uD83C\uDFA5b\u0306c\uD83C\uDFA6d',
  'TO͇̹̺ͅƝ̴ȳ̳ TH̘Ë͖́̉ ͠P̯͍̭O̚N̐Y̡', // copied from http://stackoverflow.com/a/1732454/1509264
  'What 👨👩👦 is this?',
].forEach((str) => {
  console.log(`${str} -> ${str.reverse()}`);
});
// The above code identifies some of the more commonly used combining diacritics. A more complete list of combining diacritics (that could be swapped into the above code) is:
Update 2:
Handling zero-width joiners.
(function () {
  // Inclusive UTF-16 code-unit ranges treated as combining diacritics.
  const COMBINING_RANGES = [
    [0x0300, 0x036f], // Comb. Diacritical Marks
    [0x1ab0, 0x1aff], // Comb. Diacritical Marks Extended
    [0x1dc0, 0x1dff], // Comb. Diacritical Marks Supplement
    [0x20d0, 0x20ff], // Comb. Diacritical Marks for Symbols
    [0xfe20, 0xfe2f], // Comb. Half Marks
  ];
  const ZERO_WIDTH_JOINER = 0x200d;

  /** True when `code` falls inside one of the combining-diacritic ranges. */
  const isCombiningDiacritic = (code) =>
    COMBINING_RANGES.some(([low, high]) => low <= code && code <= high);

  /** True when `code` is the first (high) half of a surrogate pair. */
  const isHighSurrogate = (code) => 0xd800 <= code && code <= 0xdbff;

  /** True when `code` is the second (low) half of a surrogate pair. */
  const isLowSurrogate = (code) => 0xdc00 <= code && code <= 0xdfff;

  /**
   * Reverses the string character by character rather than code unit by
   * code unit. Three kinds of sequence are kept intact and in their original
   * internal order: a base character followed by combining diacritics, a
   * surrogate pair, and characters glued together by zero-width joiners.
   *
   * @returns {string} The reversed string.
   */
  String.prototype.reverse = function () {
    const source = String(this);
    let output = "";
    let i = source.length;

    // Walk backwards; each pass peels one whole cluster off the end.
    while (i > 0) {
      const end = i;

      // Trailing combining marks belong to the base character before them.
      while (i > 0 && isCombiningDiacritic(source.charCodeAt(i - 1))) {
        i -= 1;
      }

      // If the marks reached the start of the string there is no base
      // character to consume; stepping further would make the index negative.
      if (i > 0) {
        let hasZeroWidthJoiner = false;
        do {
          i -= 1;
          // Keep a low surrogate together with its preceding high surrogate.
          if (
            i > 0 &&
            isLowSurrogate(source.charCodeAt(i)) &&
            isHighSurrogate(source.charCodeAt(i - 1))
          ) {
            i -= 1;
          }
          // A joiner in front means the previous character is part of this cluster.
          hasZeroWidthJoiner =
            i > 0 && source.charCodeAt(i - 1) === ZERO_WIDTH_JOINER;
          if (hasZeroWidthJoiner) {
            i -= 1;
          }
        } while (i > 0 && hasZeroWidthJoiner);
      }

      output += source.slice(i, end);
    }
    return output;
  };
})();

// Tests
[
  'abcdefg',
  'ab\u0303c',
  'a\uD83C\uDFA5b',
  'a\uD83C\uDFA5b\uD83C\uDFA6c',
  'a\uD83C\uDFA5b\u0306c\uD83C\uDFA6d',
  'TO͇̹̺ͅƝ̴ȳ̳ TH̘Ë͖́̉ ͠P̯͍̭O̚N̐Y̡', // copied from http://stackoverflow.com/a/1732454/1509264
  'What 👨👩👦 is this?',
].forEach((str) => {
  console.log(`${str} -> ${str.reverse()}`);
});