Other characters after the second backslash are rejected (e.g. '\c\d').
[bug introduced in the sed-4.0.* releases]
+ sed no longer mishandles incomplete multibyte sequences in s,y commands
+ and valid multibyte SHIFT-JIS characters in character classes.
+ Previously, the following commands would fail:
+ LC_ALL=en_US.UTF-8 sed $'s/\316/X/'
+ LC_ALL=ja_JP.shiftjis sed $'/[\203]/]/p'
+ [bug introduced some time after sed-4.1.5 and before sed-4.2.1]
+
** Feature removal
The "L" command (format a paragraph like the fmt(1) command would)
int ch;
int state = 0;
int delim IF_LINT ( = 0) ;
- bool pending_mb = 0;
ch = inchar();
if (ch == '^')
for (;; ch = add_then_next (b, ch))
{
- pending_mb = BRLEN (ch, cur_stat) != 1;
+ const int mb_char = IS_MB_CHAR (ch, cur_stat);
switch (ch)
{
case '.':
case ':':
case '=':
- if (pending_mb)
+ if (mb_char)
continue;
if (state == 1)
continue;
case OPEN_BRACKET:
- if (pending_mb)
+ if (mb_char)
continue;
if (state == 0)
continue;
case CLOSE_BRACKET:
- if (pending_mb)
+ if (mb_char)
continue;
if (state == 0 || state == 1)
mbstate_t cur_stat = { 0, };
/* We allow only 1 byte characters for a slash. */
- if (BRLEN (slash, &cur_stat) == -2)
+ if (IS_MB_CHAR (slash, &cur_stat))
bad_prog (BAD_DELIM);
memset (&cur_stat, 0, sizeof cur_stat);
b = init_buffer();
while ((ch = inchar()) != EOF && ch != '\n')
{
- bool pending_mb = !MBSINIT (&cur_stat);
- if (BRLEN (ch, &cur_stat) == 1 && !pending_mb)
+ const int mb_char = IS_MB_CHAR (ch, &cur_stat);
+
+ if (!mb_char)
{
if (ch == slash)
return b;
int mb_cur_max;
bool is_utf8;
-/* Add a byte to the multibyte character represented by the state
- CUR_STAT, and answer its length if a character is completed,
- or -2 if it is yet to be completed. */
-int brlen (int ch, mbstate_t *cur_stat)
-{
- char c = ch;
+/* Return non-zero if CH is part of a valid multibyte sequence:
+ Either incomplete yet valid sequence (in case of a leading byte),
+ or the last byte of a valid multibyte sequence.
+
+ Return zero in all other cases:
+ CH is a valid single-byte character (e.g. 0x01-0x7F in UTF-8 locales);
+ CH is an invalid byte in a multibyte sequence for the current locale;
+ CH is the NUL byte.
- /* If we use the generic brlen, then MBRLEN == mbrlen. */
- int result = mbrtowc(NULL, &c, 1, cur_stat);
+ Reset CUR_STAT in the case of an invalid byte.
+*/
+int
+is_mb_char (int ch, mbstate_t *cur_stat)
+{
+ const char c = ch ;
+ const int mb_pending = !mbsinit (cur_stat);
+ const int result = mbrtowc (NULL, &c, 1, cur_stat);
- /* An invalid sequence is treated like a single-byte character. */
- if (result == -1)
+ switch (result)
{
+ case -2: /* Beginning or middle of valid multibyte sequence */
+ return 1;
+
+ case -1: /* Invalid sequence, byte treated like a single-byte character */
memset (cur_stat, 0, sizeof (mbstate_t));
+ return 0;
+
+ case 1: /* A valid byte, check if part of on-going multibyte sequence */
+ return mb_pending;
+
+ case 0: /* Special case of mbrtowc(3): the NUL character */
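+ /* TODO: test this.  NOTE: the function's header comment says the NUL
+ byte yields zero, yet this branch returns 1 -- confirm which is intended. */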
return 1;
- }
- return result;
+ default: /* Should never happen, as per mbrtowc(3) documentation */
+ panic ("is_mb_char: mbrtowc (0x%x) returned %d",ch,result);
+ }
}
void
#define MBRLEN(s, n, ps) \
(mb_cur_max == 1 ? 1 : mbrtowc (NULL, s, n, ps))
-#define BRLEN(ch, ps) \
- (mb_cur_max == 1 ? 1 : brlen (ch, ps))
+#define IS_MB_CHAR(ch, ps) \
+ (mb_cur_max == 1 ? 0 : is_mb_char (ch, ps))
-extern int brlen (int ch, mbstate_t *ps);
+extern int is_mb_char (int ch, mbstate_t *ps);
extern void initialize_mbcs (void);
extern void register_cleanup_file (char const *file);
extern void cancel_cleanup (void);
in-place-hyphen.sh \
in-place-suffix-backup.sh \
invalid-mb-seq-UMR.sh \
+ mb-charclass-non-utf8.sh \
+ mb-match-slash.sh \
normalize-text.sh \
nulldata.sh \
panic-tests.sh \
*) skip_ 'en_US.UTF-8 locale not found' ;;
esac
}
+
+# Some tests would fail without this particular locale.
+# If the locale is not available, just skip the test.
+# The exact spelling differs between operating systems
+# (ja_JP.shiftjis on Ubuntu, ja_JP.sjis on Debian, ja_JP.SJIS on Mac OS X).
+# If a shift-jis locale is found, the function sets shell variable
+# 'LOCALE_JA_SJIS' to the locale name.
+require_ja_shiftjis_locale_()
+{
+ path_prepend_ .
+ LOCALE_JA_SJIS=
+ for l in shiftjis sjis SJIS ; do
+ n=$(get-mb-cur-max ja_JP.$l) || continue
+ test 2 -eq "$n" || continue
+ LOCALE_JA_SJIS="ja_JP.$l"
+ break
+ done
+ test -z "$LOCALE_JA_SJIS" && skip_ 'ja_JP shift-jis locale not found'
+}
--- /dev/null
+#!/bin/sh
+# Test multibyte locale which is not UTF-8 (ja_JP.shift_jis)
+# This is a stateful locale. Same byte value can be either
+# a single-byte character, or the second byte of a multibyte
+# character.
+
+# Copyright (C) 2016 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+. "${srcdir=.}/init.sh"; path_prepend_ ../sed
+print_ver_ sed
+
+# If found, LOCALE_JA_SJIS will contain the locale name.
+require_ja_shiftjis_locale_
+
+fail=0
+
+# This test uses two characters:
+# Unicode Character 'KATAKANA LETTER ZE' (U+30BC)
+# Unicode Character 'KATAKANA LETTER ZO' (U+30BE)
+#
+# In SHIFT-JIS locale, these multibyte characters contain
+# open/close brackets (ASCII 0x5B/0x5D) as the trailing byte.
+#
+# See also:
+# https://en.wikipedia.org/wiki/Shift_JIS
+# http://www.rikai.com/library/kanjitables/kanji_codes.sjis.shtml
+
+# Unicode Character 'KATAKANA LETTER ZE' (U+30BC)
+#
+# UTF-8: hex: 0xE3 0x82 0xBC
+# bin: 11100011 10000010 10111100
+#
+# Shift-jis hex: 0x83 0x5B
+# oct: 203 133
+# bin: 10000011 01011011
+#
+# Conversion example:
+# $ printf '\x83\x5B' | iconv -f SHIFT-JIS -t UTF-8 | od -tx1o1c
+# 0000000 e3 82 bc
+# 343 202 274
+# 343 202 274
+
+# Unicode Character 'KATAKANA LETTER ZO' (U+30BE)
+#
+# UTF-8: hex: 0xE3 0x82 0xBE
+# bin: 11100011 10000010 10111110
+#
+# Shift-jis hex: 0x83 0x5D
+# oct: 203 135
+# bin: 10000011 01011101
+#
+# Conversion example:
+# $ printf '\x83\x5D' | iconv -f SHIFT-JIS -t UTF-8 | od -tx1o1c
+# 0000000 e3 82 be
+# 343 202 276
+# 343 202 276
+#
+
+
+#
+# Tests 1,2: Test y/// command with multibyte, non-utf8 sequence.
+# Implementation notes: str_append() has special code path for non-utf8 cases.
+#
+
+# Test 1: valid multibyte sequence
+printf 'y/a/\203\133/' > p1 || framework_failure_
+echo Xa > in1 || framework_failure_
+printf 'X\203\133\n' > exp1 || framework_failure_
+
+LC_ALL="$LOCALE_JA_SJIS" sed -f p1 <in1 >out1 || fail=1
+compare_ exp1 out1 || fail=1
+
+# Test 2: invalid multibyte sequence, treated as two single-byte characters.
+printf 'y/aa/\203\060/' > p2 || framework_failure_
+LC_ALL="$LOCALE_JA_SJIS" sed -f p2 </dev/null 2>out2 || fail=1
+compare_ /dev/null out2 || fail=1
+
+#
+# Test 3: multibyte character class with these characters.
+#
+# Before sed-4.3, snarf_char_class would parse it incorrectly,
+# treating the first closing-bracket as closing the character-class,
+# instead of being part of a multibyte sequence.
+
+printf '/[\203]/]/p' > p3 || framework_failure_
+LC_ALL="$LOCALE_JA_SJIS" sed -f p3 </dev/null >out3 || fail=1
+compare_ /dev/null out3 || fail=1
+
+# Test 4:
+# Same as test 3, but with the other multibyte character.
+# (this did not cause a failure before sed-4.3, but the code was incorrect).
+# Keep this test for code-coverage purposes.
+printf '/[\203[/]/p' > p4 || framework_failure_
+LC_ALL="$LOCALE_JA_SJIS" sed -f p4 </dev/null >out4 || fail=1
+compare_ /dev/null out4 || fail=1
+
+# TODO: Find a locale in which ':.=' can be part of a valid multibyte sequence.
+#
+# snarf_char_class specifically tests for five bytes: ':.=[]' .
+# '[' and ']' are tested above, yet '.:=' are not valid as part of a
+# multibyte shift-jis sequence.
+#
+# valid:
+# $ printf '\203]' | iconv -f SHIFT-JIS -t utf-8
+# $ printf '\203[' | iconv -f SHIFT-JIS -t utf-8
+#
+# invalid:
+# $ printf '\203:' | iconv -f SHIFT-JIS -t utf-8
+# iconv: (stdin):1:0: cannot convert
+#
+# $ printf '\203=' | iconv -f SHIFT-JIS -t utf-8
+# iconv: (stdin):1:0: cannot convert
+#
+# $ printf '\203.' | iconv -f SHIFT-JIS -t utf-8
+# iconv: (stdin):0:0: cannot convert
+
+Exit $fail
--- /dev/null
+#!/bin/sh
+# Test slash following an incomplete multibyte character
+
+# Copyright (C) 2016 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+. "${srcdir=.}/init.sh"; path_prepend_ ../sed
+print_ver_ sed
+
+require_en_utf8_locale_
+
+fail=0
+
+# Before sed-4.3, a slash following an incomplete multibyte character
+# would be ignored during program compilation, leading to an error.
+
+
+# Test 1: match_slash in 's' command.
+# Before sed-4.3, this would fail with "unterminated `s' command".
+printf 's/\316/X/' > p1 || framework_failure_
+LC_ALL=en_US.UTF-8 sed -f p1 </dev/null >out1 || fail=1
+compare_ /dev/null out1 || fail=1
+
+# Test 2: match_slash in address regex.
+# Before sed-4.3, this would fail with "unterminated address regex".
+printf '/\316/p' >p2 || framework_failure_
+LC_ALL=en_US.UTF-8 sed -f p2 </dev/null >out2 || fail=1
+compare_ /dev/null out2 || fail=1
+
+# Test 3: match_slash in 'y' command.
+# Before sed-4.3, this would fail with "unterminated `y' command".
+printf 'y/\316/X/' >p3 || framework_failure_
+LC_ALL=en_US.UTF-8 sed -f p3 </dev/null >out3 || fail=1
+compare_ /dev/null out3 || fail=1
+
+
+Exit $fail