From: Assaf Gordon Date: Thu, 23 Jun 2016 00:34:01 +0000 (-0400) Subject: sed: fix minor multibyte parsing bug X-Git-Tag: v4.3~71 X-Git-Url: https://gitweb.git.savannah.gnu.org/gitweb/?a=commitdiff_plain;h=0052daf1e5651c4772d75b595dfc714e0e20587e;p=sed.git sed: fix minor multibyte parsing bug Previously sed would parse multibyte characters incorrectly in two scenarios: 1. Slash following an incomplete-yet-valid multibyte sequence (match_slash): $ LC_ALL=en_US.UTF-8 sed $'s/\316/X/' sed: -e expression #1, char 6: unterminated `s' command 2. Open/close brackets as part of a valid mutilbyte string inside a character class (snarf_char_class). In the example below, '\203]' is a valid multibyte character in SHIFT-JIS locale: $ LC_ALL=ja_JP.shiftjis sed $'/[\203]/]/p' sed: -e expression #1, char #5: Unmatched [ or [^ Both cases stem from mbcs.c:brlen() being non-intuitive: It returned 1 for valid single-byte character, invalid multibyte-character, and a for the last byte of a valid multibyte sequence - making it non-trivial to use correctly. This commit replaces brlen() with a simpler is_mb_char() function: returns non-zero for multibyte sequences, zero for single/invalid sequences. * sed/sed.h: (BRLEN, brlen): Remove delaration. (IS_MB_CHAR,is_mb_char): Add macro and function declaration. * sed/mbcs.c: (brlen): Remove function. (is_mb_char): New function. * sed/compile.c: (snarf_char_class, match_slash): Use IS_MB_CHAR instead of BRLEN; Adjust local variables accordingly. * testsuite/mb-match-slash.sh: New test for scenario 1. * testsuite/mb-charclass-non-utf8.sh: New test for scenario 2, requires SHIFT-JIS locale. * testsuite/Makefile.am: Add new tests * testsuite/init.cfg: (require_ja_shiftjis_locale_): New function. * NEWS: Mention bug fix. --- diff --git a/NEWS b/NEWS index 4399b74..6af8b9a 100644 --- a/NEWS +++ b/NEWS @@ -54,6 +54,13 @@ GNU sed NEWS -*- outline -*- Other characters after the second backslash are rejected (e.g. '\c\d'). [bug introduced in the sed-4.0.* releases] + sed no longer mishandles incomplete multibyte sequences in s,y commands + and valid multibyte SHIFT-JIS characters in character classes. + Previously, the following commands would fail: + LC_ALL=en_US.UTF-8 sed $'s/\316/X/' + LC_ALL=ja_JP.shiftjis sed $'/[\203]/]/p' + [bug introduced some time after sed-4.1.5 and before sed-4.2.1] + ** Feature removal The "L" command (format a paragraph like the fmt(1) command would) diff --git a/sed/compile.c b/sed/compile.c index c039c24..ca09e78 100644 --- a/sed/compile.c +++ b/sed/compile.c @@ -432,7 +432,6 @@ snarf_char_class (struct buffer *b, mbstate_t *cur_stat) int ch; int state = 0; int delim IF_LINT ( = 0) ; - bool pending_mb = 0; ch = inchar(); if (ch == '^') @@ -448,7 +447,7 @@ snarf_char_class (struct buffer *b, mbstate_t *cur_stat) for (;; ch = add_then_next (b, ch)) { - pending_mb = BRLEN (ch, cur_stat) != 1; + const int mb_char = IS_MB_CHAR (ch, cur_stat); switch (ch) { @@ -459,7 +458,7 @@ snarf_char_class (struct buffer *b, mbstate_t *cur_stat) case '.': case ':': case '=': - if (pending_mb) + if (mb_char) continue; if (state == 1) @@ -475,7 +474,7 @@ snarf_char_class (struct buffer *b, mbstate_t *cur_stat) continue; case OPEN_BRACKET: - if (pending_mb) + if (mb_char) continue; if (state == 0) @@ -483,7 +482,7 @@ snarf_char_class (struct buffer *b, mbstate_t *cur_stat) continue; case CLOSE_BRACKET: - if (pending_mb) + if (mb_char) continue; if (state == 0 || state == 1) @@ -512,7 +511,7 @@ match_slash (int slash, int regex) mbstate_t cur_stat = { 0, }; /* We allow only 1 byte characters for a slash. */ - if (BRLEN (slash, &cur_stat) == -2) + if (IS_MB_CHAR (slash, &cur_stat)) bad_prog (BAD_DELIM); memset (&cur_stat, 0, sizeof cur_stat); @@ -520,8 +519,9 @@ match_slash (int slash, int regex) b = init_buffer(); while ((ch = inchar()) != EOF && ch != '\n') { - bool pending_mb = !MBSINIT (&cur_stat); - if (BRLEN (ch, &cur_stat) == 1 && !pending_mb) + const int mb_char = IS_MB_CHAR (ch, &cur_stat); + + if (!mb_char) { if (ch == slash) return b; diff --git a/sed/mbcs.c b/sed/mbcs.c index f06f3f3..bce39fa 100644 --- a/sed/mbcs.c +++ b/sed/mbcs.c @@ -24,24 +24,43 @@ int mb_cur_max; bool is_utf8; -/* Add a byte to the multibyte character represented by the state - CUR_STAT, and answer its length if a character is completed, - or -2 if it is yet to be completed. */ -int brlen (int ch, mbstate_t *cur_stat) -{ - char c = ch; +/* Return non-zero if CH is part of a valid multibyte sequence: + Either incomplete yet valid sequence (in case of a leading byte), + or the last byte of a valid multibyte sequence. + + Return zero in all other cases: + CH is a valid single-byte character (e.g. 0x01-0x7F in UTF-8 locales); + CH is an invalid byte in a multibyte sequence for the currentl locale, + CH is the NUL byte. - /* If we use the generic brlen, then MBRLEN == mbrlen. */ - int result = mbrtowc(NULL, &c, 1, cur_stat); + Reset CUR_STAT in the case of an invalid byte. +*/ +int +is_mb_char (int ch, mbstate_t *cur_stat) +{ + const char c = ch ; + const int mb_pending = !mbsinit (cur_stat); + const int result = mbrtowc (NULL, &c, 1, cur_stat); - /* An invalid sequence is treated like a single-byte character. */ - if (result == -1) + switch (result) { + case -2: /* Beginning or middle of valid multibyte sequence */ + return 1; + + case -1: /* Invalid sequence, byte treated like a single-byte character */ memset (cur_stat, 0, sizeof (mbstate_t)); + return 0; + + case 1: /* A valid byte, check if part of on-going multibyte sequence */ + return mb_pending; + + case 0: /* Special case of mbrtowc(3): the NUL character */ + /* TODO: test this */ return 1; - } - return result; + default: /* Should never happen, as per mbrtowc(3) documentation */ + panic ("is_mb_char: mbrtowc (0x%x) returned %d",ch,result); + } } void diff --git a/sed/sed.h b/sed/sed.h index 8fe2147..bbddd25 100644 --- a/sed/sed.h +++ b/sed/sed.h @@ -254,10 +254,10 @@ extern bool is_utf8; #define MBRLEN(s, n, ps) \ (mb_cur_max == 1 ? 1 : mbrtowc (NULL, s, n, ps)) -#define BRLEN(ch, ps) \ - (mb_cur_max == 1 ? 1 : brlen (ch, ps)) +#define IS_MB_CHAR(ch, ps) \ + (mb_cur_max == 1 ? 0 : is_mb_char (ch, ps)) -extern int brlen (int ch, mbstate_t *ps); +extern int is_mb_char (int ch, mbstate_t *ps); extern void initialize_mbcs (void); extern void register_cleanup_file (char const *file); extern void cancel_cleanup (void); diff --git a/testsuite/Makefile.am b/testsuite/Makefile.am index 6019d85..5ba4539 100644 --- a/testsuite/Makefile.am +++ b/testsuite/Makefile.am @@ -19,6 +19,8 @@ T = \ in-place-hyphen.sh \ in-place-suffix-backup.sh \ invalid-mb-seq-UMR.sh \ + mb-charclass-non-utf8.sh \ + mb-match-slash.sh \ normalize-text.sh \ nulldata.sh \ panic-tests.sh \ diff --git a/testsuite/init.cfg b/testsuite/init.cfg index e3950e4..2d685e2 100644 --- a/testsuite/init.cfg +++ b/testsuite/init.cfg @@ -51,3 +51,22 @@ require_en_utf8_locale_() *) skip_ 'en_US.UTF-8 locale not found' ;; esac } + +# Some tests would fail without this particular locale. +# If the locale is not available, just skip the test. +# The exact spelling differs between operating systems +# (ja_JP.shiftjis on Ubuntu, ja_JP.sjis on Debian, ja_JP.SJIS on Mac OS X). +# If a sjift-jis locale is found the function sets shell variable +# 'LOCALE_JA_SJIS' to the locale name. +require_ja_shiftjis_locale_() +{ + path_prepend_ . + LOCALE_JA_SJIS= + for l in shiftjis sjis SJIS ; do + n=$(get-mb-cur-max ja_JP.$l) || continue + test 2 -eq "$n" || continue + LOCALE_JA_SJIS="ja_JP.$l" + break + done + test -z "$LOCALE_JA_SJIS" && skip_ 'ja_JP shift-jis locale not found' +} diff --git a/testsuite/mb-charclass-non-utf8.sh b/testsuite/mb-charclass-non-utf8.sh new file mode 100755 index 0000000..ea68d7a --- /dev/null +++ b/testsuite/mb-charclass-non-utf8.sh @@ -0,0 +1,129 @@ +#!/bin/sh +# Test multibyte locale which is not UTF-8 (ja_JP.shift_jis) +# This is a stateful locale. Same byte value can be either +# a single-byte character, or the second byte of a multibyte +# character. + +# Copyright (C) 2016 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +. "${srcdir=.}/init.sh"; path_prepend_ ../sed +print_ver_ sed + +# If found, LOCALE_JA_SJIS will contain the locale name. +require_ja_shiftjis_locale_ + +fail=0 + +# This test uses two characters: +# Unicode Character 'KATAKANA LETTER ZE' (U+30BC) +# Unicode Character 'KATAKANA LETTER ZO' (U+30BE) +# +# In SHIFT-JIS locale, these multibyte characters contain +# open/close brackets (ASCII 0x5B/0x5D) as the trailing byte. +# +# See also: +# https://en.wikipedia.org/wiki/Shift_JIS +# http://www.rikai.com/library/kanjitables/kanji_codes.sjis.shtml + +# Unicode Character 'KATAKANA LETTER ZE' (U+30BC) +# +# UTF-8: hex: 0xE3 0x82 0xBC +# bin: 11100011 10000010 10111100 +# +# Shift-jis hex: 0x83 0x5B +# oct: 203 133 +# bin: 10000011 01011011 +# +# Conversion example: +# $ printf '\x83\x5B' | iconv -f SHIFT-JIS -t UTF-8 | od -tx1o1c +# 0000000 e3 82 bc +# 343 202 274 +# 343 202 274 + +# Unicode Character 'KATAKANA LETTER ZO' (U+30BE) +# +# UTF-8: hex: 0xE3 0x82 0xBE +# bin: 11100011 10000010 10111110 +# +# Shift-jis hex: 0x83 0x5D +# oct: 203 135 +# bin: 10000011 01011101 +# +# Conversion example: +# $ printf '\x83\x5D' | iconv -f SHIFT-JIS -t UTF-8 | od -tx1o1c +# 0000000 e3 82 be +# 343 202 276 +# 343 202 276 +# + + +# +# Tests 1,2: Test y/// command with multibyte, non-utf8 seqeunce. +# Implmenetation notes: str_append() has special code path for non-utf8 cases. +# + +# Test 1: valid multibyte seqeunce +printf 'y/a/\203\133/' > p1 || framework_failure_ +echo Xa > in1 || framework_failure_ +printf 'X\203\133\n' > exp1 || framework_failure_ + +LC_ALL="$LOCALE_JA_SJIS" sed -f p1 out1 || fail=1 +compare_ exp1 out1 || fail=1 + +# Test 2: invalid multibyte seqeunce, treated as two single-byte characters. +printf 'y/aa/\203\060/' > p2 || framework_failure_ +LC_ALL="$LOCALE_JA_SJIS" sed -f p2 out2 || fail=1 +compare_ /dev/null out2 || fail=1 + +# +# Test 3: multibyte character class with these characters. +# +# Before sed-4.3, snarf_char_class would parse it incorrectly, +# Treating the first closing-bracket as closing the character-class, +# instead of being part of a multibyte sequence. + +printf '/[\203]/]/p' > p3 || framework_failure_ +LC_ALL="$LOCALE_JA_SJIS" sed -f p3 out3 || fail=1 +compare_ /dev/null out3 || fail=1 + +# Test 4: +# Same as test 3, but with the other multibyte character. +# (this did not cause a failure before sed-4.3, but the code was incorrect). +# Keep this test for code-coverage purposes. +printf '/[\203[/]/p' > p4 || framework_failure_ +LC_ALL="$LOCALE_JA_SJIS" sed -f p4 out4 || fail=1 +compare_ /dev/null out4 || fail=1 + +# TODO: Find a locale in which ':.=' can be part of a valid multibyte octet. +# +# snarf_char_class specifically tests for five bytes: ':.=[]' . +# '[' and ']' are tested above, yet '.:=' are not valid as part of a +# multibyte shift-jis sequence. +# +# valid: +# $ printf '\203]' | iconv -f SHIFT-JIS -t utf-8 +# $ printf '\203[' | iconv -f SHIFT-JIS -t utf-8 +# +# invalid: +# $ printf '\203:' | iconv -f SHIFT-JIS -t utf-8 +# iconv: (stdin):1:0: cannot convert +# +# $ printf '\203=' | iconv -f SHIFT-JIS -t utf-8 +# iconv: (stdin):1:0: cannot convert +# +# $ printf '\203.' | iconv -f SHIFT-JIS -t utf-8 +# iconv: (stdin):0:0: cannot convert + +Exit $fail diff --git a/testsuite/mb-match-slash.sh b/testsuite/mb-match-slash.sh new file mode 100755 index 0000000..c0a30f5 --- /dev/null +++ b/testsuite/mb-match-slash.sh @@ -0,0 +1,48 @@ +#!/bin/sh +# Test slash following an incomplete multibyte character + +# Copyright (C) 2016 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +. "${srcdir=.}/init.sh"; path_prepend_ ../sed +print_ver_ sed + +require_en_utf8_locale_ + +fail=0 + +# before sed-4.3, a slash following an incomplete multibyte character +# would be ignored during program compilation, leading to an error. + + +# Test 1: match_slash in 's' command. +# Before sed-4.3, this would fail with "unterminated `s' command". +printf 's/\316/X/' > p1 || framework_failure_ +LC_ALL=en_US.UTF-8 sed -f p1 out1 || fail=1 +compare_ /dev/null out1 || fail=1 + +# Test 2: match_slash in address regex. +# Before sed-4.3, this would fail with "unterminated address regex". +printf '/\316/p' >p2 || framework_failure_ +LC_ALL=en_US.UTF-8 sed -f p2 out2 || fail=1 +compare_ /dev/null out2 || fail=1 + +# Test 3: match_slash in 'y' command.. +# Before sed-4.3, this would fail with "unterminated `y' command". +printf 'y/\316/X/' >p3 || framework_failure_ +LC_ALL=en_US.UTF-8 sed -f p3 out3 || fail=1 +compare_ /dev/null out3 || fail=1 + + +Exit $fail