From: Assaf Gordon <assafgordon@gmail.com>
Date: Thu, 23 Jun 2016 00:34:01 +0000 (-0400)
Subject: sed: fix minor multibyte parsing bug
X-Git-Tag: v4.3~71
X-Git-Url: https://gitweb.git.savannah.gnu.org/gitweb/?a=commitdiff_plain;h=0052daf1e5651c4772d75b595dfc714e0e20587e;p=sed.git

sed: fix minor multibyte parsing bug

Previously sed would parse multibyte characters incorrectly in two scenarios:

1. Slash following an incomplete-yet-valid multibyte sequence (match_slash):
    $ LC_ALL=en_US.UTF-8 sed $'s/\316/X/'
    sed: -e expression #1, char 6: unterminated `s' command

2. Open/close brackets as part of a valid mutilbyte string inside a character
class (snarf_char_class). In the example below, '\203]' is a valid
multibyte character in SHIFT-JIS locale:

    $ LC_ALL=ja_JP.shiftjis sed $'/[\203]/]/p'
    sed: -e expression #1, char #5: Unmatched [ or [^

Both cases stem from mbcs.c:brlen() being non-intuitive:
It returned 1 for valid single-byte character, invalid multibyte-character,
and a for the last byte of a valid multibyte sequence - making it
non-trivial to use correctly.

This commit replaces brlen() with a simpler is_mb_char() function:
returns non-zero for multibyte sequences, zero for single/invalid sequences.

* sed/sed.h: (BRLEN, brlen): Remove delaration.
(IS_MB_CHAR,is_mb_char): Add macro and function declaration.
* sed/mbcs.c: (brlen): Remove function. (is_mb_char): New function.
* sed/compile.c: (snarf_char_class, match_slash): Use IS_MB_CHAR instead of
BRLEN; Adjust local variables accordingly.
* testsuite/mb-match-slash.sh: New test for scenario 1.
* testsuite/mb-charclass-non-utf8.sh: New test for scenario 2,
requires SHIFT-JIS locale.
* testsuite/Makefile.am: Add new tests
* testsuite/init.cfg: (require_ja_shiftjis_locale_): New function.
* NEWS: Mention bug fix.
---

diff --git a/NEWS b/NEWS
index 4399b74..6af8b9a 100644
--- a/NEWS
+++ b/NEWS
@@ -54,6 +54,13 @@ GNU sed NEWS                                    -*- outline -*-
   Other characters after the second backslash are rejected (e.g. '\c\d').
   [bug introduced in the sed-4.0.* releases]
 
+  sed no longer mishandles incomplete multibyte sequences in s,y commands
+  and valid multibyte SHIFT-JIS characters in character classes.
+  Previously, the following commands would fail:
+    LC_ALL=en_US.UTF-8 sed $'s/\316/X/'
+    LC_ALL=ja_JP.shiftjis sed $'/[\203]/]/p'
+  [bug introduced some time after sed-4.1.5 and before sed-4.2.1]
+
 ** Feature removal
 
   The "L" command (format a paragraph like the fmt(1) command would)
diff --git a/sed/compile.c b/sed/compile.c
index c039c24..ca09e78 100644
--- a/sed/compile.c
+++ b/sed/compile.c
@@ -432,7 +432,6 @@ snarf_char_class (struct buffer *b, mbstate_t *cur_stat)
   int ch;
   int state = 0;
   int delim IF_LINT ( = 0) ;
-  bool pending_mb = 0;
 
   ch = inchar();
   if (ch == '^')
@@ -448,7 +447,7 @@ snarf_char_class (struct buffer *b, mbstate_t *cur_stat)
 
   for (;; ch = add_then_next (b, ch))
     {
-      pending_mb = BRLEN (ch, cur_stat) != 1;
+      const int mb_char = IS_MB_CHAR (ch, cur_stat);
 
       switch (ch)
         {
@@ -459,7 +458,7 @@ snarf_char_class (struct buffer *b, mbstate_t *cur_stat)
         case '.':
         case ':':
         case '=':
-          if (pending_mb)
+          if (mb_char)
             continue;
 
           if (state == 1)
@@ -475,7 +474,7 @@ snarf_char_class (struct buffer *b, mbstate_t *cur_stat)
           continue;
 
         case OPEN_BRACKET:
-          if (pending_mb)
+          if (mb_char)
             continue;
 
           if (state == 0)
@@ -483,7 +482,7 @@ snarf_char_class (struct buffer *b, mbstate_t *cur_stat)
           continue;
 
         case CLOSE_BRACKET:
-          if (pending_mb)
+          if (mb_char)
             continue;
 
           if (state == 0 || state == 1)
@@ -512,7 +511,7 @@ match_slash (int slash, int regex)
   mbstate_t cur_stat = { 0, };
 
   /* We allow only 1 byte characters for a slash.  */
-  if (BRLEN (slash, &cur_stat) == -2)
+  if (IS_MB_CHAR (slash, &cur_stat))
     bad_prog (BAD_DELIM);
 
   memset (&cur_stat, 0, sizeof cur_stat);
@@ -520,8 +519,9 @@ match_slash (int slash, int regex)
   b = init_buffer();
   while ((ch = inchar()) != EOF && ch != '\n')
     {
-      bool pending_mb = !MBSINIT (&cur_stat);
-      if (BRLEN (ch, &cur_stat) == 1 && !pending_mb)
+      const int mb_char = IS_MB_CHAR (ch, &cur_stat);
+
+      if (!mb_char)
         {
           if (ch == slash)
             return b;
diff --git a/sed/mbcs.c b/sed/mbcs.c
index f06f3f3..bce39fa 100644
--- a/sed/mbcs.c
+++ b/sed/mbcs.c
@@ -24,24 +24,43 @@
 int mb_cur_max;
 bool is_utf8;
 
-/* Add a byte to the multibyte character represented by the state
-   CUR_STAT, and answer its length if a character is completed,
-   or -2 if it is yet to be completed.  */
-int brlen (int ch, mbstate_t *cur_stat)
-{
-  char c = ch;
+/* Return non-zero if CH is part of a valid multibyte sequence:
+   Either incomplete yet valid sequence (in case of a leading byte),
+   or the last byte of a valid multibyte sequence.
+
+   Return zero in all other cases:
+    CH is a valid single-byte character (e.g. 0x01-0x7F in UTF-8 locales);
+    CH is an invalid byte in a multibyte sequence for the currentl locale,
+    CH is the NUL byte.
 
-  /* If we use the generic brlen, then MBRLEN == mbrlen.  */
-  int result = mbrtowc(NULL, &c, 1, cur_stat);
+   Reset CUR_STAT in the case of an invalid byte.
+*/
+int
+is_mb_char (int ch, mbstate_t *cur_stat)
+{
+  const char c = ch ;
+  const int mb_pending = !mbsinit (cur_stat);
+  const int result = mbrtowc (NULL, &c, 1, cur_stat);
 
-  /* An invalid sequence is treated like a single-byte character. */
-  if (result == -1)
+  switch (result)
     {
+    case -2: /* Beginning or middle of valid multibyte sequence */
+      return 1;
+
+    case -1: /* Invalid sequence, byte treated like a single-byte character */
       memset (cur_stat, 0, sizeof (mbstate_t));
+      return 0;
+
+    case 1: /* A valid byte, check if part of on-going multibyte sequence */
+      return mb_pending;
+
+    case 0: /* Special case of mbrtowc(3): the NUL character */
+      /* TODO: test this */
       return 1;
-    }
 
-  return result;
+    default: /* Should never happen, as per mbrtowc(3) documentation */
+      panic ("is_mb_char: mbrtowc (0x%x) returned %d",ch,result);
+    }
 }
 
 void
diff --git a/sed/sed.h b/sed/sed.h
index 8fe2147..bbddd25 100644
--- a/sed/sed.h
+++ b/sed/sed.h
@@ -254,10 +254,10 @@ extern bool is_utf8;
 #define MBRLEN(s, n, ps) \
   (mb_cur_max == 1 ? 1 : mbrtowc (NULL, s, n, ps))
 
-#define BRLEN(ch, ps) \
-  (mb_cur_max == 1 ? 1 : brlen (ch, ps))
+#define IS_MB_CHAR(ch, ps)                \
+  (mb_cur_max == 1 ? 0 : is_mb_char (ch, ps))
 
-extern int brlen (int ch, mbstate_t *ps);
+extern int is_mb_char (int ch, mbstate_t *ps);
 extern void initialize_mbcs (void);
 extern void register_cleanup_file (char const *file);
 extern void cancel_cleanup (void);
diff --git a/testsuite/Makefile.am b/testsuite/Makefile.am
index 6019d85..5ba4539 100644
--- a/testsuite/Makefile.am
+++ b/testsuite/Makefile.am
@@ -19,6 +19,8 @@ T =				\
   in-place-hyphen.sh		\
   in-place-suffix-backup.sh	\
   invalid-mb-seq-UMR.sh		\
+  mb-charclass-non-utf8.sh	\
+  mb-match-slash.sh		\
   normalize-text.sh		\
   nulldata.sh			\
   panic-tests.sh		\
diff --git a/testsuite/init.cfg b/testsuite/init.cfg
index e3950e4..2d685e2 100644
--- a/testsuite/init.cfg
+++ b/testsuite/init.cfg
@@ -51,3 +51,22 @@ require_en_utf8_locale_()
     *) skip_ 'en_US.UTF-8 locale not found' ;;
   esac
 }
+
+# Some tests would fail without this particular locale.
+# If the locale is not available, just skip the test.
+# The exact spelling differs between operating systems
+# (ja_JP.shiftjis on Ubuntu, ja_JP.sjis on Debian, ja_JP.SJIS on Mac OS X).
+# If a sjift-jis locale is found the function sets shell variable
+# 'LOCALE_JA_SJIS' to the locale name.
+require_ja_shiftjis_locale_()
+{
+  path_prepend_ .
+  LOCALE_JA_SJIS=
+  for l in shiftjis sjis SJIS ; do
+      n=$(get-mb-cur-max ja_JP.$l) || continue
+      test 2 -eq "$n" || continue
+      LOCALE_JA_SJIS="ja_JP.$l"
+      break
+  done
+  test -z "$LOCALE_JA_SJIS" && skip_ 'ja_JP shift-jis locale not found'
+}
diff --git a/testsuite/mb-charclass-non-utf8.sh b/testsuite/mb-charclass-non-utf8.sh
new file mode 100755
index 0000000..ea68d7a
--- /dev/null
+++ b/testsuite/mb-charclass-non-utf8.sh
@@ -0,0 +1,129 @@
+#!/bin/sh
+# Test multibyte locale which is not UTF-8 (ja_JP.shift_jis)
+# This is a stateful locale. Same byte value can be either
+# a single-byte character, or the second byte of a multibyte
+# character.
+
+# Copyright (C) 2016 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+. "${srcdir=.}/init.sh"; path_prepend_ ../sed
+print_ver_ sed
+
+# If found, LOCALE_JA_SJIS will contain the locale name.
+require_ja_shiftjis_locale_
+
+fail=0
+
+# This test uses two characters:
+# Unicode Character 'KATAKANA LETTER ZE' (U+30BC)
+# Unicode Character 'KATAKANA LETTER ZO' (U+30BE)
+#
+# In SHIFT-JIS locale, these multibyte characters contain
+# open/close brackets (ASCII 0x5B/0x5D) as the trailing byte.
+#
+# See also:
+# https://en.wikipedia.org/wiki/Shift_JIS
+# http://www.rikai.com/library/kanjitables/kanji_codes.sjis.shtml
+
+# Unicode Character 'KATAKANA LETTER ZE' (U+30BC)
+#
+# UTF-8:    hex: 0xE3     0x82      0xBC
+#           bin: 11100011 10000010  10111100
+#
+# Shift-jis hex:  0x83     0x5B
+#           oct:  203      133
+#           bin:  10000011 01011011
+#
+# Conversion example:
+#   $ printf '\x83\x5B' | iconv -f SHIFT-JIS -t UTF-8 | od -tx1o1c
+#   0000000  e3  82  bc
+#            343 202 274
+#            343 202 274
+
+# Unicode Character 'KATAKANA LETTER ZO' (U+30BE)
+#
+# UTF-8:    hex: 0xE3     0x82      0xBE
+#           bin: 11100011 10000010  10111110
+#
+# Shift-jis hex:  0x83     0x5D
+#           oct:  203      135
+#           bin:  10000011 01011101
+#
+# Conversion example:
+#   $ printf '\x83\x5D' | iconv -f SHIFT-JIS -t UTF-8 | od -tx1o1c
+#   0000000  e3  82  be
+#            343 202 276
+#            343 202 276
+#
+
+
+#
+# Tests 1,2: Test y/// command with multibyte, non-utf8 seqeunce.
+# Implmenetation notes: str_append() has special code path for non-utf8 cases.
+#
+
+# Test 1: valid multibyte seqeunce
+printf 'y/a/\203\133/' > p1 || framework_failure_
+echo Xa > in1 || framework_failure_
+printf 'X\203\133\n' > exp1 || framework_failure_
+
+LC_ALL="$LOCALE_JA_SJIS" sed -f p1 <in1 >out1 || fail=1
+compare_ exp1 out1 || fail=1
+
+# Test 2: invalid multibyte seqeunce, treated as two single-byte characters.
+printf 'y/aa/\203\060/' > p2 || framework_failure_
+LC_ALL="$LOCALE_JA_SJIS" sed -f p2 </dev/null 2>out2 || fail=1
+compare_ /dev/null out2 || fail=1
+
+#
+# Test 3: multibyte character class with these characters.
+#
+# Before sed-4.3, snarf_char_class would parse it incorrectly,
+# Treating the first closing-bracket as closing the character-class,
+# instead of being part of a multibyte sequence.
+
+printf '/[\203]/]/p' > p3 || framework_failure_
+LC_ALL="$LOCALE_JA_SJIS" sed -f p3 </dev/null >out3 || fail=1
+compare_ /dev/null out3 || fail=1
+
+# Test 4:
+# Same as test 3, but with the other multibyte character.
+# (this did not cause a failure before sed-4.3, but the code was incorrect).
+# Keep this test for code-coverage purposes.
+printf '/[\203[/]/p' > p4 || framework_failure_
+LC_ALL="$LOCALE_JA_SJIS" sed -f p4 </dev/null >out4 || fail=1
+compare_ /dev/null out4 || fail=1
+
+# TODO: Find a locale in which ':.=' can be part of a valid multibyte octet.
+#
+# snarf_char_class specifically tests for five bytes: ':.=[]' .
+# '[' and ']' are tested above, yet '.:=' are not valid as part of a
+# multibyte shift-jis sequence.
+#
+# valid:
+#   $ printf '\203]' | iconv -f SHIFT-JIS -t utf-8
+#   $ printf '\203[' | iconv -f SHIFT-JIS -t utf-8
+#
+# invalid:
+#   $ printf '\203:' | iconv -f SHIFT-JIS -t utf-8
+#   iconv: (stdin):1:0: cannot convert
+#
+#   $ printf '\203=' | iconv -f SHIFT-JIS -t utf-8
+#   iconv: (stdin):1:0: cannot convert
+#
+#   $ printf '\203.' | iconv -f SHIFT-JIS -t utf-8
+#   iconv: (stdin):0:0: cannot convert
+
+Exit $fail
diff --git a/testsuite/mb-match-slash.sh b/testsuite/mb-match-slash.sh
new file mode 100755
index 0000000..c0a30f5
--- /dev/null
+++ b/testsuite/mb-match-slash.sh
@@ -0,0 +1,48 @@
+#!/bin/sh
+# Test slash following an incomplete multibyte character
+
+# Copyright (C) 2016 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+. "${srcdir=.}/init.sh"; path_prepend_ ../sed
+print_ver_ sed
+
+require_en_utf8_locale_
+
+fail=0
+
+# before sed-4.3, a slash following an incomplete multibyte character
+# would be ignored during program compilation, leading to an error.
+
+
+# Test 1: match_slash in 's' command.
+# Before sed-4.3, this would fail with "unterminated `s' command".
+printf 's/\316/X/' > p1 || framework_failure_
+LC_ALL=en_US.UTF-8 sed -f p1 </dev/null >out1 || fail=1
+compare_ /dev/null out1 || fail=1
+
+# Test 2: match_slash in address regex.
+# Before sed-4.3, this would fail with "unterminated address regex".
+printf '/\316/p' >p2 || framework_failure_
+LC_ALL=en_US.UTF-8 sed -f p2 </dev/null >out2 || fail=1
+compare_ /dev/null out2 || fail=1
+
+# Test 3: match_slash in 'y' command..
+# Before sed-4.3, this would fail with "unterminated `y' command".
+printf 'y/\316/X/' >p3 || framework_failure_
+LC_ALL=en_US.UTF-8 sed -f p3 </dev/null >out3 || fail=1
+compare_ /dev/null out3 || fail=1
+
+
+Exit $fail