sed: fix minor multibyte parsing bug

author Assaf Gordon <assafgordon@gmail.com>

Thu, 23 Jun 2016 00:34:01 +0000 (20:34 -0400)

committer Assaf Gordon <assafgordon@gmail.com>

Mon, 4 Jul 2016 18:26:09 +0000 (14:26 -0400)
author Assaf Gordon <assafgordon@gmail.com>
Thu, 23 Jun 2016 00:34:01 +0000 (20:34 -0400)
committer Assaf Gordon <assafgordon@gmail.com>
Mon, 4 Jul 2016 18:26:09 +0000 (14:26 -0400)
diff --git a/NEWS b/NEWS
 index 4399b7426ef01ccee565590c77e332d2cd1cefb8..6af8b9abc21935228710908db56a233ca65c2259 100644 (file)
 
--- a/NEWS
+++ b/NEWS
@@ -54,6 +54,13 @@ GNU sed NEWS                                    -*- outline -*-
    Other characters after the second backslash are rejected (e.g. '\c\d').
    [bug introduced in the sed-4.0.* releases]
  
+  sed no longer mishandles incomplete multibyte sequences in s,y commands
+  and valid multibyte SHIFT-JIS characters in character classes.
+  Previously, the following commands would fail:
+    LC_ALL=en_US.UTF-8 sed $'s/\316/X/'
+    LC_ALL=ja_JP.shiftjis sed $'/[\203]/]/p'
+  [bug introduced some time after sed-4.1.5 and before sed-4.2.1]
+
  ** Feature removal
  
    The "L" command (format a paragraph like the fmt(1) command would)
diff --git a/sed/compile.c b/sed/compile.c
 index c039c2425006214234e63ced63ca98dbac51598e..ca09e7839b0afc5159539d760cde55fa2536bd8d 100644 (file)
 
--- a/sed/compile.c
+++ b/sed/compile.c
@@ -432,7 +432,6 @@ snarf_char_class (struct buffer *b, mbstate_t *cur_stat)
    int ch;
    int state = 0;
    int delim IF_LINT ( = 0) ;
-  bool pending_mb = 0;
  
    ch = inchar();
    if (ch == '^')
@@ -448,7 +447,7 @@ snarf_char_class (struct buffer *b, mbstate_t *cur_stat)
  
    for (;; ch = add_then_next (b, ch))
      {
-      pending_mb = BRLEN (ch, cur_stat) != 1;
+      const int mb_char = IS_MB_CHAR (ch, cur_stat);
  
        switch (ch)
          {
@@ -459,7 +458,7 @@ snarf_char_class (struct buffer *b, mbstate_t *cur_stat)
          case '.':
          case ':':
          case '=':
-          if (pending_mb)
+          if (mb_char)
              continue;
  
            if (state == 1)
@@ -475,7 +474,7 @@ snarf_char_class (struct buffer *b, mbstate_t *cur_stat)
            continue;
  
          case OPEN_BRACKET:
-          if (pending_mb)
+          if (mb_char)
              continue;
  
            if (state == 0)
@@ -483,7 +482,7 @@ snarf_char_class (struct buffer *b, mbstate_t *cur_stat)
            continue;
  
          case CLOSE_BRACKET:
-          if (pending_mb)
+          if (mb_char)
              continue;
  
            if (state == 0 || state == 1)
@@ -512,7 +511,7 @@ match_slash (int slash, int regex)
    mbstate_t cur_stat = { 0, };
  
    /* We allow only 1 byte characters for a slash.  */
-  if (BRLEN (slash, &cur_stat) == -2)
+  if (IS_MB_CHAR (slash, &cur_stat))
      bad_prog (BAD_DELIM);
  
    memset (&cur_stat, 0, sizeof cur_stat);
@@ -520,8 +519,9 @@ match_slash (int slash, int regex)
    b = init_buffer();
    while ((ch = inchar()) != EOF && ch != '\n')
      {
-      bool pending_mb = !MBSINIT (&cur_stat);
-      if (BRLEN (ch, &cur_stat) == 1 && !pending_mb)
+      const int mb_char = IS_MB_CHAR (ch, &cur_stat);
+
+      if (!mb_char)
          {
            if (ch == slash)
              return b;
diff --git a/sed/mbcs.c b/sed/mbcs.c
 index f06f3f3c70d917e60d0440c072971449741e62b9..bce39fa962ee5a605742925524566d915b84ff48 100644 (file)
 
--- a/sed/mbcs.c
+++ b/sed/mbcs.c
@@ -24,24 +24,43 @@
  int mb_cur_max;
  bool is_utf8;
  
-/* Add a byte to the multibyte character represented by the state
-   CUR_STAT, and answer its length if a character is completed,
-   or -2 if it is yet to be completed.  */
-int brlen (int ch, mbstate_t *cur_stat)
-{
-  char c = ch;
+/* Return non-zero if CH is part of a valid multibyte sequence:
+   Either incomplete yet valid sequence (in case of a leading byte),
+   or the last byte of a valid multibyte sequence.
+
+   Return zero in all other cases:
+    CH is a valid single-byte character (e.g. 0x01-0x7F in UTF-8 locales);
+    CH is an invalid byte in a multibyte sequence for the currentl locale,
+    CH is the NUL byte.
  
-  /* If we use the generic brlen, then MBRLEN == mbrlen.  */
-  int result = mbrtowc(NULL, &c, 1, cur_stat);
+   Reset CUR_STAT in the case of an invalid byte.
+*/
+int
+is_mb_char (int ch, mbstate_t *cur_stat)
+{
+  const char c = ch ;
+  const int mb_pending = !mbsinit (cur_stat);
+  const int result = mbrtowc (NULL, &c, 1, cur_stat);
  
-  /* An invalid sequence is treated like a single-byte character. */
-  if (result == -1)
+  switch (result)
      {
+    case -2: /* Beginning or middle of valid multibyte sequence */
+      return 1;
+
+    case -1: /* Invalid sequence, byte treated like a single-byte character */
        memset (cur_stat, 0, sizeof (mbstate_t));
+      return 0;
+
+    case 1: /* A valid byte, check if part of on-going multibyte sequence */
+      return mb_pending;
+
+    case 0: /* Special case of mbrtowc(3): the NUL character */
+      /* TODO: test this */
        return 1;
-    }
  
-  return result;
+    default: /* Should never happen, as per mbrtowc(3) documentation */
+      panic ("is_mb_char: mbrtowc (0x%x) returned %d",ch,result);
+    }
  }
  
  void
diff --git a/sed/sed.h b/sed/sed.h
 index 8fe2147c50e98f78bff300a62a4e0943d40aa176..bbddd251f2d7fd8bb91f9435f3bde6be77c0bb6a 100644 (file)
 
--- a/sed/sed.h
+++ b/sed/sed.h
@@ -254,10 +254,10 @@ extern bool is_utf8;
  #define MBRLEN(s, n, ps) \
    (mb_cur_max == 1 ? 1 : mbrtowc (NULL, s, n, ps))
  
-#define BRLEN(ch, ps) \
-  (mb_cur_max == 1 ? 1 : brlen (ch, ps))
+#define IS_MB_CHAR(ch, ps)                \
+  (mb_cur_max == 1 ? 0 : is_mb_char (ch, ps))
  
-extern int brlen (int ch, mbstate_t *ps);
+extern int is_mb_char (int ch, mbstate_t *ps);
  extern void initialize_mbcs (void);
  extern void register_cleanup_file (char const *file);
  extern void cancel_cleanup (void);
diff --git a/testsuite/Makefile.am b/testsuite/Makefile.am
 index 6019d8537d78826e99dd6976694416b10f238bac..5ba453980b9e767073cfbc8ec1e0c588a79058e9 100644 (file)
 
--- a/testsuite/Makefile.am
+++ b/testsuite/Makefile.am
@@ -19,6 +19,8 @@ T =                           \
    in-place-hyphen.sh           \
    in-place-suffix-backup.sh    \
    invalid-mb-seq-UMR.sh                \
+  mb-charclass-non-utf8.sh     \
+  mb-match-slash.sh            \
    normalize-text.sh            \
    nulldata.sh                  \
    panic-tests.sh               \
diff --git a/testsuite/init.cfg b/testsuite/init.cfg
 index e3950e4ebb4277b3cb6b95cc1c47e4f98e3b389a..2d685e25c728a3461d267447aa8dd30e98172d96 100644 (file)
 
--- a/testsuite/init.cfg
+++ b/testsuite/init.cfg
@@ -51,3 +51,22 @@ require_en_utf8_locale_()
      *) skip_ 'en_US.UTF-8 locale not found' ;;
    esac
  }
+
+# Some tests would fail without this particular locale.
+# If the locale is not available, just skip the test.
+# The exact spelling differs between operating systems
+# (ja_JP.shiftjis on Ubuntu, ja_JP.sjis on Debian, ja_JP.SJIS on Mac OS X).
+# If a sjift-jis locale is found the function sets shell variable
+# 'LOCALE_JA_SJIS' to the locale name.
+require_ja_shiftjis_locale_()
+{
+  path_prepend_ .
+  LOCALE_JA_SJIS=
+  for l in shiftjis sjis SJIS ; do
+      n=$(get-mb-cur-max ja_JP.$l) || continue
+      test 2 -eq "$n" || continue
+      LOCALE_JA_SJIS="ja_JP.$l"
+      break
+  done
+  test -z "$LOCALE_JA_SJIS" && skip_ 'ja_JP shift-jis locale not found'
+}
diff --git a/testsuite/mb-charclass-non-utf8.sh b/testsuite/mb-charclass-non-utf8.sh
 new file mode 100755 (executable)
 index 0000000..ea68d7a
 
--- /dev/null
+++ b/testsuite/mb-charclass-non-utf8.sh
@@ -0,0 +1,129 @@
+#!/bin/sh
+# Test multibyte locale which is not UTF-8 (ja_JP.shift_jis)
+# This is a stateful locale. Same byte value can be either
+# a single-byte character, or the second byte of a multibyte
+# character.
+
+# Copyright (C) 2016 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+. "${srcdir=.}/init.sh"; path_prepend_ ../sed
+print_ver_ sed
+
+# If found, LOCALE_JA_SJIS will contain the locale name.
+require_ja_shiftjis_locale_
+
+fail=0
+
+# This test uses two characters:
+# Unicode Character 'KATAKANA LETTER ZE' (U+30BC)
+# Unicode Character 'KATAKANA LETTER ZO' (U+30BE)
+#
+# In SHIFT-JIS locale, these multibyte characters contain
+# open/close brackets (ASCII 0x5B/0x5D) as the trailing byte.
+#
+# See also:
+# https://en.wikipedia.org/wiki/Shift_JIS
+# http://www.rikai.com/library/kanjitables/kanji_codes.sjis.shtml
+
+# Unicode Character 'KATAKANA LETTER ZE' (U+30BC)
+#
+# UTF-8:    hex: 0xE3     0x82      0xBC
+#           bin: 11100011 10000010  10111100
+#
+# Shift-jis hex:  0x83     0x5B
+#           oct:  203      133
+#           bin:  10000011 01011011
+#
+# Conversion example:
+#   $ printf '\x83\x5B' | iconv -f SHIFT-JIS -t UTF-8 | od -tx1o1c
+#   0000000  e3  82  bc
+#            343 202 274
+#            343 202 274
+
+# Unicode Character 'KATAKANA LETTER ZO' (U+30BE)
+#
+# UTF-8:    hex: 0xE3     0x82      0xBE
+#           bin: 11100011 10000010  10111110
+#
+# Shift-jis hex:  0x83     0x5D
+#           oct:  203      135
+#           bin:  10000011 01011101
+#
+# Conversion example:
+#   $ printf '\x83\x5D' | iconv -f SHIFT-JIS -t UTF-8 | od -tx1o1c
+#   0000000  e3  82  be
+#            343 202 276
+#            343 202 276
+#
+
+
+#
+# Tests 1,2: Test y/// command with multibyte, non-utf8 seqeunce.
+# Implmenetation notes: str_append() has special code path for non-utf8 cases.
+#
+
+# Test 1: valid multibyte seqeunce
+printf 'y/a/\203\133/' > p1 || framework_failure_
+echo Xa > in1 || framework_failure_
+printf 'X\203\133\n' > exp1 || framework_failure_
+
+LC_ALL="$LOCALE_JA_SJIS" sed -f p1 <in1 >out1 || fail=1
+compare_ exp1 out1 || fail=1
+
+# Test 2: invalid multibyte seqeunce, treated as two single-byte characters.
+printf 'y/aa/\203\060/' > p2 || framework_failure_
+LC_ALL="$LOCALE_JA_SJIS" sed -f p2 </dev/null 2>out2 || fail=1
+compare_ /dev/null out2 || fail=1
+
+#
+# Test 3: multibyte character class with these characters.
+#
+# Before sed-4.3, snarf_char_class would parse it incorrectly,
+# Treating the first closing-bracket as closing the character-class,
+# instead of being part of a multibyte sequence.
+
+printf '/[\203]/]/p' > p3 || framework_failure_
+LC_ALL="$LOCALE_JA_SJIS" sed -f p3 </dev/null >out3 || fail=1
+compare_ /dev/null out3 || fail=1
+
+# Test 4:
+# Same as test 3, but with the other multibyte character.
+# (this did not cause a failure before sed-4.3, but the code was incorrect).
+# Keep this test for code-coverage purposes.
+printf '/[\203[/]/p' > p4 || framework_failure_
+LC_ALL="$LOCALE_JA_SJIS" sed -f p4 </dev/null >out4 || fail=1
+compare_ /dev/null out4 || fail=1
+
+# TODO: Find a locale in which ':.=' can be part of a valid multibyte octet.
+#
+# snarf_char_class specifically tests for five bytes: ':.=[]' .
+# '[' and ']' are tested above, yet '.:=' are not valid as part of a
+# multibyte shift-jis sequence.
+#
+# valid:
+#   $ printf '\203]' | iconv -f SHIFT-JIS -t utf-8
+#   $ printf '\203[' | iconv -f SHIFT-JIS -t utf-8
+#
+# invalid:
+#   $ printf '\203:' | iconv -f SHIFT-JIS -t utf-8
+#   iconv: (stdin):1:0: cannot convert
+#
+#   $ printf '\203=' | iconv -f SHIFT-JIS -t utf-8
+#   iconv: (stdin):1:0: cannot convert
+#
+#   $ printf '\203.' | iconv -f SHIFT-JIS -t utf-8
+#   iconv: (stdin):0:0: cannot convert
+
+Exit $fail
diff --git a/testsuite/mb-match-slash.sh b/testsuite/mb-match-slash.sh
 new file mode 100755 (executable)
 index 0000000..c0a30f5
 
--- /dev/null
+++ b/testsuite/mb-match-slash.sh
@@ -0,0 +1,48 @@
+#!/bin/sh
+# Test slash following an incomplete multibyte character
+
+# Copyright (C) 2016 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+. "${srcdir=.}/init.sh"; path_prepend_ ../sed
+print_ver_ sed
+
+require_en_utf8_locale_
+
+fail=0
+
+# before sed-4.3, a slash following an incomplete multibyte character
+# would be ignored during program compilation, leading to an error.
+
+
+# Test 1: match_slash in 's' command.
+# Before sed-4.3, this would fail with "unterminated `s' command".
+printf 's/\316/X/' > p1 || framework_failure_
+LC_ALL=en_US.UTF-8 sed -f p1 </dev/null >out1 || fail=1
+compare_ /dev/null out1 || fail=1
+
+# Test 2: match_slash in address regex.
+# Before sed-4.3, this would fail with "unterminated address regex".
+printf '/\316/p' >p2 || framework_failure_
+LC_ALL=en_US.UTF-8 sed -f p2 </dev/null >out2 || fail=1
+compare_ /dev/null out2 || fail=1
+
+# Test 3: match_slash in 'y' command..
+# Before sed-4.3, this would fail with "unterminated `y' command".
+printf 'y/\316/X/' >p3 || framework_failure_
+LC_ALL=en_US.UTF-8 sed -f p3 </dev/null >out3 || fail=1
+compare_ /dev/null out3 || fail=1
+
+
+Exit $fail
author	Assaf Gordon <assafgordon@gmail.com>
	Thu, 23 Jun 2016 00:34:01 +0000 (20:34 -0400)
committer	Assaf Gordon <assafgordon@gmail.com>
	Mon, 4 Jul 2016 18:26:09 +0000 (14:26 -0400)
NEWS		patch \| blob \| history
sed/compile.c		patch \| blob \| history
sed/mbcs.c		patch \| blob \| history
sed/sed.h		patch \| blob \| history
testsuite/Makefile.am		patch \| blob \| history
testsuite/init.cfg		patch \| blob \| history
testsuite/mb-charclass-non-utf8.sh	[new file with mode: 0755]	patch \| blob
testsuite/mb-match-slash.sh	[new file with mode: 0755]	patch \| blob