@@ -135,19 +135,67 @@ char _PyIO_get_console_type(PyObject *path_or_fd) {
135135}
136136
137137static DWORD
138- _find_last_utf8_boundary (const char * buf , DWORD len )
138+ _find_last_utf8_boundary (const unsigned char * buf , DWORD len )
139139{
140- /* This function never returns 0, returns the original len instead */
141- DWORD count = 1 ;
142- if (len == 0 || (buf [len - 1 ] & 0x80 ) == 0 ) {
143- return len ;
144- }
145- for (;; count ++ ) {
146- if (count > 3 || count >= len ) {
140+ for (DWORD count = 1 ; count < 4 && count <= len ; count ++ ) {
141+ unsigned char c = buf [len - count ];
142+ if (c < 0x80 ) {
143+ /* No starting byte found. */
147144 return len ;
148145 }
149- if ((buf [len - count ] & 0xc0 ) != 0x80 ) {
150- return len - count ;
146+ if (c >= 0xc0 ) {
147+ if (c < 0xe0 /* 2-bytes sequence */ ? count < 2 :
148+ c < 0xf0 /* 3-bytes sequence */ ? count < 3 :
149+ c < 0xf8 /* 4-bytes sequence */ )
150+ {
151+ /* Incomplete multibyte sequence. */
152+ return len - count ;
153+ }
154+ /* Either complete or invalid sequence. */
155+ return len ;
156+ }
157+ }
158+ /* Either complete 4-bytes sequence or invalid sequence. */
159+ return len ;
160+ }
161+
162+ /* Find the number of UTF-8 bytes that corresponds to the specified number of
163+ * wchars.
164+ * I.e. find x <= len so that MultiByteToWideChar(CP_UTF8, 0, s, x, NULL, 0) == n.
165+ *
166+ * WideCharToMultiByte() cannot be used for this, because the UTF-8 -> wchar
167+ * conversion is not reversible (invalid UTF-8 byte produces \ufffd which
168+ * will be converted back to 3-bytes UTF-8 sequence \xef\xbf\xbd).
169+ * So we need to use binary search.
170+ */
171+ static DWORD
172+ _wchar_to_utf8_count (const unsigned char * s , DWORD len , DWORD n )
173+ {
174+ DWORD start = 0 ;
175+ while (1 ) {
176+ DWORD mid = 0 ;
177+ for (DWORD i = len / 2 ; i <= len ; i ++ ) {
178+ mid = _find_last_utf8_boundary (s , i );
179+ if (mid != 0 ) {
180+ break ;
181+ }
182+ /* The middle could split the first multibytes sequence. */
183+ }
184+ if (mid == len ) {
185+ return start + len ;
186+ }
187+ if (mid == 0 ) {
188+ mid = len > 1 ? len - 1 : 1 ;
189+ }
190+ DWORD wlen = MultiByteToWideChar (CP_UTF8 , 0 , s , mid , NULL , 0 );
191+ if (wlen <= n ) {
192+ s += mid ;
193+ start += mid ;
194+ len -= mid ;
195+ n -= wlen ;
196+ }
197+ else {
198+ len = mid ;
151199 }
152200 }
153201}
@@ -556,8 +604,10 @@ read_console_w(HANDLE handle, DWORD maxlen, DWORD *readlen) {
556604 int err = 0 , sig = 0 ;
557605
558606 wchar_t * buf = (wchar_t * )PyMem_Malloc (maxlen * sizeof (wchar_t ));
559- if (!buf )
607+ if (!buf ) {
608+ PyErr_NoMemory ();
560609 goto error ;
610+ }
561611
562612 * readlen = 0 ;
563613
@@ -615,6 +665,7 @@ read_console_w(HANDLE handle, DWORD maxlen, DWORD *readlen) {
615665 Py_UNBLOCK_THREADS
616666 if (!newbuf ) {
617667 sig = -1 ;
668+ PyErr_NoMemory ();
618669 break ;
619670 }
620671 buf = newbuf ;
@@ -638,8 +689,10 @@ read_console_w(HANDLE handle, DWORD maxlen, DWORD *readlen) {
638689 if (* readlen > 0 && buf [0 ] == L'\x1a' ) {
639690 PyMem_Free (buf );
640691 buf = (wchar_t * )PyMem_Malloc (sizeof (wchar_t ));
641- if (!buf )
692+ if (!buf ) {
693+ PyErr_NoMemory ();
642694 goto error ;
695+ }
643696 buf [0 ] = L'\0' ;
644697 * readlen = 0 ;
645698 }
@@ -817,8 +870,10 @@ _io__WindowsConsoleIO_readall_impl(winconsoleio *self)
817870 bufsize = BUFSIZ ;
818871
819872 buf = (wchar_t * )PyMem_Malloc ((bufsize + 1 ) * sizeof (wchar_t ));
820- if (buf == NULL )
873+ if (buf == NULL ) {
874+ PyErr_NoMemory ();
821875 return NULL ;
876+ }
822877
823878 while (1 ) {
824879 wchar_t * subbuf ;
@@ -840,6 +895,7 @@ _io__WindowsConsoleIO_readall_impl(winconsoleio *self)
840895 (bufsize + 1 ) * sizeof (wchar_t ));
841896 if (tmp == NULL ) {
842897 PyMem_Free (buf );
898+ PyErr_NoMemory ();
843899 return NULL ;
844900 }
845901 buf = tmp ;
@@ -1015,43 +1071,49 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls,
10151071 len = (DWORD )b -> len ;
10161072
10171073 Py_BEGIN_ALLOW_THREADS
1018- wlen = MultiByteToWideChar (CP_UTF8 , 0 , b -> buf , len , NULL , 0 );
1019-
10201074 /* issue11395 there is an unspecified upper bound on how many bytes
10211075 can be written at once. We cap at 32k - the caller will have to
10221076 handle partial writes.
10231077 Since we don't know how many input bytes are being ignored, we
10241078 have to reduce and recalculate. */
1025- while (wlen > 32766 / sizeof (wchar_t )) {
1026- len /= 2 ;
1079+ const DWORD max_wlen = 32766 / sizeof (wchar_t );
1080+ /* UTF-8 to wchar ratio is at most 3:1. */
1081+ len = Py_MIN (len , max_wlen * 3 );
1082+ while (1 ) {
10271083 /* Fix for github issues gh-110913 and gh-82052. */
10281084 len = _find_last_utf8_boundary (b -> buf , len );
10291085 wlen = MultiByteToWideChar (CP_UTF8 , 0 , b -> buf , len , NULL , 0 );
1086+ if (wlen <= max_wlen ) {
1087+ break ;
1088+ }
1089+ len /= 2 ;
10301090 }
10311091 Py_END_ALLOW_THREADS
10321092
1033- if (!wlen )
1034- return PyErr_SetFromWindowsErr (0 );
1093+ if (!wlen ) {
1094+ return PyLong_FromLong (0 );
1095+ }
10351096
10361097 wbuf = (wchar_t * )PyMem_Malloc (wlen * sizeof (wchar_t ));
1098+ if (!wbuf ) {
1099+ PyErr_NoMemory ();
1100+ return NULL ;
1101+ }
10371102
10381103 Py_BEGIN_ALLOW_THREADS
10391104 wlen = MultiByteToWideChar (CP_UTF8 , 0 , b -> buf , len , wbuf , wlen );
10401105 if (wlen ) {
10411106 res = WriteConsoleW (handle , wbuf , wlen , & n , NULL );
1107+ #ifdef Py_DEBUG
1108+ if (res ) {
1109+ #else
10421110 if (res && n < wlen ) {
1111+ #endif
10431112 /* Wrote fewer characters than expected, which means our
10441113 * len value may be wrong. So recalculate it from the
1045- * characters that were written. As this could potentially
1046- * result in a different value, we also validate that value.
1114+ * characters that were written.
10471115 */
1048- len = WideCharToMultiByte (CP_UTF8 , 0 , wbuf , n ,
1049- NULL , 0 , NULL , NULL );
1050- if (len ) {
1051- wlen = MultiByteToWideChar (CP_UTF8 , 0 , b -> buf , len ,
1052- NULL , 0 );
1053- assert (wlen == len );
1054- }
1116+ len = _wchar_to_utf8_count (b -> buf , len , n );
10551117 }
10561118 } else
10571119 res = 0 ;
0 commit comments