Opened 10 years ago
Last modified 3 months ago
#34631 new enhancement
Extra compat for mbstring: mb_strpos()
| Reported by: | | Owned by: | |
|---|---|---|---|
| Milestone: | Awaiting Review | Priority: | normal |
| Severity: | normal | Version: | 4.4 |
| Component: | Charset | Keywords: | has-patch needs-testing 2nd-opinion needs-unit-tests |
| Focuses: | Cc: |
Description
Hello,
I noticed a missing compat function within compat.php, regarding mb_strpos.
The use of this function within a plugin will result in a fatal error if the server doesn't support mbstring.
So I made a function that will take over the function if it does not exist.
I also implemented debugging errors based on PHP 5.5 source: https://github.com/php/php-src/blob/PHP-5.5/ext/standard/string.c#L1824
if ( ! function_exists( 'mb_strpos' ) ) { function mb_strpos( $haystack, $needle, $offset = 0, $encoding = null ) { return _mb_strpos( $haystack, $needle, $offset, $encoding ); } } /* * Only understands UTF-8 and 8bit. All other character sets will be treated as 8bit. * For $encoding === UTF-8, the $str input is expected to be a valid UTF-8 byte sequence. * The behavior of this function for invalid inputs is PHP compliant. */ if ( ! function_exists( '_mb_strpos' ) ) { function _mb_strpos( $haystack, $needle, $offset = 0, $encoding = null ) { if ( null === $encoding ) { $encoding = get_option( 'blog_charset' ); } // The solution below works only for UTF-8, // so in case of a different charset just use built-in strpos() if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ) { return $offset === 0 ? strpos( $haystack, $needle ) : strpos( $haystack, $needle, $offset ); } $haystack_len = mb_strlen( $haystack ); if ( $offset < (int) 0 || $offset > $haystack_len ) { trigger_error( 'mb_strpos(): Offset not contained in string', E_USER_WARNING ); return false; } if ( !is_string( $needle ) ) { $needle = (string) $needle; if ( !is_string( $needle ) ) { trigger_error( 'mb_strpos(): Array to string conversion', E_USER_WARNING ); return false; } } if ( empty( $needle ) ) { trigger_error( 'mb_strpos(): Empty needle', E_USER_WARNING ); return false; } // Slice off the offset $haystack_sub = mb_substr( $haystack, $offset ); if ( _wp_can_use_pcre_u() ) { // Use the regex unicode support to separate the UTF-8 characters into an array preg_match_all( "/./us", $haystack, $match_h ); preg_match_all( "/$needle/us", $haystack_sub, $match_n ); $pos = key( array_intersect( $match_h[0], $match_n[0] ) ); if ( empty( $pos ) ) { return false; } return (int) $pos; } $regex = '/( [\x00-\x7F] # single-byte sequences 0xxxxxxx | [\xC2-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx | \xE0[\xA0-\xBF][\x80-\xBF] # triple-byte sequences 1110xxxx 10xxxxxx * 2 | [\xE1-\xEC][\x80-\xBF]{2} | \xED[\x80-\x9F][\x80-\xBF] | [\xEE-\xEF][\x80-\xBF]{2} | \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences 11110xxx 10xxxxxx * 3 | [\xF1-\xF3][\x80-\xBF]{3} | \xF4[\x80-\x8F][\x80-\xBF]{2} )/x'; /** * Place haystack into array */ $match_h = array( '' ); // Start with 1 element instead of 0 since the first thing we do is pop do { // We had some string left over from the last round, but we counted it in that last round. array_pop( $match_h ); // Split by UTF-8 character, limit to 1000 characters (last array element will contain the rest of the string) $pieces = preg_split( $regex, $haystack, 1000, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY ); $match_h = array_merge( $match_h, $pieces ); } while ( count( $pieces ) > 1 && $haystack = array_pop( $pieces ) ); // If there's anything left over, repeat the loop. /** * Place haystack offset into array */ $match_hs = array( '' ); // Start with 1 element instead of 0 since the first thing we do is pop do { // We had some string left over from the last round, but we counted it in that last round. array_pop( $match_hs ); // Split by UTF-8 character, limit to 1000 characters (last array element will contain the rest of the string) $pieces = preg_split( $regex, $haystack_sub, 1000, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY ); $match_hs = array_merge( $match_hs, $pieces ); } while ( count( $pieces ) > 1 && $haystack_sub = array_pop( $pieces ) ); // If there's anything left over, repeat the loop. /** * Put needle into array */ $match_n = array( '' ); // Start with 1 element instead of 0 since the first thing we do is pop do { // We had some string left over from the last round, but we counted it in that last round. array_pop( $match_n ); // Split by UTF-8 character, limit to 1000 characters (last array element will contain the rest of the string) $pieces = preg_split( $regex, $needle, 1000, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY ); $match_n = array_merge( $match_n, $pieces ); } while ( count( $pieces ) > 1 && $needle = array_pop( $pieces ) ); // If there's anything left over, repeat the loop. /** * Compute match of haystack offset with needle * If passed, find the array key number within the full haystack. */ $pos = in_array( $match_n[0], $match_hs ) ? key( array_intersect( $match_h, $match_n ) ) : ''; if ( empty( $pos ) ) { return false; } return (int) $pos; } } if ( ! function_exists( '_mb_strpos' ) ) { could probably be removed since it could be a core function.
To test this, I've used the following lines of code:
var_dump( _mb_strpos( '象形指事', '指', 0 ) ); // 2 var_dump( _mb_strpos( '象形指事', '指', 1 ) ); // 2 var_dump( _mb_strpos( '象形指事', '指', 2 ) ); // 2 var_dump( _mb_strpos( '象形指事', '指', 3 ) ); // false var_dump( _mb_strpos( '象形指事', '指', -1 ) ); // false WARNING var_dump( _mb_strpos( '象形指事', '指', 4 ) ); // false var_dump( _mb_strpos( '象形指事', '指', 5 ) ); // false WARNING echo PHP_EOL.PHP_EOL; var_dump( mb_strpos( '象形指事', '指', 0 ) ); // 2 var_dump( mb_strpos( '象形指事', '指', 1 ) ); // 2 var_dump( mb_strpos( '象形指事', '指', 2 ) ); // 2 var_dump( mb_strpos( '象形指事', '指', 3 ) ); // false var_dump( mb_strpos( '象形指事', '指', -1 ) ); // false WARNING var_dump( mb_strpos( '象形指事', '指', 4 ) ); // false var_dump( mb_strpos( '象形指事', '指', 5 ) ); // false WARNING Feel free to contribute your thoughts :) Thanks!
Attachments (1)
Change History (6)
#2
@
16 months ago
- Keywords close 2nd-opinion added
- Milestone set to Awaiting Review
Hi @Cybr,
My apologies that this ticket took so long to receive a response.
It looks like mb_strpos() is still not included in the compat.php file, so this is still a relevant report. However, I looked into some stats related to PHP extensions for WordPress sites and it seems that 99.41% of all sites have mbstring enabled. It's also documented as highly recommended in the Make Hosting team's Handbook.
All that said, I'm not sure this compatibility shim benefits the majority of sites. When originally opened, I'm sure that the percentage of sites missing mbstring was a lot higher. Using function_exists() checks could at least guard against fatal errors on the rare occasion that it's missing today.
I'm going to suggest this be closed as a wontfix, but leaving it open for other contributors to weigh in.
#3
@
16 months ago
Hi @desrosj,
Thank you for sharing that information. Where can I access those numbers? It's critical for plugin developers to know these. We cannot keep developing in the dark or be forced to inject spyware into our user's sites.
Still, 0.59% of 41 million active sites still amounts to 242,000 sites not having mbstring support, many of which are subjected to inconsistent and inaccurate polyfills implemented by plugin authors. Too many of these polyfills simply fall back to strpos(), which can incur security issues when one relies on it to locate XML/HTML, as some do: (1) (2).
#4
@
5 months ago
- Keywords needs-unit-tests added; close removed
For as long as WordPress supports running without multibyte compiled in, I think it's valuable to include these polyfills.
That said, I would like to see some unit tests for this. PHP has a number of them that we may be able to reuse.
#5
@
3 months ago
It’s been a long time since this polyfill was suggested, but I think this could be incorporated into #63863.
From what I understand after a quick look, mb_strpos() is still performing a byte-for-byte search. The main difference compared to strpos() is that its offset and return units are code points instead of bytes.
<?php php > $a = "Bücher"; // This uses U+00FC, the LATIN SMALL LETTER U WITH DIAERESIS php > $b = "Bu\u{0308}cher"; // This uses U+0308, the COMBINING DIAERESIS, which is canonically equivalent to $a php > var_dump( $a, $b, strpos( $a, "ch" ), mb_strpos( $a, "ch" ), strpos( $b, "ch" ), mb_strpos( $b, "ch" ) ); string(7) "Bücher" string(8) "Bücher" int(3) int(2) int(4) int(3)
What this should mean is that we can build a polyfill based on bytes and avoid splitting the string into an array of every character.
<?php function _mb_strpos( $haystack, $needle, $offset = 0, $encoding = null ) { // handle the args fully in the actual code if ( ! is_utf8_charset( $encoding ) ) { return false; } $byte_offset = _mb_codepoint_span( $haystack, 0, $offset, $found_offset_codepoints ); if ( $found_offset_codepoints !== $offset ) { // start is after end of string return false; } $match_at_byte = strpos( $haystack, $needle, $byte_offset ); if ( false === $match_at_byte ) { return false; } $codepoints_to_match = _wp_codepoint_count( $haystack, $byte_offset, $match_at - $byte_offset ); return $offset + $codepoints_to_match; }
granted the details are mostly omitted here and this assumes we merge the proposal currently in WordPress/wordpress-develop#9498, but it should involve no memory overhead and be more performant than splitting and matching array elements. plus, as a bonus, it should work normatively in the presence of invalid UTF-8.
ideally, we’d want the search to return the same value as if we had called mb_strpos( mb_scrub( $haystack ), $needle )
(Redundant comments in diff file at bottom by mistake)
Added array to int conversion (needs testing).
Fixed offset ignore when key is found twice or more.
Fixed wrongfully false return when position is 0.