Make WordPress Core

Changeset 61003

Timestamp:
10/21/2025 03:48:20 AM (4 weeks ago)
Author:
dmsnell
Message:

HTML API: Replace PCRE in set_attribute() with new UTF-8 utility.

The HTML API has relied upon a single PCRE to determine whether to allow setting certain attribute names. This was because those names aren’t allowed to contain Unicode noncharacters, but detecting noncharacters without a UTF-8 parser is nontrivial.

In this change the direct PCRE has been replaced with a number of strcpn() calls and a call to the newer wp_has_noncharacters() function. Under the hood, this function will still defer to a PCRE if Unicode support is available, but otherwise will fall back to the UTF-8 pipeline in Core.

This change removes the platform variability, making the HTML API more reliable when Unicode support for PCRE is lacking.

Developed in https://github.com/WordPress/wordpress-develop/pull/9798
Discussed in https://core.trac.wordpress.org/ticket/63863

See #63863.

Location:
trunk
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php

    r60919 r61003  
    39313931        }
    39323932
    3933         /*
     3933        $name_length = strlen( $name );
     3934
     3935        /**
    39343936         * WordPress rejects more characters than are strictly forbidden
    39353937         * in HTML5. This is to prevent additional security risks deeper
    3936          * in the WordPress and plugin stack. Specifically the
    3937          * less-than (<) greater-than (>) and ampersand (&) aren't allowed.
     3938         * in the WordPress and plugin stack. Specifically the following
     3939         * are not allowed to be set as part of an HTML attribute name:
    39383940         *
    3939          * The use of a PCRE match enables looking for specific Unicode
    3940          * code points without writing a UTF-8 decoder. Whereas scanning
    3941          * for one-byte characters is trivial (with `strcspn`), scanning
    3942          * for the longer byte sequences would be more complicated. Given
    3943          * that this shouldn't be in the hot path for execution, it's a
    3944          * reasonable compromise in efficiency without introducing a
    3945          * noticeable impact on the overall system.
     3941         *  - greater-than “>”
     3942         *  - ampersand “&”
    39463943         *
    39473944         * @see https://html.spec.whatwg.org/#attributes-2
    3948          *
    3949          * @todo As the only regex pattern maybe we should take it out?
    3950          *       Are Unicode patterns available broadly in Core?
    39513945         */
    3952         if ( preg_match(
    3953             '~[' .
    3954                 // Syntax-like characters.
    3955                 '"\'>&</ =' .
    3956                 // Control characters.
    3957                 '\x{00}-\x{1F}' .
    3958                 // HTML noncharacters.
    3959                 '\x{FDD0}-\x{FDEF}' .
    3960                 '\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}' .
    3961                 '\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}' .
    3962                 '\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}' .
    3963                 '\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}' .
    3964                 '\x{10FFFE}\x{10FFFF}' .
    3965             ']~Ssu',
    3966             $name
    3967         ) ) {
     3946        if (
     3947            0 === $name_length ||
     3948            // Syntax-like characters.
     3949            strcspn( $name, '"\'>&</ =' ) !== $name_length ||
     3950            // Control characters.
     3951            strcspn(
     3952                $name,
     3953                "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F" .
     3954                "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F"
     3955            ) !== $name_length ||
     3956            // Unicode noncharacters.
     3957            wp_has_noncharacters( $name )
     3958        ) {
    39683959            _doing_it_wrong(
    39693960                __METHOD__,
  • trunk/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php

    r60919 r61003  
    310310
    311311        $this->assertSame( '<div data-enabled="abc">Test</div>', $processor->get_updated_html(), 'A case-insensitive set_attribute call did not update the existing attribute' );
     312    }
     313
     314    /**
     315     * Ensures that set_attribute doesn’t allow setting an
     316     * attribute with an invalid name and thus break syntax.
     317     *
     318     * @ticket 63863
     319     *
     320     * @expectedIncorrectUsage WP_HTML_Tag_Processor::set_attribute
     321     *
     322     * @dataProvider data_invalid_attribute_names
     323     *
     324     * @param string $invalid_name Invalid attribute name.
     325     */
     326    public function test_set_attribute_rejects_invalid_names( $invalid_name ) {
     327        $processor = new WP_HTML_Tag_Processor( '<div>' );
     328        $processor->next_tag();
     329
     330        $this->assertFalse(
     331            $processor->set_attribute( $invalid_name, true ),
     332            'Should have rejected invalid attribute name.'
     333        );
     334    }
     335
     336    /**
     337     * Data provider.
     338     *
     339     * @return array[]
     340     */
     341    public static function data_invalid_attribute_names() {
     342        $invalid_names = array(
     343            'Empty' => array( '' ),
     344        );
     345
     346        // Syntax-like characters.
     347        foreach ( str_split( '"\'>&</ =' ) as $c ) {
     348            $invalid_names[ $c ] = array( "too{$c}late" );
     349        }
     350
     351        // C0 controls.
     352        for ( $i = 0; $i <= 0x1F; $i++ ) {
     353            $c                                    = chr( $i );
     354            $invalid_names[ "C0 Controls: {$i}" ] = array( "shut{$c}down" );
     355        }
     356
     357        // Noncharacters.
     358        for ( $i = 0xFDD0; $i <= 0xFDEF; $i++ ) {
     359            $h                                       = dechex( $i );
     360            $c                                       = mb_chr( $i );
     361            $invalid_names[ "Noncharacter: U+{$h}" ] = array( "shut{$c}down" );
     362        }
     363
     364        for ( $b = 0; $b <= 16; $b++ ) {
     365            for ( $x = 0xFFFE; $x <= 0xFFFF; $x++ ) {
     366                $i                                       = ( $b << 16 ) + $x;
     367                $h                                       = dechex( $i );
     368                $c                                       = mb_chr( $i );
     369                $invalid_names[ "Noncharacter: U+{$h}" ] = array( "shut{$c}down" );
     370            }
     371        }
     372
     373        return $invalid_names;
    312374    }
    313375
Note: See TracChangeset for help on using the changeset viewer.