Файловый менеджер - Редактировать - /home/rcdroneh/iwannatestify.com/wp-includes/css/html-api.zip
Назад
PK +]VYt5�� � class-wp-html-token.phpnu �[��� <?php /** * HTML API: WP_HTML_Token class * * @package WordPress * @subpackage HTML-API * @since 6.4.0 */ /** * Core class used by the HTML processor during HTML parsing * for referring to tokens in the input HTML string. * * This class is designed for internal use by the HTML processor. * * @since 6.4.0 * * @access private * * @see WP_HTML_Processor */ class WP_HTML_Token { /** * Name of bookmark corresponding to source of token in input HTML string. * * Having a bookmark name does not imply that the token still exists. It * may be that the source token and underlying bookmark was wiped out by * some modification to the source HTML. * * @since 6.4.0 * * @var string */ public $bookmark_name = null; /** * Name of node; lowercase names such as "marker" are not HTML elements. * * For HTML elements/tags this value should come from WP_HTML_Processor::get_tag(). * * @since 6.4.0 * * @see WP_HTML_Processor::get_tag() * * @var string */ public $node_name = null; /** * Whether node contains the self-closing flag. * * A node may have a self-closing flag when it shouldn't. This value * only reports if the flag is present in the original HTML. * * @since 6.4.0 * * @see https://html.spec.whatwg.org/#self-closing-flag * * @var bool */ public $has_self_closing_flag = false; /** * Called when token is garbage-collected or otherwise destroyed. * * @var callable|null */ public $on_destroy = null; /** * Constructor - creates a reference to a token in some external HTML string. * * @since 6.4.0 * * @param string $bookmark_name Name of bookmark corresponding to location in HTML where token is found. * @param string $node_name Name of node token represents; if uppercase, an HTML element; if lowercase, a special value like "marker". * @param bool $has_self_closing_flag Whether the source token contains the self-closing flag, regardless of whether it's valid. * @param callable $on_destroy Function to call when destroying token, useful for releasing the bookmark. */ public function __construct( $bookmark_name, $node_name, $has_self_closing_flag, $on_destroy = null ) { $this->bookmark_name = $bookmark_name; $this->node_name = $node_name; $this->has_self_closing_flag = $has_self_closing_flag; $this->on_destroy = $on_destroy; } /** * Destructor. * * @since 6.4.0 */ public function __destruct() { if ( is_callable( $this->on_destroy ) ) { call_user_func( $this->on_destroy, $this->bookmark_name ); } } /** * Wakeup magic method. * * @since 6.4.2 */ public function __wakeup() { throw new \LogicException( __CLASS__ . ' should never be unserialized' ); } } PK +]VY��+vK K class-wp-html-stack-event.phpnu �[��� <?php /** * HTML API: WP_HTML_Stack_Event class * * @package WordPress * @subpackage HTML-API * @since 6.6.0 */ /** * Core class used by the HTML Processor as a record for stack operations. * * This class is for internal usage of the WP_HTML_Processor class. * * @access private * @since 6.6.0 * * @see WP_HTML_Processor */ class WP_HTML_Stack_Event { /** * Refers to popping an element off of the stack of open elements. * * @since 6.6.0 */ const POP = 'pop'; /** * Refers to pushing an element onto the stack of open elements. * * @since 6.6.0 */ const PUSH = 'push'; /** * References the token associated with the stack push event, * even if this is a pop event for that element. * * @since 6.6.0 * * @var WP_HTML_Token */ public $token; /** * Indicates which kind of stack operation this event represents. * * May be one of the class constants. * * @since 6.6.0 * * @see self::POP * @see self::PUSH * * @var string */ public $operation; /** * Indicates if the stack element is a real or virtual node. * * @since 6.6.0 * * @var string */ public $provenance; /** * Constructor function. * * @since 6.6.0 * * @param WP_HTML_Token $token Token associated with stack event, always an opening token. * @param string $operation One of self::PUSH or self::POP. * @param string $provenance "virtual" or "real". */ public function __construct( $token, $operation, $provenance ) { $this->token = $token; $this->operation = $operation; $this->provenance = $provenance; } } PK +]VY^]�C C class-wp-html-span.phpnu �[��� <?php /** * HTML API: WP_HTML_Span class * * @package WordPress * @subpackage HTML-API * @since 6.2.0 */ /** * Core class used by the HTML tag processor to represent a textual span * inside an HTML document. * * This is a two-tuple in disguise, used to avoid the memory overhead * involved in using an array for the same purpose. * * This class is for internal usage of the WP_HTML_Tag_Processor class. * * @access private * @since 6.2.0 * @since 6.5.0 Replaced `end` with `length` to more closely align with `substr()`. * * @see WP_HTML_Tag_Processor */ class WP_HTML_Span { /** * Byte offset into document where span begins. * * @since 6.2.0 * * @var int */ public $start; /** * Byte length of this span. * * @since 6.5.0 * * @var int */ public $length; /** * Constructor. * * @since 6.2.0 * * @param int $start Byte offset into document where replacement span begins. * @param int $length Byte length of span. */ public function __construct( $start, $length ) { $this->start = $start; $this->length = $length; } } PK +]VY�I_�� � error_lognu �[��� [22-Oct-2024 11:34:45 UTC] PHP Fatal error: Uncaught Error: Class "WP_Token_Map" not found in /home/rcdroneh/iwannatestify.com/wp-includes/html-api/html5-named-character-references.php:38 Stack trace: #0 {main} thrown in /home/rcdroneh/iwannatestify.com/wp-includes/html-api/html5-named-character-references.php on line 38 [22-Oct-2024 11:39:05 UTC] PHP Fatal error: Uncaught Error: Class "WP_HTML_Tag_Processor" not found in /home/rcdroneh/iwannatestify.com/wp-includes/html-api/class-wp-html-processor.php:139 Stack trace: #0 {main} thrown in /home/rcdroneh/iwannatestify.com/wp-includes/html-api/class-wp-html-processor.php on line 139 PK +]VY�v0a? a? class-wp-html-processor.phpnu �[��� <?php /** * HTML API: WP_HTML_Processor class * * @package WordPress * @subpackage HTML-API * @since 6.4.0 */ /** * Core class used to safely parse and modify an HTML document. * * The HTML Processor class properly parses and modifies HTML5 documents. * * It supports a subset of the HTML5 specification, and when it encounters * unsupported markup, it aborts early to avoid unintentionally breaking * the document. The HTML Processor should never break an HTML document. * * While the `WP_HTML_Tag_Processor` is a valuable tool for modifying * attributes on individual HTML tags, the HTML Processor is more capable * and useful for the following operations: * * - Querying based on nested HTML structure. * * Eventually the HTML Processor will also support: * - Wrapping a tag in surrounding HTML. * - Unwrapping a tag by removing its parent. * - Inserting and removing nodes. * - Reading and changing inner content. * - Navigating up or around HTML structure. * * ## Usage * * Use of this class requires three steps: * * 1. Call a static creator method with your input HTML document. * 2. Find the location in the document you are looking for. * 3. Request changes to the document at that location. * * Example: * * $processor = WP_HTML_Processor::create_fragment( $html ); * if ( $processor->next_tag( array( 'breadcrumbs' => array( 'DIV', 'FIGURE', 'IMG' ) ) ) ) { * $processor->add_class( 'responsive-image' ); * } * * #### Breadcrumbs * * Breadcrumbs represent the stack of open elements from the root * of the document or fragment down to the currently-matched node, * if one is currently selected. Call WP_HTML_Processor::get_breadcrumbs() * to inspect the breadcrumbs for a matched tag. * * Breadcrumbs can specify nested HTML structure and are equivalent * to a CSS selector comprising tag names separated by the child * combinator, such as "DIV > FIGURE > IMG". * * Since all elements find themselves inside a full HTML document * when parsed, the return value from `get_breadcrumbs()` will always * contain any implicit outermost elements. For example, when parsing * with `create_fragment()` in the `BODY` context (the default), any * tag in the given HTML document will contain `array( 'HTML', 'BODY', … )` * in its breadcrumbs. * * Despite containing the implied outermost elements in their breadcrumbs, * tags may be found with the shortest-matching breadcrumb query. That is, * `array( 'IMG' )` matches all IMG elements and `array( 'P', 'IMG' )` * matches all IMG elements directly inside a P element. To ensure that no * partial matches erroneously match it's possible to specify in a query * the full breadcrumb match all the way down from the root HTML element. * * Example: * * $html = '<figure><img><figcaption>A <em>lovely</em> day outside</figcaption></figure>'; * // ----- Matches here. * $processor->next_tag( array( 'breadcrumbs' => array( 'FIGURE', 'IMG' ) ) ); * * $html = '<figure><img><figcaption>A <em>lovely</em> day outside</figcaption></figure>'; * // ---- Matches here. * $processor->next_tag( array( 'breadcrumbs' => array( 'FIGURE', 'FIGCAPTION', 'EM' ) ) ); * * $html = '<div><img></div><img>'; * // ----- Matches here, because IMG must be a direct child of the implicit BODY. * $processor->next_tag( array( 'breadcrumbs' => array( 'BODY', 'IMG' ) ) ); * * ## HTML Support * * This class implements a small part of the HTML5 specification. * It's designed to operate within its support and abort early whenever * encountering circumstances it can't properly handle. This is * the principle way in which this class remains as simple as possible * without cutting corners and breaking compliance. * * ### Supported elements * * If any unsupported element appears in the HTML input the HTML Processor * will abort early and stop all processing. This draconian measure ensures * that the HTML Processor won't break any HTML it doesn't fully understand. * * The following list specifies the HTML tags that _are_ supported: * * - Containers: ADDRESS, BLOCKQUOTE, DETAILS, DIALOG, DIV, FOOTER, HEADER, MAIN, MENU, SPAN, SUMMARY. * - Custom elements: All custom elements are supported. :) * - Form elements: BUTTON, DATALIST, FIELDSET, INPUT, LABEL, LEGEND, METER, PROGRESS, SEARCH. * - Formatting elements: B, BIG, CODE, EM, FONT, I, PRE, SMALL, STRIKE, STRONG, TT, U, WBR. * - Heading elements: H1, H2, H3, H4, H5, H6, HGROUP. * - Links: A. * - Lists: DD, DL, DT, LI, OL, UL. * - Media elements: AUDIO, CANVAS, EMBED, FIGCAPTION, FIGURE, IMG, MAP, PICTURE, SOURCE, TRACK, VIDEO. * - Paragraph: BR, P. * - Phrasing elements: ABBR, AREA, BDI, BDO, CITE, DATA, DEL, DFN, INS, MARK, OUTPUT, Q, SAMP, SUB, SUP, TIME, VAR. * - Sectioning elements: ARTICLE, ASIDE, HR, NAV, SECTION. * - Templating elements: SLOT. * - Text decoration: RUBY. * - Deprecated elements: ACRONYM, BLINK, CENTER, DIR, ISINDEX, KEYGEN, LISTING, MULTICOL, NEXTID, PARAM, SPACER. * * ### Supported markup * * Some kinds of non-normative HTML involve reconstruction of formatting elements and * re-parenting of mis-nested elements. For example, a DIV tag found inside a TABLE * may in fact belong _before_ the table in the DOM. If the HTML Processor encounters * such a case it will stop processing. * * The following list specifies HTML markup that _is_ supported: * * - Markup involving only those tags listed above. * - Fully-balanced and non-overlapping tags. * - HTML with unexpected tag closers. * - Some unbalanced or overlapping tags. * - P tags after unclosed P tags. * - BUTTON tags after unclosed BUTTON tags. * - A tags after unclosed A tags that don't involve any active formatting elements. * * @since 6.4.0 * * @see WP_HTML_Tag_Processor * @see https://html.spec.whatwg.org/ */ class WP_HTML_Processor extends WP_HTML_Tag_Processor { /** * The maximum number of bookmarks allowed to exist at any given time. * * HTML processing requires more bookmarks than basic tag processing, * so this class constant from the Tag Processor is overwritten. * * @since 6.4.0 * * @var int */ const MAX_BOOKMARKS = 100; /** * Holds the working state of the parser, including the stack of * open elements and the stack of active formatting elements. * * Initialized in the constructor. * * @since 6.4.0 * * @var WP_HTML_Processor_State */ private $state = null; /** * Used to create unique bookmark names. * * This class sets a bookmark for every tag in the HTML document that it encounters. * The bookmark name is auto-generated and increments, starting with `1`. These are * internal bookmarks and are automatically released when the referring WP_HTML_Token * goes out of scope and is garbage-collected. * * @since 6.4.0 * * @see WP_HTML_Processor::$release_internal_bookmark_on_destruct * * @var int */ private $bookmark_counter = 0; /** * Stores an explanation for why something failed, if it did. * * @see self::get_last_error * * @since 6.4.0 * * @var string|null */ private $last_error = null; /** * Releases a bookmark when PHP garbage-collects its wrapping WP_HTML_Token instance. * * This function is created inside the class constructor so that it can be passed to * the stack of open elements and the stack of active formatting elements without * exposing it as a public method on the class. * * @since 6.4.0 * * @var closure */ private $release_internal_bookmark_on_destruct = null; /** * Stores stack events which arise during parsing of the * HTML document, which will then supply the "match" events. * * @since 6.6.0 * * @var WP_HTML_Stack_Event[] */ private $element_queue = array(); /** * Current stack event, if set, representing a matched token. * * Because the parser may internally point to a place further along in a document * than the nodes which have already been processed (some "virtual" nodes may have * appeared while scanning the HTML document), this will point at the "current" node * being processed. It comes from the front of the element queue. * * @since 6.6.0 * * @var ?WP_HTML_Stack_Event */ private $current_element = null; /** * Context node if created as a fragment parser. * * @var ?WP_HTML_Token */ private $context_node = null; /** * Whether the parser has yet processed the context node, * if created as a fragment parser. * * The context node will be initially pushed onto the stack of open elements, * but when created as a fragment parser, this context element (and the implicit * HTML document node above it) should not be exposed as a matched token or node. * * This boolean indicates whether the processor should skip over the current * node in its initial search for the first node created from the input HTML. * * @var bool */ private $has_seen_context_node = false; /* * Public Interface Functions */ /** * Creates an HTML processor in the fragment parsing mode. * * Use this for cases where you are processing chunks of HTML that * will be found within a bigger HTML document, such as rendered * block output that exists within a post, `the_content` inside a * rendered site layout. * * Fragment parsing occurs within a context, which is an HTML element * that the document will eventually be placed in. It becomes important * when special elements have different rules than others, such as inside * a TEXTAREA or a TITLE tag where things that look like tags are text, * or inside a SCRIPT tag where things that look like HTML syntax are JS. * * The context value should be a representation of the tag into which the * HTML is found. For most cases this will be the body element. The HTML * form is provided because a context element may have attributes that * impact the parse, such as with a SCRIPT tag and its `type` attribute. * * ## Current HTML Support * * - The only supported context is `<body>`, which is the default value. * - The only supported document encoding is `UTF-8`, which is the default value. * * @since 6.4.0 * @since 6.6.0 Returns `static` instead of `self` so it can create subclass instances. * * @param string $html Input HTML fragment to process. * @param string $context Context element for the fragment, must be default of `<body>`. * @param string $encoding Text encoding of the document; must be default of 'UTF-8'. * @return static|null The created processor if successful, otherwise null. */ public static function create_fragment( $html, $context = '<body>', $encoding = 'UTF-8' ) { if ( '<body>' !== $context || 'UTF-8' !== $encoding ) { return null; } $processor = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE ); $processor->state->context_node = array( 'BODY', array() ); $processor->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; // @todo Create "fake" bookmarks for non-existent but implied nodes. $processor->bookmarks['root-node'] = new WP_HTML_Span( 0, 0 ); $processor->bookmarks['context-node'] = new WP_HTML_Span( 0, 0 ); $processor->state->stack_of_open_elements->push( new WP_HTML_Token( 'root-node', 'HTML', false ) ); $context_node = new WP_HTML_Token( 'context-node', $processor->state->context_node[0], false ); $processor->state->stack_of_open_elements->push( $context_node ); $processor->context_node = $context_node; return $processor; } /** * Constructor. * * Do not use this method. Use the static creator methods instead. * * @access private * * @since 6.4.0 * * @see WP_HTML_Processor::create_fragment() * * @param string $html HTML to process. * @param string|null $use_the_static_create_methods_instead This constructor should not be called manually. */ public function __construct( $html, $use_the_static_create_methods_instead = null ) { parent::__construct( $html ); if ( self::CONSTRUCTOR_UNLOCK_CODE !== $use_the_static_create_methods_instead ) { _doing_it_wrong( __METHOD__, sprintf( /* translators: %s: WP_HTML_Processor::create_fragment(). */ __( 'Call %s to create an HTML Processor instead of calling the constructor directly.' ), '<code>WP_HTML_Processor::create_fragment()</code>' ), '6.4.0' ); } $this->state = new WP_HTML_Processor_State(); $this->state->stack_of_open_elements->set_push_handler( function ( WP_HTML_Token $token ) { $is_virtual = ! isset( $this->state->current_token ) || $this->is_tag_closer(); $same_node = isset( $this->state->current_token ) && $token->node_name === $this->state->current_token->node_name; $provenance = ( ! $same_node || $is_virtual ) ? 'virtual' : 'real'; $this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::PUSH, $provenance ); } ); $this->state->stack_of_open_elements->set_pop_handler( function ( WP_HTML_Token $token ) { $is_virtual = ! isset( $this->state->current_token ) || ! $this->is_tag_closer(); $same_node = isset( $this->state->current_token ) && $token->node_name === $this->state->current_token->node_name; $provenance = ( ! $same_node || $is_virtual ) ? 'virtual' : 'real'; $this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::POP, $provenance ); } ); /* * Create this wrapper so that it's possible to pass * a private method into WP_HTML_Token classes without * exposing it to any public API. */ $this->release_internal_bookmark_on_destruct = function ( $name ) { parent::release_bookmark( $name ); }; } /** * Returns the last error, if any. * * Various situations lead to parsing failure but this class will * return `false` in all those cases. To determine why something * failed it's possible to request the last error. This can be * helpful to know to distinguish whether a given tag couldn't * be found or if content in the document caused the processor * to give up and abort processing. * * Example * * $processor = WP_HTML_Processor::create_fragment( '<template><strong><button><em><p><em>' ); * false === $processor->next_tag(); * WP_HTML_Processor::ERROR_UNSUPPORTED === $processor->get_last_error(); * * @since 6.4.0 * * @see self::ERROR_UNSUPPORTED * @see self::ERROR_EXCEEDED_MAX_BOOKMARKS * * @return string|null The last error, if one exists, otherwise null. */ public function get_last_error() { return $this->last_error; } /** * Finds the next tag matching the $query. * * @todo Support matching the class name and tag name. * * @since 6.4.0 * @since 6.6.0 Visits all tokens, including virtual ones. * * @throws Exception When unable to allocate a bookmark for the next token in the input HTML document. * * @param array|string|null $query { * Optional. Which tag name to find, having which class, etc. Default is to find any tag. * * @type string|null $tag_name Which tag to find, or `null` for "any tag." * @type string $tag_closers 'visit' to pause at tag closers, 'skip' or unset to only visit openers. * @type int|null $match_offset Find the Nth tag matching all search criteria. * 1 for "first" tag, 3 for "third," etc. * Defaults to first tag. * @type string|null $class_name Tag must contain this whole class name to match. * @type string[] $breadcrumbs DOM sub-path at which element is found, e.g. `array( 'FIGURE', 'IMG' )`. * May also contain the wildcard `*` which matches a single element, e.g. `array( 'SECTION', '*' )`. * } * @return bool Whether a tag was matched. */ public function next_tag( $query = null ) { $visit_closers = isset( $query['tag_closers'] ) && 'visit' === $query['tag_closers']; if ( null === $query ) { while ( $this->next_token() ) { if ( '#tag' !== $this->get_token_type() ) { continue; } if ( ! $this->is_tag_closer() || $visit_closers ) { return true; } } return false; } if ( is_string( $query ) ) { $query = array( 'breadcrumbs' => array( $query ) ); } if ( ! is_array( $query ) ) { _doing_it_wrong( __METHOD__, __( 'Please pass a query array to this function.' ), '6.4.0' ); return false; } $needs_class = ( isset( $query['class_name'] ) && is_string( $query['class_name'] ) ) ? $query['class_name'] : null; if ( ! ( array_key_exists( 'breadcrumbs', $query ) && is_array( $query['breadcrumbs'] ) ) ) { while ( $this->next_token() ) { if ( '#tag' !== $this->get_token_type() ) { continue; } if ( isset( $needs_class ) && ! $this->has_class( $needs_class ) ) { continue; } if ( ! $this->is_tag_closer() || $visit_closers ) { return true; } } return false; } $breadcrumbs = $query['breadcrumbs']; $match_offset = isset( $query['match_offset'] ) ? (int) $query['match_offset'] : 1; while ( $match_offset > 0 && $this->next_token() ) { if ( '#tag' !== $this->get_token_type() || $this->is_tag_closer() ) { continue; } if ( isset( $needs_class ) && ! $this->has_class( $needs_class ) ) { continue; } if ( $this->matches_breadcrumbs( $breadcrumbs ) && 0 === --$match_offset ) { return true; } } return false; } /** * Ensures internal accounting is maintained for HTML semantic rules while * the underlying Tag Processor class is seeking to a bookmark. * * This doesn't currently have a way to represent non-tags and doesn't process * semantic rules for text nodes. For access to the raw tokens consider using * WP_HTML_Tag_Processor instead. * * @since 6.5.0 Added for internal support; do not use. * * @access private * * @return bool */ public function next_token() { $this->current_element = null; if ( isset( $this->last_error ) ) { return false; } if ( 'done' !== $this->has_seen_context_node && 0 === count( $this->element_queue ) && ! $this->step() ) { while ( 'context-node' !== $this->state->stack_of_open_elements->current_node()->bookmark_name && $this->state->stack_of_open_elements->pop() ) { continue; } $this->has_seen_context_node = 'done'; return $this->next_token(); } $this->current_element = array_shift( $this->element_queue ); while ( isset( $this->context_node ) && ! $this->has_seen_context_node ) { if ( isset( $this->current_element ) ) { if ( $this->context_node === $this->current_element->token && WP_HTML_Stack_Event::PUSH === $this->current_element->operation ) { $this->has_seen_context_node = true; return $this->next_token(); } } $this->current_element = array_shift( $this->element_queue ); } if ( ! isset( $this->current_element ) ) { if ( 'done' === $this->has_seen_context_node ) { return false; } else { return $this->next_token(); } } if ( isset( $this->context_node ) && WP_HTML_Stack_Event::POP === $this->current_element->operation && $this->context_node === $this->current_element->token ) { $this->element_queue = array(); $this->current_element = null; return false; } // Avoid sending close events for elements which don't expect a closing. if ( WP_HTML_Stack_Event::POP === $this->current_element->operation && ! static::expects_closer( $this->current_element->token ) ) { return $this->next_token(); } return true; } /** * Indicates if the current tag token is a tag closer. * * Example: * * $p = WP_HTML_Processor::create_fragment( '<div></div>' ); * $p->next_tag( array( 'tag_name' => 'div', 'tag_closers' => 'visit' ) ); * $p->is_tag_closer() === false; * * $p->next_tag( array( 'tag_name' => 'div', 'tag_closers' => 'visit' ) ); * $p->is_tag_closer() === true; * * @since 6.6.0 Subclassed for HTML Processor. * * @return bool Whether the current tag is a tag closer. */ public function is_tag_closer() { return $this->is_virtual() ? ( WP_HTML_Stack_Event::POP === $this->current_element->operation && '#tag' === $this->get_token_type() ) : parent::is_tag_closer(); } /** * Indicates if the currently-matched token is virtual, created by a stack operation * while processing HTML, rather than a token found in the HTML text itself. * * @since 6.6.0 * * @return bool Whether the current token is virtual. */ private function is_virtual() { return ( isset( $this->current_element->provenance ) && 'virtual' === $this->current_element->provenance ); } /** * Indicates if the currently-matched tag matches the given breadcrumbs. * * A "*" represents a single tag wildcard, where any tag matches, but not no tags. * * At some point this function _may_ support a `**` syntax for matching any number * of unspecified tags in the breadcrumb stack. This has been intentionally left * out, however, to keep this function simple and to avoid introducing backtracking, * which could open up surprising performance breakdowns. * * Example: * * $processor = WP_HTML_Processor::create_fragment( '<div><span><figure><img></figure></span></div>' ); * $processor->next_tag( 'img' ); * true === $processor->matches_breadcrumbs( array( 'figure', 'img' ) ); * true === $processor->matches_breadcrumbs( array( 'span', 'figure', 'img' ) ); * false === $processor->matches_breadcrumbs( array( 'span', 'img' ) ); * true === $processor->matches_breadcrumbs( array( 'span', '*', 'img' ) ); * * @since 6.4.0 * * @param string[] $breadcrumbs DOM sub-path at which element is found, e.g. `array( 'FIGURE', 'IMG' )`. * May also contain the wildcard `*` which matches a single element, e.g. `array( 'SECTION', '*' )`. * @return bool Whether the currently-matched tag is found at the given nested structure. */ public function matches_breadcrumbs( $breadcrumbs ) { // Everything matches when there are zero constraints. if ( 0 === count( $breadcrumbs ) ) { return true; } // Start at the last crumb. $crumb = end( $breadcrumbs ); if ( '*' !== $crumb && $this->get_tag() !== strtoupper( $crumb ) ) { return false; } foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) { $crumb = strtoupper( current( $breadcrumbs ) ); if ( '*' !== $crumb && $node->node_name !== $crumb ) { return false; } if ( false === prev( $breadcrumbs ) ) { return true; } } return false; } /** * Indicates if the currently-matched node expects a closing * token, or if it will self-close on the next step. * * Most HTML elements expect a closer, such as a P element or * a DIV element. Others, like an IMG element are void and don't * have a closing tag. Special elements, such as SCRIPT and STYLE, * are treated just like void tags. Text nodes and self-closing * foreign content will also act just like a void tag, immediately * closing as soon as the processor advances to the next token. * * @since 6.6.0 * * @todo When adding support for foreign content, ensure that * this returns false for self-closing elements in the * SVG and MathML namespace. * * @param ?WP_HTML_Token $node Node to examine instead of current node, if provided. * @return bool Whether to expect a closer for the currently-matched node, * or `null` if not matched on any token. */ public function expects_closer( $node = null ) { $token_name = $node->node_name ?? $this->get_token_name(); if ( ! isset( $token_name ) ) { return null; } return ! ( // Comments, text nodes, and other atomic tokens. '#' === $token_name[0] || // Doctype declarations. 'html' === $token_name || // Void elements. self::is_void( $token_name ) || // Special atomic elements. in_array( $token_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) ); } /** * Steps through the HTML document and stop at the next tag, if any. * * @since 6.4.0 * * @throws Exception When unable to allocate a bookmark for the next token in the input HTML document. * * @see self::PROCESS_NEXT_NODE * @see self::REPROCESS_CURRENT_NODE * * @param string $node_to_process Whether to parse the next node or reprocess the current node. * @return bool Whether a tag was matched. */ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { // Refuse to proceed if there was a previous error. if ( null !== $this->last_error ) { return false; } if ( self::REPROCESS_CURRENT_NODE !== $node_to_process ) { /* * Void elements still hop onto the stack of open elements even though * there's no corresponding closing tag. This is important for managing * stack-based operations such as "navigate to parent node" or checking * on an element's breadcrumbs. * * When moving on to the next node, therefore, if the bottom-most element * on the stack is a void element, it must be closed. * * @todo Once self-closing foreign elements and BGSOUND are supported, * they must also be implicitly closed here too. BGSOUND is * special since it's only self-closing if the self-closing flag * is provided in the opening tag, otherwise it expects a tag closer. */ $top_node = $this->state->stack_of_open_elements->current_node(); if ( isset( $top_node ) && ! static::expects_closer( $top_node ) ) { $this->state->stack_of_open_elements->pop(); } } if ( self::PROCESS_NEXT_NODE === $node_to_process ) { parent::next_token(); } // Finish stepping when there are no more tokens in the document. if ( WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state || WP_HTML_Tag_Processor::STATE_COMPLETE === $this->parser_state ) { return false; } $this->state->current_token = new WP_HTML_Token( $this->bookmark_token(), $this->get_token_name(), $this->has_self_closing_flag(), $this->release_internal_bookmark_on_destruct ); try { switch ( $this->state->insertion_mode ) { case WP_HTML_Processor_State::INSERTION_MODE_IN_BODY: return $this->step_in_body(); default: $this->last_error = self::ERROR_UNSUPPORTED; throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." ); } } catch ( WP_HTML_Unsupported_Exception $e ) { /* * Exceptions are used in this class to escape deep call stacks that * otherwise might involve messier calling and return conventions. */ return false; } } /** * Computes the HTML breadcrumbs for the currently-matched node, if matched. * * Breadcrumbs start at the outermost parent and descend toward the matched element. * They always include the entire path from the root HTML node to the matched element. * * @todo It could be more efficient to expose a generator-based version of this function * to avoid creating the array copy on tag iteration. If this is done, it would likely * be more useful to walk up the stack when yielding instead of starting at the top. * * Example * * $processor = WP_HTML_Processor::create_fragment( '<p><strong><em><img></em></strong></p>' ); * $processor->next_tag( 'IMG' ); * $processor->get_breadcrumbs() === array( 'HTML', 'BODY', 'P', 'STRONG', 'EM', 'IMG' ); * * @since 6.4.0 * * @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL. */ public function get_breadcrumbs() { $breadcrumbs = array(); foreach ( $this->state->stack_of_open_elements->walk_down() as $stack_item ) { $breadcrumbs[] = $stack_item->node_name; } if ( ! $this->is_virtual() ) { return $breadcrumbs; } foreach ( $this->element_queue as $queue_item ) { if ( $this->current_element->token->bookmark_name === $queue_item->token->bookmark_name ) { break; } if ( 'context-node' === $queue_item->token->bookmark_name ) { break; } if ( 'real' === $queue_item->provenance ) { break; } if ( WP_HTML_Stack_Event::PUSH === $queue_item->operation ) { $breadcrumbs[] = $queue_item->token->node_name; } else { array_pop( $breadcrumbs ); } } if ( null !== parent::get_token_name() && ! parent::is_tag_closer() ) { array_pop( $breadcrumbs ); } // Add the virtual node we're at. if ( WP_HTML_Stack_Event::PUSH === $this->current_element->operation ) { $breadcrumbs[] = $this->current_element->token->node_name; } return $breadcrumbs; } /** * Returns the nesting depth of the current location in the document. * * Example: * * $processor = WP_HTML_Processor::create_fragment( '<div><p></p></div>' ); * // The processor starts in the BODY context, meaning it has depth from the start: HTML > BODY. * 2 === $processor->get_current_depth(); * * // Opening the DIV element increases the depth. * $processor->next_token(); * 3 === $processor->get_current_depth(); * * // Opening the P element increases the depth. * $processor->next_token(); * 4 === $processor->get_current_depth(); * * // The P element is closed during `next_token()` so the depth is decreased to reflect that. * $processor->next_token(); * 3 === $processor->get_current_depth(); * * @since 6.6.0 * * @return int Nesting-depth of current location in the document. */ public function get_current_depth() { return $this->is_virtual() ? count( $this->get_breadcrumbs() ) : $this->state->stack_of_open_elements->count(); } /** * Parses next element in the 'in body' insertion mode. * * This internal function performs the 'in body' insertion mode * logic for the generalized WP_HTML_Processor::step() function. * * @since 6.4.0 * * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. * * @see https://html.spec.whatwg.org/#parsing-main-inbody * @see WP_HTML_Processor::step * * @return bool Whether an element was found. */ private function step_in_body() { $token_name = $this->get_token_name(); $token_type = $this->get_token_type(); $op_sigil = '#tag' === $token_type ? ( parent::is_tag_closer() ? '-' : '+' ) : ''; $op = "{$op_sigil}{$token_name}"; switch ( $op ) { case '#comment': case '#funky-comment': case '#presumptuous-tag': $this->insert_html_element( $this->state->current_token ); return true; case '#text': $this->reconstruct_active_formatting_elements(); $current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ]; /* * > A character token that is U+0000 NULL * * Any successive sequence of NULL bytes is ignored and won't * trigger active format reconstruction. Therefore, if the text * only comprises NULL bytes then the token should be ignored * here, but if there are any other characters in the stream * the active formats should be reconstructed. */ if ( 1 <= $current_token->length && "\x00" === $this->html[ $current_token->start ] && strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length ) { // Parse error: ignore the token. return $this->step(); } /* * Whitespace-only text does not affect the frameset-ok flag. * It is probably inter-element whitespace, but it may also * contain character references which decode only to whitespace. */ $text = $this->get_modifiable_text(); if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) { $this->state->frameset_ok = false; } $this->insert_html_element( $this->state->current_token ); return true; case 'html': /* * > A DOCTYPE token * > Parse error. Ignore the token. */ return $this->step(); /* * > A start tag whose tag name is "button" */ case '+BUTTON': if ( $this->state->stack_of_open_elements->has_element_in_scope( 'BUTTON' ) ) { // @todo Indicate a parse error once it's possible. This error does not impact the logic here. $this->generate_implied_end_tags(); $this->state->stack_of_open_elements->pop_until( 'BUTTON' ); } $this->reconstruct_active_formatting_elements(); $this->insert_html_element( $this->state->current_token ); $this->state->frameset_ok = false; return true; /* * > A start tag whose tag name is one of: "address", "article", "aside", * > "blockquote", "center", "details", "dialog", "dir", "div", "dl", * > "fieldset", "figcaption", "figure", "footer", "header", "hgroup", * > "main", "menu", "nav", "ol", "p", "search", "section", "summary", "ul" */ case '+ADDRESS': case '+ARTICLE': case '+ASIDE': case '+BLOCKQUOTE': case '+CENTER': case '+DETAILS': case '+DIALOG': case '+DIR': case '+DIV': case '+DL': case '+FIELDSET': case '+FIGCAPTION': case '+FIGURE': case '+FOOTER': case '+HEADER': case '+HGROUP': case '+MAIN': case '+MENU': case '+NAV': case '+OL': case '+P': case '+SEARCH': case '+SECTION': case '+SUMMARY': case '+UL': if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { $this->close_a_p_element(); } $this->insert_html_element( $this->state->current_token ); return true; /* * > An end tag whose tag name is one of: "address", "article", "aside", "blockquote", * > "button", "center", "details", "dialog", "dir", "div", "dl", "fieldset", * > "figcaption", "figure", "footer", "header", "hgroup", "listing", "main", * > "menu", "nav", "ol", "pre", "search", "section", "summary", "ul" */ case '-ADDRESS': case '-ARTICLE': case '-ASIDE': case '-BLOCKQUOTE': case '-BUTTON': case '-CENTER': case '-DETAILS': case '-DIALOG': case '-DIR': case '-DIV': case '-DL': case '-FIELDSET': case '-FIGCAPTION': case '-FIGURE': case '-FOOTER': case '-HEADER': case '-HGROUP': case '-LISTING': case '-MAIN': case '-MENU': case '-NAV': case '-OL': case '-PRE': case '-SEARCH': case '-SECTION': case '-SUMMARY': case '-UL': if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name ) ) { // @todo Report parse error. // Ignore the token. return $this->step(); } $this->generate_implied_end_tags(); if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) { // @todo Record parse error: this error doesn't impact parsing. } $this->state->stack_of_open_elements->pop_until( $token_name ); return true; /* * > A start tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6" */ case '+H1': case '+H2': case '+H3': case '+H4': case '+H5': case '+H6': if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { $this->close_a_p_element(); } if ( in_array( $this->state->stack_of_open_elements->current_node()->node_name, array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), true ) ) { // @todo Indicate a parse error once it's possible. $this->state->stack_of_open_elements->pop(); } $this->insert_html_element( $this->state->current_token ); return true; /* * > A start tag whose tag name is one of: "pre", "listing" */ case '+PRE': case '+LISTING': if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { $this->close_a_p_element(); } $this->insert_html_element( $this->state->current_token ); $this->state->frameset_ok = false; return true; /* * > An end tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6" */ case '-H1': case '-H2': case '-H3': case '-H4': case '-H5': case '-H6': if ( ! $this->state->stack_of_open_elements->has_element_in_scope( '(internal: H1 through H6 - do not use)' ) ) { /* * This is a parse error; ignore the token. * * @todo Indicate a parse error once it's possible. */ return $this->step(); } $this->generate_implied_end_tags(); if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) { // @todo Record parse error: this error doesn't impact parsing. } $this->state->stack_of_open_elements->pop_until( '(internal: H1 through H6 - do not use)' ); return true; /* * > A start tag whose tag name is "li" * > A start tag whose tag name is one of: "dd", "dt" */ case '+DD': case '+DT': case '+LI': $this->state->frameset_ok = false; $node = $this->state->stack_of_open_elements->current_node(); $is_li = 'LI' === $token_name; in_body_list_loop: /* * The logic for LI and DT/DD is the same except for one point: LI elements _only_ * close other LI elements, but a DT or DD element closes _any_ open DT or DD element. */ if ( $is_li ? 'LI' === $node->node_name : ( 'DD' === $node->node_name || 'DT' === $node->node_name ) ) { $node_name = $is_li ? 'LI' : $node->node_name; $this->generate_implied_end_tags( $node_name ); if ( $node_name !== $this->state->stack_of_open_elements->current_node()->node_name ) { // @todo Indicate a parse error once it's possible. This error does not impact the logic here. } $this->state->stack_of_open_elements->pop_until( $node_name ); goto in_body_list_done; } if ( 'ADDRESS' !== $node->node_name && 'DIV' !== $node->node_name && 'P' !== $node->node_name && $this->is_special( $node->node_name ) ) { /* * > If node is in the special category, but is not an address, div, * > or p element, then jump to the step labeled done below. */ goto in_body_list_done; } else { /* * > Otherwise, set node to the previous entry in the stack of open elements * > and return to the step labeled loop. */ foreach ( $this->state->stack_of_open_elements->walk_up( $node ) as $item ) { $node = $item; break; } goto in_body_list_loop; } in_body_list_done: if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { $this->close_a_p_element(); } $this->insert_html_element( $this->state->current_token ); return true; /* * > An end tag whose tag name is "li" * > An end tag whose tag name is one of: "dd", "dt" */ case '-DD': case '-DT': case '-LI': if ( /* * An end tag whose tag name is "li": * If the stack of open elements does not have an li element in list item scope, * then this is a parse error; ignore the token. */ ( 'LI' === $token_name && ! $this->state->stack_of_open_elements->has_element_in_list_item_scope( 'LI' ) ) || /* * An end tag whose tag name is one of: "dd", "dt": * If the stack of open elements does not have an element in scope that is an * HTML element with the same tag name as that of the token, then this is a * parse error; ignore the token. */ ( 'LI' !== $token_name && ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name ) ) ) { /* * This is a parse error, ignore the token. * * @todo Indicate a parse error once it's possible. */ return $this->step(); } $this->generate_implied_end_tags( $token_name ); if ( $token_name !== $this->state->stack_of_open_elements->current_node()->node_name ) { // @todo Indicate a parse error once it's possible. This error does not impact the logic here. } $this->state->stack_of_open_elements->pop_until( $token_name ); return true; /* * > An end tag whose tag name is "p" */ case '-P': if ( ! $this->state->stack_of_open_elements->has_p_in_button_scope() ) { $this->insert_html_element( $this->state->current_token ); } $this->close_a_p_element(); return true; // > A start tag whose tag name is "a" case '+A': foreach ( $this->state->active_formatting_elements->walk_up() as $item ) { switch ( $item->node_name ) { case 'marker': break; case 'A': $this->run_adoption_agency_algorithm(); $this->state->active_formatting_elements->remove_node( $item ); $this->state->stack_of_open_elements->remove_node( $item ); break; } } $this->reconstruct_active_formatting_elements(); $this->insert_html_element( $this->state->current_token ); $this->state->active_formatting_elements->push( $this->state->current_token ); return true; /* * > A start tag whose tag name is one of: "b", "big", "code", "em", "font", "i", * > "s", "small", "strike", "strong", "tt", "u" */ case '+B': case '+BIG': case '+CODE': case '+EM': case '+FONT': case '+I': case '+S': case '+SMALL': case '+STRIKE': case '+STRONG': case '+TT': case '+U': $this->reconstruct_active_formatting_elements(); $this->insert_html_element( $this->state->current_token ); $this->state->active_formatting_elements->push( $this->state->current_token ); return true; /* * > An end tag whose tag name is one of: "a", "b", "big", "code", "em", "font", "i", * > "nobr", "s", "small", "strike", "strong", "tt", "u" */ case '-A': case '-B': case '-BIG': case '-CODE': case '-EM': case '-FONT': case '-I': case '-S': case '-SMALL': case '-STRIKE': case '-STRONG': case '-TT': case '-U': $this->run_adoption_agency_algorithm(); return true; /* * > An end tag whose tag name is "br" * > Parse error. Drop the attributes from the token, and act as described in the next * > entry; i.e. act as if this was a "br" start tag token with no attributes, rather * > than the end tag token that it actually is. */ case '-BR': $this->last_error = self::ERROR_UNSUPPORTED; throw new WP_HTML_Unsupported_Exception( 'Closing BR tags require unimplemented special handling.' ); /* * > A start tag whose tag name is one of: "area", "br", "embed", "img", "keygen", "wbr" */ case '+AREA': case '+BR': case '+EMBED': case '+IMG': case '+KEYGEN': case '+WBR': $this->reconstruct_active_formatting_elements(); $this->insert_html_element( $this->state->current_token ); $this->state->frameset_ok = false; return true; /* * > A start tag whose tag name is "input" */ case '+INPUT': $this->reconstruct_active_formatting_elements(); $this->insert_html_element( $this->state->current_token ); $type_attribute = $this->get_attribute( 'type' ); /* * > If the token does not have an attribute with the name "type", or if it does, * > but that attribute's value is not an ASCII case-insensitive match for the * > string "hidden", then: set the frameset-ok flag to "not ok". */ if ( ! is_string( $type_attribute ) || 'hidden' !== strtolower( $type_attribute ) ) { $this->state->frameset_ok = false; } return true; /* * > A start tag whose tag name is "hr" */ case '+HR': if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { $this->close_a_p_element(); } $this->insert_html_element( $this->state->current_token ); $this->state->frameset_ok = false; return true; /* * > A start tag whose tag name is one of: "param", "source", "track" */ case '+PARAM': case '+SOURCE': case '+TRACK': $this->insert_html_element( $this->state->current_token ); return true; } /* * These tags require special handling in the 'in body' insertion mode * but that handling hasn't yet been implemented. * * As the rules for each tag are implemented, the corresponding tag * name should be removed from this list. An accompanying test should * help ensure this list is maintained. * * @see Tests_HtmlApi_WpHtmlProcessor::test_step_in_body_fails_on_unsupported_tags * * Since this switch structure throws a WP_HTML_Unsupported_Exception, it's * possible to handle "any other start tag" and "any other end tag" below, * as that guarantees execution doesn't proceed for the unimplemented tags. * * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody */ switch ( $token_name ) { case 'APPLET': case 'BASE': case 'BASEFONT': case 'BGSOUND': case 'BODY': case 'CAPTION': case 'COL': case 'COLGROUP': case 'FORM': case 'FRAME': case 'FRAMESET': case 'HEAD': case 'HTML': case 'IFRAME': case 'LINK': case 'MARQUEE': case 'MATH': case 'META': case 'NOBR': case 'NOEMBED': case 'NOFRAMES': case 'NOSCRIPT': case 'OBJECT': case 'OPTGROUP': case 'OPTION': case 'PLAINTEXT': case 'RB': case 'RP': case 'RT': case 'RTC': case 'SARCASM': case 'SCRIPT': case 'SELECT': case 'STYLE': case 'SVG': case 'TABLE': case 'TBODY': case 'TD': case 'TEMPLATE': case 'TEXTAREA': case 'TFOOT': case 'TH': case 'THEAD': case 'TITLE': case 'TR': case 'XMP': $this->last_error = self::ERROR_UNSUPPORTED; throw new WP_HTML_Unsupported_Exception( "Cannot process {$token_name} element." ); } if ( ! parent::is_tag_closer() ) { /* * > Any other start tag */ $this->reconstruct_active_formatting_elements(); $this->insert_html_element( $this->state->current_token ); return true; } else { /* * > Any other end tag */ /* * Find the corresponding tag opener in the stack of open elements, if * it exists before reaching a special element, which provides a kind * of boundary in the stack. For example, a `</custom-tag>` should not * close anything beyond its containing `P` or `DIV` element. */ foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) { if ( $token_name === $node->node_name ) { break; } if ( self::is_special( $node->node_name ) ) { // This is a parse error, ignore the token. return $this->step(); } } $this->generate_implied_end_tags( $token_name ); if ( $node !== $this->state->stack_of_open_elements->current_node() ) { // @todo Record parse error: this error doesn't impact parsing. } foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { $this->state->stack_of_open_elements->pop(); if ( $node === $item ) { return true; } } } } /* * Internal helpers */ /** * Creates a new bookmark for the currently-matched token and returns the generated name. * * @since 6.4.0 * @since 6.5.0 Renamed from bookmark_tag() to bookmark_token(). * * @throws Exception When unable to allocate requested bookmark. * * @return string|false Name of created bookmark, or false if unable to create. */ private function bookmark_token() { if ( ! parent::set_bookmark( ++$this->bookmark_counter ) ) { $this->last_error = self::ERROR_EXCEEDED_MAX_BOOKMARKS; throw new Exception( 'could not allocate bookmark' ); } return "{$this->bookmark_counter}"; } /* * HTML semantic overrides for Tag Processor */ /** * Returns the uppercase name of the matched tag. * * The semantic rules for HTML specify that certain tags be reprocessed * with a different tag name. Because of this, the tag name presented * by the HTML Processor may differ from the one reported by the HTML * Tag Processor, which doesn't apply these semantic rules. * * Example: * * $processor = new WP_HTML_Tag_Processor( '<div class="test">Test</div>' ); * $processor->next_tag() === true; * $processor->get_tag() === 'DIV'; * * $processor->next_tag() === false; * $processor->get_tag() === null; * * @since 6.4.0 * * @return string|null Name of currently matched tag in input HTML, or `null` if none found. */ public function get_tag() { if ( null !== $this->last_error ) { return null; } if ( $this->is_virtual() ) { return $this->current_element->token->node_name; } $tag_name = parent::get_tag(); switch ( $tag_name ) { case 'IMAGE': /* * > A start tag whose tag name is "image" * > Change the token's tag name to "img" and reprocess it. (Don't ask.) */ return 'IMG'; default: return $tag_name; } } /** * Indicates if the currently matched tag contains the self-closing flag. * * No HTML elements ought to have the self-closing flag and for those, the self-closing * flag will be ignored. For void elements this is benign because they "self close" * automatically. For non-void HTML elements though problems will appear if someone * intends to use a self-closing element in place of that element with an empty body. * For HTML foreign elements and custom elements the self-closing flag determines if * they self-close or not. * * This function does not determine if a tag is self-closing, * but only if the self-closing flag is present in the syntax. * * @since 6.6.0 Subclassed for the HTML Processor. * * @return bool Whether the currently matched tag contains the self-closing flag. */ public function has_self_closing_flag() { return $this->is_virtual() ? false : parent::has_self_closing_flag(); } /** * Returns the node name represented by the token. * * This matches the DOM API value `nodeName`. Some values * are static, such as `#text` for a text node, while others * are dynamically generated from the token itself. * * Dynamic names: * - Uppercase tag name for tag matches. * - `html` for DOCTYPE declarations. * * Note that if the Tag Processor is not matched on a token * then this function will return `null`, either because it * hasn't yet found a token or because it reached the end * of the document without matching a token. * * @since 6.6.0 Subclassed for the HTML Processor. * * @return string|null Name of the matched token. */ public function get_token_name() { return $this->is_virtual() ? $this->current_element->token->node_name : parent::get_token_name(); } /** * Indicates the kind of matched token, if any. * * This differs from `get_token_name()` in that it always * returns a static string indicating the type, whereas * `get_token_name()` may return values derived from the * token itself, such as a tag name or processing * instruction tag. * * Possible values: * - `#tag` when matched on a tag. * - `#text` when matched on a text node. * - `#cdata-section` when matched on a CDATA node. * - `#comment` when matched on a comment. * - `#doctype` when matched on a DOCTYPE declaration. * - `#presumptuous-tag` when matched on an empty tag closer. * - `#funky-comment` when matched on a funky comment. * * @since 6.6.0 Subclassed for the HTML Processor. * * @return string|null What kind of token is matched, or null. */ public function get_token_type() { if ( $this->is_virtual() ) { /* * This logic comes from the Tag Processor. * * @todo It would be ideal not to repeat this here, but it's not clearly * better to allow passing a token name to `get_token_type()`. */ $node_name = $this->current_element->token->node_name; $starting_char = $node_name[0]; if ( 'A' <= $starting_char && 'Z' >= $starting_char ) { return '#tag'; } if ( 'html' === $node_name ) { return '#doctype'; } return $node_name; } return parent::get_token_type(); } /** * Returns the value of a requested attribute from a matched tag opener if that attribute exists. * * Example: * * $p = WP_HTML_Processor::create_fragment( '<div enabled class="test" data-test-id="14">Test</div>' ); * $p->next_token() === true; * $p->get_attribute( 'data-test-id' ) === '14'; * $p->get_attribute( 'enabled' ) === true; * $p->get_attribute( 'aria-label' ) === null; * * $p->next_tag() === false; * $p->get_attribute( 'class' ) === null; * * @since 6.6.0 Subclassed for HTML Processor. * * @param string $name Name of attribute whose value is requested. * @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`. */ public function get_attribute( $name ) { return $this->is_virtual() ? null : parent::get_attribute( $name ); } /** * Updates or creates a new attribute on the currently matched tag with the passed value. * * For boolean attributes special handling is provided: * - When `true` is passed as the value, then only the attribute name is added to the tag. * - When `false` is passed, the attribute gets removed if it existed before. * * For string attributes, the value is escaped using the `esc_attr` function. * * @since 6.6.0 Subclassed for the HTML Processor. * * @param string $name The attribute name to target. * @param string|bool $value The new attribute value. * @return bool Whether an attribute value was set. */ public function set_attribute( $name, $value ) { return $this->is_virtual() ? false : parent::set_attribute( $name, $value ); } /** * Remove an attribute from the currently-matched tag. * * @since 6.6.0 Subclassed for HTML Processor. * * @param string $name The attribute name to remove. * @return bool Whether an attribute was removed. */ public function remove_attribute( $name ) { return $this->is_virtual() ? false : parent::remove_attribute( $name ); } /** * Gets lowercase names of all attributes matching a given prefix in the current tag. * * Note that matching is case-insensitive. This is in accordance with the spec: * * > There must never be two or more attributes on * > the same start tag whose names are an ASCII * > case-insensitive match for each other. * - HTML 5 spec * * Example: * * $p = new WP_HTML_Tag_Processor( '<div data-ENABLED class="test" DATA-test-id="14">Test</div>' ); * $p->next_tag( array( 'class_name' => 'test' ) ) === true; * $p->get_attribute_names_with_prefix( 'data-' ) === array( 'data-enabled', 'data-test-id' ); * * $p->next_tag() === false; * $p->get_attribute_names_with_prefix( 'data-' ) === null; * * @since 6.6.0 Subclassed for the HTML Processor. * * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive * * @param string $prefix Prefix of requested attribute names. * @return array|null List of attribute names, or `null` when no tag opener is matched. */ public function get_attribute_names_with_prefix( $prefix ) { return $this->is_virtual() ? null : parent::get_attribute_names_with_prefix( $prefix ); } /** * Adds a new class name to the currently matched tag. * * @since 6.6.0 Subclassed for the HTML Processor. * * @param string $class_name The class name to add. * @return bool Whether the class was set to be added. */ public function add_class( $class_name ) { return $this->is_virtual() ? false : parent::add_class( $class_name ); } /** * Removes a class name from the currently matched tag. * * @since 6.6.0 Subclassed for the HTML Processor. * * @param string $class_name The class name to remove. * @return bool Whether the class was set to be removed. */ public function remove_class( $class_name ) { return $this->is_virtual() ? false : parent::remove_class( $class_name ); } /** * Returns if a matched tag contains the given ASCII case-insensitive class name. * * @since 6.6.0 Subclassed for the HTML Processor. * * @param string $wanted_class Look for this CSS class name, ASCII case-insensitive. * @return bool|null Whether the matched tag contains the given class name, or null if not matched. */ public function has_class( $wanted_class ) { return $this->is_virtual() ? null : parent::has_class( $wanted_class ); } /** * Generator for a foreach loop to step through each class name for the matched tag. * * This generator function is designed to be used inside a "foreach" loop. * * Example: * * $p = WP_HTML_Processor::create_fragment( "<div class='free <egg<\tlang-en'>" ); * $p->next_tag(); * foreach ( $p->class_list() as $class_name ) { * echo "{$class_name} "; * } * // Outputs: "free <egg> lang-en " * * @since 6.6.0 Subclassed for the HTML Processor. */ public function class_list() { return $this->is_virtual() ? null : parent::class_list(); } /** * Returns the modifiable text for a matched token, or an empty string. * * Modifiable text is text content that may be read and changed without * changing the HTML structure of the document around it. This includes * the contents of `#text` nodes in the HTML as well as the inner * contents of HTML comments, Processing Instructions, and others, even * though these nodes aren't part of a parsed DOM tree. They also contain * the contents of SCRIPT and STYLE tags, of TEXTAREA tags, and of any * other section in an HTML document which cannot contain HTML markup (DATA). * * If a token has no modifiable text then an empty string is returned to * avoid needless crashing or type errors. An empty string does not mean * that a token has modifiable text, and a token with modifiable text may * have an empty string (e.g. a comment with no contents). * * @since 6.6.0 Subclassed for the HTML Processor. * * @return string */ public function get_modifiable_text() { return $this->is_virtual() ? '' : parent::get_modifiable_text(); } /** * Indicates what kind of comment produced the comment node. * * Because there are different kinds of HTML syntax which produce * comments, the Tag Processor tracks and exposes this as a type * for the comment. Nominally only regular HTML comments exist as * they are commonly known, but a number of unrelated syntax errors * also produce comments. * * @see self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT * @see self::COMMENT_AS_CDATA_LOOKALIKE * @see self::COMMENT_AS_INVALID_HTML * @see self::COMMENT_AS_HTML_COMMENT * @see self::COMMENT_AS_PI_NODE_LOOKALIKE * * @since 6.6.0 Subclassed for the HTML Processor. * * @return string|null */ public function get_comment_type() { return $this->is_virtual() ? null : parent::get_comment_type(); } /** * Removes a bookmark that is no longer needed. * * Releasing a bookmark frees up the small * performance overhead it requires. * * @since 6.4.0 * * @param string $bookmark_name Name of the bookmark to remove. * @return bool Whether the bookmark already existed before removal. */ public function release_bookmark( $bookmark_name ) { return parent::release_bookmark( "_{$bookmark_name}" ); } /** * Moves the internal cursor in the HTML Processor to a given bookmark's location. * * Be careful! Seeking backwards to a previous location resets the parser to the * start of the document and reparses the entire contents up until it finds the * sought-after bookmarked location. * * In order to prevent accidental infinite loops, there's a * maximum limit on the number of times seek() can be called. * * @throws Exception When unable to allocate a bookmark for the next token in the input HTML document. * * @since 6.4.0 * * @param string $bookmark_name Jump to the place in the document identified by this bookmark name. * @return bool Whether the internal cursor was successfully moved to the bookmark's location. */ public function seek( $bookmark_name ) { // Flush any pending updates to the document before beginning. $this->get_updated_html(); $actual_bookmark_name = "_{$bookmark_name}"; $processor_started_at = $this->state->current_token ? $this->bookmarks[ $this->state->current_token->bookmark_name ]->start : 0; $bookmark_starts_at = $this->bookmarks[ $actual_bookmark_name ]->start; $bookmark_length = $this->bookmarks[ $actual_bookmark_name ]->length; $direction = $bookmark_starts_at > $processor_started_at ? 'forward' : 'backward'; /* * If seeking backwards, it's possible that the sought-after bookmark exists within an element * which has been closed before the current cursor; in other words, it has already been removed * from the stack of open elements. This means that it's insufficient to simply pop off elements * from the stack of open elements which appear after the bookmarked location and then jump to * that location, as the elements which were open before won't be re-opened. * * In order to maintain consistency, the HTML Processor rewinds to the start of the document * and reparses everything until it finds the sought-after bookmark. * * There are potentially better ways to do this: cache the parser state for each bookmark and * restore it when seeking; store an immutable and idempotent register of where elements open * and close. * * If caching the parser state it will be essential to properly maintain the cached stack of * open elements and active formatting elements when modifying the document. This could be a * tedious and time-consuming process as well, and so for now will not be performed. * * It may be possible to track bookmarks for where elements open and close, and in doing so * be able to quickly recalculate breadcrumbs for any element in the document. It may even * be possible to remove the stack of open elements and compute it on the fly this way. * If doing this, the parser would need to track the opening and closing locations for all * tokens in the breadcrumb path for any and all bookmarks. By utilizing bookmarks themselves * this list could be automatically maintained while modifying the document. Finding the * breadcrumbs would then amount to traversing that list from the start until the token * being inspected. Once an element closes, if there are no bookmarks pointing to locations * within that element, then all of these locations may be forgotten to save on memory use * and computation time. */ if ( 'backward' === $direction ) { /* * Instead of clearing the parser state and starting fresh, calling the stack methods * maintains the proper flags in the parser. */ foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { if ( 'context-node' === $item->bookmark_name ) { break; } $this->state->stack_of_open_elements->remove_node( $item ); } foreach ( $this->state->active_formatting_elements->walk_up() as $item ) { if ( 'context-node' === $item->bookmark_name ) { break; } $this->state->active_formatting_elements->remove_node( $item ); } parent::seek( 'context-node' ); $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; $this->state->frameset_ok = true; $this->element_queue = array(); $this->current_element = null; } // When moving forwards, reparse the document until reaching the same location as the original bookmark. if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) { return true; } while ( $this->next_token() ) { if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) { while ( isset( $this->current_element ) && WP_HTML_Stack_Event::POP === $this->current_element->operation ) { $this->current_element = array_shift( $this->element_queue ); } return true; } } return false; } /** * Sets a bookmark in the HTML document. * * Bookmarks represent specific places or tokens in the HTML * document, such as a tag opener or closer. When applying * edits to a document, such as setting an attribute, the * text offsets of that token may shift; the bookmark is * kept updated with those shifts and remains stable unless * the entire span of text in which the token sits is removed. * * Release bookmarks when they are no longer needed. * * Example: * * <main><h2>Surprising fact you may not know!</h2></main> * ^ ^ * \-|-- this `H2` opener bookmark tracks the token * * <main class="clickbait"><h2>Surprising fact you may no… * ^ ^ * \-|-- it shifts with edits * * Bookmarks provide the ability to seek to a previously-scanned * place in the HTML document. This avoids the need to re-scan * the entire document. * * Example: * * <ul><li>One</li><li>Two</li><li>Three</li></ul> * ^^^^ * want to note this last item * * $p = new WP_HTML_Tag_Processor( $html ); * $in_list = false; * while ( $p->next_tag( array( 'tag_closers' => $in_list ? 'visit' : 'skip' ) ) ) { * if ( 'UL' === $p->get_tag() ) { * if ( $p->is_tag_closer() ) { * $in_list = false; * $p->set_bookmark( 'resume' ); * if ( $p->seek( 'last-li' ) ) { * $p->add_class( 'last-li' ); * } * $p->seek( 'resume' ); * $p->release_bookmark( 'last-li' ); * $p->release_bookmark( 'resume' ); * } else { * $in_list = true; * } * } * * if ( 'LI' === $p->get_tag() ) { * $p->set_bookmark( 'last-li' ); * } * } * * Bookmarks intentionally hide the internal string offsets * to which they refer. They are maintained internally as * updates are applied to the HTML document and therefore * retain their "position" - the location to which they * originally pointed. The inability to use bookmarks with * functions like `substr` is therefore intentional to guard * against accidentally breaking the HTML. * * Because bookmarks allocate memory and require processing * for every applied update, they are limited and require * a name. They should not be created with programmatically-made * names, such as "li_{$index}" with some loop. As a general * rule they should only be created with string-literal names * like "start-of-section" or "last-paragraph". * * Bookmarks are a powerful tool to enable complicated behavior. * Consider double-checking that you need this tool if you are * reaching for it, as inappropriate use could lead to broken * HTML structure or unwanted processing overhead. * * @since 6.4.0 * * @param string $bookmark_name Identifies this particular bookmark. * @return bool Whether the bookmark was successfully created. */ public function set_bookmark( $bookmark_name ) { return parent::set_bookmark( "_{$bookmark_name}" ); } /** * Checks whether a bookmark with the given name exists. * * @since 6.5.0 * * @param string $bookmark_name Name to identify a bookmark that potentially exists. * @return bool Whether that bookmark exists. */ public function has_bookmark( $bookmark_name ) { return parent::has_bookmark( "_{$bookmark_name}" ); } /* * HTML Parsing Algorithms */ /** * Closes a P element. * * @since 6.4.0 * * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. * * @see https://html.spec.whatwg.org/#close-a-p-element */ private function close_a_p_element() { $this->generate_implied_end_tags( 'P' ); $this->state->stack_of_open_elements->pop_until( 'P' ); } /** * Closes elements that have implied end tags. * * @since 6.4.0 * * @see https://html.spec.whatwg.org/#generate-implied-end-tags * * @param string|null $except_for_this_element Perform as if this element doesn't exist in the stack of open elements. */ private function generate_implied_end_tags( $except_for_this_element = null ) { $elements_with_implied_end_tags = array( 'DD', 'DT', 'LI', 'P', ); $current_node = $this->state->stack_of_open_elements->current_node(); while ( $current_node && $current_node->node_name !== $except_for_this_element && in_array( $this->state->stack_of_open_elements->current_node(), $elements_with_implied_end_tags, true ) ) { $this->state->stack_of_open_elements->pop(); } } /** * Closes elements that have implied end tags, thoroughly. * * See the HTML specification for an explanation why this is * different from generating end tags in the normal sense. * * @since 6.4.0 * * @see WP_HTML_Processor::generate_implied_end_tags * @see https://html.spec.whatwg.org/#generate-implied-end-tags */ private function generate_implied_end_tags_thoroughly() { $elements_with_implied_end_tags = array( 'DD', 'DT', 'LI', 'P', ); while ( in_array( $this->state->stack_of_open_elements->current_node(), $elements_with_implied_end_tags, true ) ) { $this->state->stack_of_open_elements->pop(); } } /** * Reconstructs the active formatting elements. * * > This has the effect of reopening all the formatting elements that were opened * > in the current body, cell, or caption (whichever is youngest) that haven't * > been explicitly closed. * * @since 6.4.0 * * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. * * @see https://html.spec.whatwg.org/#reconstruct-the-active-formatting-elements * * @return bool Whether any formatting elements needed to be reconstructed. */ private function reconstruct_active_formatting_elements() { /* * > If there are no entries in the list of active formatting elements, then there is nothing * > to reconstruct; stop this algorithm. */ if ( 0 === $this->state->active_formatting_elements->count() ) { return false; } $last_entry = $this->state->active_formatting_elements->current_node(); if ( /* * > If the last (most recently added) entry in the list of active formatting elements is a marker; * > stop this algorithm. */ 'marker' === $last_entry->node_name || /* * > If the last (most recently added) entry in the list of active formatting elements is an * > element that is in the stack of open elements, then there is nothing to reconstruct; * > stop this algorithm. */ $this->state->stack_of_open_elements->contains_node( $last_entry ) ) { return false; } $this->last_error = self::ERROR_UNSUPPORTED; throw new WP_HTML_Unsupported_Exception( 'Cannot reconstruct active formatting elements when advancing and rewinding is required.' ); } /** * Runs the adoption agency algorithm. * * @since 6.4.0 * * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. * * @see https://html.spec.whatwg.org/#adoption-agency-algorithm */ private function run_adoption_agency_algorithm() { $budget = 1000; $subject = $this->get_tag(); $current_node = $this->state->stack_of_open_elements->current_node(); if ( // > If the current node is an HTML element whose tag name is subject $current_node && $subject === $current_node->node_name && // > the current node is not in the list of active formatting elements ! $this->state->active_formatting_elements->contains_node( $current_node ) ) { $this->state->stack_of_open_elements->pop(); return; } $outer_loop_counter = 0; while ( $budget-- > 0 ) { if ( $outer_loop_counter++ >= 8 ) { return; } /* * > Let formatting element be the last element in the list of active formatting elements that: * > - is between the end of the list and the last marker in the list, * > if any, or the start of the list otherwise, * > - and has the tag name subject. */ $formatting_element = null; foreach ( $this->state->active_formatting_elements->walk_up() as $item ) { if ( 'marker' === $item->node_name ) { break; } if ( $subject === $item->node_name ) { $formatting_element = $item; break; } } // > If there is no such element, then return and instead act as described in the "any other end tag" entry above. if ( null === $formatting_element ) { $this->last_error = self::ERROR_UNSUPPORTED; throw new WP_HTML_Unsupported_Exception( 'Cannot run adoption agency when "any other end tag" is required.' ); } // > If formatting element is not in the stack of open elements, then this is a parse error; remove the element from the list, and return. if ( ! $this->state->stack_of_open_elements->contains_node( $formatting_element ) ) { $this->state->active_formatting_elements->remove_node( $formatting_element ); return; } // > If formatting element is in the stack of open elements, but the element is not in scope, then this is a parse error; return. if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $formatting_element->node_name ) ) { return; } /* * > Let furthest block be the topmost node in the stack of open elements that is lower in the stack * > than formatting element, and is an element in the special category. There might not be one. */ $is_above_formatting_element = true; $furthest_block = null; foreach ( $this->state->stack_of_open_elements->walk_down() as $item ) { if ( $is_above_formatting_element && $formatting_element->bookmark_name !== $item->bookmark_name ) { continue; } if ( $is_above_formatting_element ) { $is_above_formatting_element = false; continue; } if ( self::is_special( $item->node_name ) ) { $furthest_block = $item; break; } } /* * > If there is no furthest block, then the UA must first pop all the nodes from the bottom of the * > stack of open elements, from the current node up to and including formatting element, then * > remove formatting element from the list of active formatting elements, and finally return. */ if ( null === $furthest_block ) { foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { $this->state->stack_of_open_elements->pop(); if ( $formatting_element->bookmark_name === $item->bookmark_name ) { $this->state->active_formatting_elements->remove_node( $formatting_element ); return; } } } $this->last_error = self::ERROR_UNSUPPORTED; throw new WP_HTML_Unsupported_Exception( 'Cannot extract common ancestor in adoption agency algorithm.' ); } $this->last_error = self::ERROR_UNSUPPORTED; throw new WP_HTML_Unsupported_Exception( 'Cannot run adoption agency when looping required.' ); } /** * Inserts an HTML element on the stack of open elements. * * @since 6.4.0 * * @see https://html.spec.whatwg.org/#insert-a-foreign-element * * @param WP_HTML_Token $token Name of bookmark pointing to element in original input HTML. */ private function insert_html_element( $token ) { $this->state->stack_of_open_elements->push( $token ); } /* * HTML Specification Helpers */ /** * Returns whether an element of a given name is in the HTML special category. * * @since 6.4.0 * * @see https://html.spec.whatwg.org/#special * * @param string $tag_name Name of element to check. * @return bool Whether the element of the given name is in the special category. */ public static function is_special( $tag_name ) { $tag_name = strtoupper( $tag_name ); return ( 'ADDRESS' === $tag_name || 'APPLET' === $tag_name || 'AREA' === $tag_name || 'ARTICLE' === $tag_name || 'ASIDE' === $tag_name || 'BASE' === $tag_name || 'BASEFONT' === $tag_name || 'BGSOUND' === $tag_name || 'BLOCKQUOTE' === $tag_name || 'BODY' === $tag_name || 'BR' === $tag_name || 'BUTTON' === $tag_name || 'CAPTION' === $tag_name || 'CENTER' === $tag_name || 'COL' === $tag_name || 'COLGROUP' === $tag_name || 'DD' === $tag_name || 'DETAILS' === $tag_name || 'DIR' === $tag_name || 'DIV' === $tag_name || 'DL' === $tag_name || 'DT' === $tag_name || 'EMBED' === $tag_name || 'FIELDSET' === $tag_name || 'FIGCAPTION' === $tag_name || 'FIGURE' === $tag_name || 'FOOTER' === $tag_name || 'FORM' === $tag_name || 'FRAME' === $tag_name || 'FRAMESET' === $tag_name || 'H1' === $tag_name || 'H2' === $tag_name || 'H3' === $tag_name || 'H4' === $tag_name || 'H5' === $tag_name || 'H6' === $tag_name || 'HEAD' === $tag_name || 'HEADER' === $tag_name || 'HGROUP' === $tag_name || 'HR' === $tag_name || 'HTML' === $tag_name || 'IFRAME' === $tag_name || 'IMG' === $tag_name || 'INPUT' === $tag_name || 'KEYGEN' === $tag_name || 'LI' === $tag_name || 'LINK' === $tag_name || 'LISTING' === $tag_name || 'MAIN' === $tag_name || 'MARQUEE' === $tag_name || 'MENU' === $tag_name || 'META' === $tag_name || 'NAV' === $tag_name || 'NOEMBED' === $tag_name || 'NOFRAMES' === $tag_name || 'NOSCRIPT' === $tag_name || 'OBJECT' === $tag_name || 'OL' === $tag_name || 'P' === $tag_name || 'PARAM' === $tag_name || 'PLAINTEXT' === $tag_name || 'PRE' === $tag_name || 'SCRIPT' === $tag_name || 'SEARCH' === $tag_name || 'SECTION' === $tag_name || 'SELECT' === $tag_name || 'SOURCE' === $tag_name || 'STYLE' === $tag_name || 'SUMMARY' === $tag_name || 'TABLE' === $tag_name || 'TBODY' === $tag_name || 'TD' === $tag_name || 'TEMPLATE' === $tag_name || 'TEXTAREA' === $tag_name || 'TFOOT' === $tag_name || 'TH' === $tag_name || 'THEAD' === $tag_name || 'TITLE' === $tag_name || 'TR' === $tag_name || 'TRACK' === $tag_name || 'UL' === $tag_name || 'WBR' === $tag_name || 'XMP' === $tag_name || // MathML. 'MI' === $tag_name || 'MO' === $tag_name || 'MN' === $tag_name || 'MS' === $tag_name || 'MTEXT' === $tag_name || 'ANNOTATION-XML' === $tag_name || // SVG. 'FOREIGNOBJECT' === $tag_name || 'DESC' === $tag_name || 'TITLE' === $tag_name ); } /** * Returns whether a given element is an HTML Void Element * * > area, base, br, col, embed, hr, img, input, link, meta, source, track, wbr * * @since 6.4.0 * * @see https://html.spec.whatwg.org/#void-elements * * @param string $tag_name Name of HTML tag to check. * @return bool Whether the given tag is an HTML Void Element. */ public static function is_void( $tag_name ) { $tag_name = strtoupper( $tag_name ); return ( 'AREA' === $tag_name || 'BASE' === $tag_name || 'BASEFONT' === $tag_name || // Obsolete but still treated as void. 'BGSOUND' === $tag_name || // Obsolete but still treated as void. 'BR' === $tag_name || 'COL' === $tag_name || 'EMBED' === $tag_name || 'FRAME' === $tag_name || 'HR' === $tag_name || 'IMG' === $tag_name || 'INPUT' === $tag_name || 'KEYGEN' === $tag_name || // Obsolete but still treated as void. 'LINK' === $tag_name || 'META' === $tag_name || 'PARAM' === $tag_name || // Obsolete but still treated as void. 'SOURCE' === $tag_name || 'TRACK' === $tag_name || 'WBR' === $tag_name ); } /* * Constants that would pollute the top of the class if they were found there. */ /** * Indicates that the next HTML token should be parsed and processed. * * @since 6.4.0 * * @var string */ const PROCESS_NEXT_NODE = 'process-next-node'; /** * Indicates that the current HTML token should be reprocessed in the newly-selected insertion mode. * * @since 6.4.0 * * @var string */ const REPROCESS_CURRENT_NODE = 'reprocess-current-node'; /** * Indicates that the current HTML token should be processed without advancing the parser. * * @since 6.5.0 * * @var string */ const PROCESS_CURRENT_NODE = 'process-current-node'; /** * Indicates that the parser encountered unsupported markup and has bailed. * * @since 6.4.0 * * @var string */ const ERROR_UNSUPPORTED = 'unsupported'; /** * Indicates that the parser encountered more HTML tokens than it * was able to process and has bailed. * * @since 6.4.0 * * @var string */ const ERROR_EXCEEDED_MAX_BOOKMARKS = 'exceeded-max-bookmarks'; /** * Unlock code that must be passed into the constructor to create this class. * * This class extends the WP_HTML_Tag_Processor, which has a public class * constructor. Therefore, it's not possible to have a private constructor here. * * This unlock code is used to ensure that anyone calling the constructor is * doing so with a full understanding that it's intended to be a private API. * * @access private */ const CONSTRUCTOR_UNLOCK_CODE = 'Use WP_HTML_Processor::create_fragment() instead of calling the class constructor directly.'; } PK +]VYWv� v� class-wp-html-tag-processor.phpnu �[��� <?php /** * HTML API: WP_HTML_Tag_Processor class * * Scans through an HTML document to find specific tags, then * transforms those tags by adding, removing, or updating the * values of the HTML attributes within that tag (opener). * * Does not fully parse HTML or _recurse_ into the HTML structure * Instead this scans linearly through a document and only parses * the HTML tag openers. * * ### Possible future direction for this module * * - Prune the whitespace when removing classes/attributes: e.g. "a b c" -> "c" not " c". * This would increase the size of the changes for some operations but leave more * natural-looking output HTML. * * @package WordPress * @subpackage HTML-API * @since 6.2.0 */ /** * Core class used to modify attributes in an HTML document for tags matching a query. * * ## Usage * * Use of this class requires three steps: * * 1. Create a new class instance with your input HTML document. * 2. Find the tag(s) you are looking for. * 3. Request changes to the attributes in those tag(s). * * Example: * * $tags = new WP_HTML_Tag_Processor( $html ); * if ( $tags->next_tag( 'option' ) ) { * $tags->set_attribute( 'selected', true ); * } * * ### Finding tags * * The `next_tag()` function moves the internal cursor through * your input HTML document until it finds a tag meeting any of * the supplied restrictions in the optional query argument. If * no argument is provided then it will find the next HTML tag, * regardless of what kind it is. * * If you want to _find whatever the next tag is_: * * $tags->next_tag(); * * | Goal | Query | * |-----------------------------------------------------------|---------------------------------------------------------------------------------| * | Find any tag. | `$tags->next_tag();` | * | Find next image tag. | `$tags->next_tag( array( 'tag_name' => 'img' ) );` | * | Find next image tag (without passing the array). | `$tags->next_tag( 'img' );` | * | Find next tag containing the `fullwidth` CSS class. | `$tags->next_tag( array( 'class_name' => 'fullwidth' ) );` | * | Find next image tag containing the `fullwidth` CSS class. | `$tags->next_tag( array( 'tag_name' => 'img', 'class_name' => 'fullwidth' ) );` | * * If a tag was found meeting your criteria then `next_tag()` * will return `true` and you can proceed to modify it. If it * returns `false`, however, it failed to find the tag and * moved the cursor to the end of the file. * * Once the cursor reaches the end of the file the processor * is done and if you want to reach an earlier tag you will * need to recreate the processor and start over, as it's * unable to back up or move in reverse. * * See the section on bookmarks for an exception to this * no-backing-up rule. * * #### Custom queries * * Sometimes it's necessary to further inspect an HTML tag than * the query syntax here permits. In these cases one may further * inspect the search results using the read-only functions * provided by the processor or external state or variables. * * Example: * * // Paint up to the first five DIV or SPAN tags marked with the "jazzy" style. * $remaining_count = 5; * while ( $remaining_count > 0 && $tags->next_tag() ) { * if ( * ( 'DIV' === $tags->get_tag() || 'SPAN' === $tags->get_tag() ) && * 'jazzy' === $tags->get_attribute( 'data-style' ) * ) { * $tags->add_class( 'theme-style-everest-jazz' ); * $remaining_count--; * } * } * * `get_attribute()` will return `null` if the attribute wasn't present * on the tag when it was called. It may return `""` (the empty string) * in cases where the attribute was present but its value was empty. * For boolean attributes, those whose name is present but no value is * given, it will return `true` (the only way to set `false` for an * attribute is to remove it). * * #### When matching fails * * When `next_tag()` returns `false` it could mean different things: * * - The requested tag wasn't found in the input document. * - The input document ended in the middle of an HTML syntax element. * * When a document ends in the middle of a syntax element it will pause * the processor. This is to make it possible in the future to extend the * input document and proceed - an important requirement for chunked * streaming parsing of a document. * * Example: * * $processor = new WP_HTML_Tag_Processor( 'This <div is="a" partial="token' ); * false === $processor->next_tag(); * * If a special element (see next section) is encountered but no closing tag * is found it will count as an incomplete tag. The parser will pause as if * the opening tag were incomplete. * * Example: * * $processor = new WP_HTML_Tag_Processor( '<style>// there could be more styling to come' ); * false === $processor->next_tag(); * * $processor = new WP_HTML_Tag_Processor( '<style>// this is everything</style><div>' ); * true === $processor->next_tag( 'DIV' ); * * #### Special elements * * Some HTML elements are handled in a special way; their start and end tags * act like a void tag. These are special because their contents can't contain * HTML markup. Everything inside these elements is handled in a special way * and content that _appears_ like HTML tags inside of them isn't. There can * be no nesting in these elements. * * In the following list, "raw text" means that all of the content in the HTML * until the matching closing tag is treated verbatim without any replacements * and without any parsing. * * - IFRAME allows no content but requires a closing tag. * - NOEMBED (deprecated) content is raw text. * - NOFRAMES (deprecated) content is raw text. * - SCRIPT content is plaintext apart from legacy rules allowing `</script>` inside an HTML comment. * - STYLE content is raw text. * - TITLE content is plain text but character references are decoded. * - TEXTAREA content is plain text but character references are decoded. * - XMP (deprecated) content is raw text. * * ### Modifying HTML attributes for a found tag * * Once you've found the start of an opening tag you can modify * any number of the attributes on that tag. You can set a new * value for an attribute, remove the entire attribute, or do * nothing and move on to the next opening tag. * * Example: * * if ( $tags->next_tag( array( 'class_name' => 'wp-group-block' ) ) ) { * $tags->set_attribute( 'title', 'This groups the contained content.' ); * $tags->remove_attribute( 'data-test-id' ); * } * * If `set_attribute()` is called for an existing attribute it will * overwrite the existing value. Similarly, calling `remove_attribute()` * for a non-existing attribute has no effect on the document. Both * of these methods are safe to call without knowing if a given attribute * exists beforehand. * * ### Modifying CSS classes for a found tag * * The tag processor treats the `class` attribute as a special case. * Because it's a common operation to add or remove CSS classes, this * interface adds helper methods to make that easier. * * As with attribute values, adding or removing CSS classes is a safe * operation that doesn't require checking if the attribute or class * exists before making changes. If removing the only class then the * entire `class` attribute will be removed. * * Example: * * // from `<span>Yippee!</span>` * // to `<span class="is-active">Yippee!</span>` * $tags->add_class( 'is-active' ); * * // from `<span class="excited">Yippee!</span>` * // to `<span class="excited is-active">Yippee!</span>` * $tags->add_class( 'is-active' ); * * // from `<span class="is-active heavy-accent">Yippee!</span>` * // to `<span class="is-active heavy-accent">Yippee!</span>` * $tags->add_class( 'is-active' ); * * // from `<input type="text" class="is-active rugby not-disabled" length="24">` * // to `<input type="text" class="is-active not-disabled" length="24"> * $tags->remove_class( 'rugby' ); * * // from `<input type="text" class="rugby" length="24">` * // to `<input type="text" length="24"> * $tags->remove_class( 'rugby' ); * * // from `<input type="text" length="24">` * // to `<input type="text" length="24"> * $tags->remove_class( 'rugby' ); * * When class changes are enqueued but a direct change to `class` is made via * `set_attribute` then the changes to `set_attribute` (or `remove_attribute`) * will take precedence over those made through `add_class` and `remove_class`. * * ### Bookmarks * * While scanning through the input HTMl document it's possible to set * a named bookmark when a particular tag is found. Later on, after * continuing to scan other tags, it's possible to `seek` to one of * the set bookmarks and then proceed again from that point forward. * * Because bookmarks create processing overhead one should avoid * creating too many of them. As a rule, create only bookmarks * of known string literal names; avoid creating "mark_{$index}" * and so on. It's fine from a performance standpoint to create a * bookmark and update it frequently, such as within a loop. * * $total_todos = 0; * while ( $p->next_tag( array( 'tag_name' => 'UL', 'class_name' => 'todo' ) ) ) { * $p->set_bookmark( 'list-start' ); * while ( $p->next_tag( array( 'tag_closers' => 'visit' ) ) ) { * if ( 'UL' === $p->get_tag() && $p->is_tag_closer() ) { * $p->set_bookmark( 'list-end' ); * $p->seek( 'list-start' ); * $p->set_attribute( 'data-contained-todos', (string) $total_todos ); * $total_todos = 0; * $p->seek( 'list-end' ); * break; * } * * if ( 'LI' === $p->get_tag() && ! $p->is_tag_closer() ) { * $total_todos++; * } * } * } * * ## Tokens and finer-grained processing. * * It's possible to scan through every lexical token in the * HTML document using the `next_token()` function. This * alternative form takes no argument and provides no built-in * query syntax. * * Example: * * $title = '(untitled)'; * $text = ''; * while ( $processor->next_token() ) { * switch ( $processor->get_token_name() ) { * case '#text': * $text .= $processor->get_modifiable_text(); * break; * * case 'BR': * $text .= "\n"; * break; * * case 'TITLE': * $title = $processor->get_modifiable_text(); * break; * } * } * return trim( "# {$title}\n\n{$text}" ); * * ### Tokens and _modifiable text_. * * #### Special "atomic" HTML elements. * * Not all HTML elements are able to contain other elements inside of them. * For instance, the contents inside a TITLE element are plaintext (except * that character references like & will be decoded). This means that * if the string `<img>` appears inside a TITLE element, then it's not an * image tag, but rather it's text describing an image tag. Likewise, the * contents of a SCRIPT or STYLE element are handled entirely separately in * a browser than the contents of other elements because they represent a * different language than HTML. * * For these elements the Tag Processor treats the entire sequence as one, * from the opening tag, including its contents, through its closing tag. * This means that the it's not possible to match the closing tag for a * SCRIPT element unless it's unexpected; the Tag Processor already matched * it when it found the opening tag. * * The inner contents of these elements are that element's _modifiable text_. * * The special elements are: * - `SCRIPT` whose contents are treated as raw plaintext but supports a legacy * style of including JavaScript inside of HTML comments to avoid accidentally * closing the SCRIPT from inside a JavaScript string. E.g. `console.log( '</script>' )`. * - `TITLE` and `TEXTAREA` whose contents are treated as plaintext and then any * character references are decoded. E.g. `1 < 2 < 3` becomes `1 < 2 < 3`. * - `IFRAME`, `NOSCRIPT`, `NOEMBED`, `NOFRAME`, `STYLE` whose contents are treated as * raw plaintext and left as-is. E.g. `1 < 2 < 3` remains `1 < 2 < 3`. * * #### Other tokens with modifiable text. * * There are also non-elements which are void/self-closing in nature and contain * modifiable text that is part of that individual syntax token itself. * * - `#text` nodes, whose entire token _is_ the modifiable text. * - HTML comments and tokens that become comments due to some syntax error. The * text for these tokens is the portion of the comment inside of the syntax. * E.g. for `<!-- comment -->` the text is `" comment "` (note the spaces are included). * - `CDATA` sections, whose text is the content inside of the section itself. E.g. for * `<![CDATA[some content]]>` the text is `"some content"` (with restrictions [1]). * - "Funky comments," which are a special case of invalid closing tags whose name is * invalid. The text for these nodes is the text that a browser would transform into * an HTML comment when parsing. E.g. for `</%post_author>` the text is `%post_author`. * - `DOCTYPE` declarations like `<DOCTYPE html>` which have no closing tag. * - XML Processing instruction nodes like `<?wp __( "Like" ); ?>` (with restrictions [2]). * - The empty end tag `</>` which is ignored in the browser and DOM. * * [1]: There are no CDATA sections in HTML. When encountering `<![CDATA[`, everything * until the next `>` becomes a bogus HTML comment, meaning there can be no CDATA * section in an HTML document containing `>`. The Tag Processor will first find * all valid and bogus HTML comments, and then if the comment _would_ have been a * CDATA section _were they to exist_, it will indicate this as the type of comment. * * [2]: XML allows a broader range of characters in a processing instruction's target name * and disallows "xml" as a name, since it's special. The Tag Processor only recognizes * target names with an ASCII-representable subset of characters. It also exhibits the * same constraint as with CDATA sections, in that `>` cannot exist within the token * since Processing Instructions do no exist within HTML and their syntax transforms * into a bogus comment in the DOM. * * ## Design and limitations * * The Tag Processor is designed to linearly scan HTML documents and tokenize * HTML tags and their attributes. It's designed to do this as efficiently as * possible without compromising parsing integrity. Therefore it will be * slower than some methods of modifying HTML, such as those incorporating * over-simplified PCRE patterns, but will not introduce the defects and * failures that those methods bring in, which lead to broken page renders * and often to security vulnerabilities. On the other hand, it will be faster * than full-blown HTML parsers such as DOMDocument and use considerably * less memory. It requires a negligible memory overhead, enough to consider * it a zero-overhead system. * * The performance characteristics are maintained by avoiding tree construction * and semantic cleanups which are specified in HTML5. Because of this, for * example, it's not possible for the Tag Processor to associate any given * opening tag with its corresponding closing tag, or to return the inner markup * inside an element. Systems may be built on top of the Tag Processor to do * this, but the Tag Processor is and should be constrained so it can remain an * efficient, low-level, and reliable HTML scanner. * * The Tag Processor's design incorporates a "garbage-in-garbage-out" philosophy. * HTML5 specifies that certain invalid content be transformed into different forms * for display, such as removing null bytes from an input document and replacing * invalid characters with the Unicode replacement character `U+FFFD` (visually "�"). * Where errors or transformations exist within the HTML5 specification, the Tag Processor * leaves those invalid inputs untouched, passing them through to the final browser * to handle. While this implies that certain operations will be non-spec-compliant, * such as reading the value of an attribute with invalid content, it also preserves a * simplicity and efficiency for handling those error cases. * * Most operations within the Tag Processor are designed to minimize the difference * between an input and output document for any given change. For example, the * `add_class` and `remove_class` methods preserve whitespace and the class ordering * within the `class` attribute; and when encountering tags with duplicated attributes, * the Tag Processor will leave those invalid duplicate attributes where they are but * update the proper attribute which the browser will read for parsing its value. An * exception to this rule is that all attribute updates store their values as * double-quoted strings, meaning that attributes on input with single-quoted or * unquoted values will appear in the output with double-quotes. * * ### Scripting Flag * * The Tag Processor parses HTML with the "scripting flag" disabled. This means * that it doesn't run any scripts while parsing the page. In a browser with * JavaScript enabled, for example, the script can change the parse of the * document as it loads. On the server, however, evaluating JavaScript is not * only impractical, but also unwanted. * * Practically this means that the Tag Processor will descend into NOSCRIPT * elements and process its child tags. Were the scripting flag enabled, such * as in a typical browser, the contents of NOSCRIPT are skipped entirely. * * This allows the HTML API to process the content that will be presented in * a browser when scripting is disabled, but it offers a different view of a * page than most browser sessions will experience. E.g. the tags inside the * NOSCRIPT disappear. * * ### Text Encoding * * The Tag Processor assumes that the input HTML document is encoded with a * text encoding compatible with 7-bit ASCII's '<', '>', '&', ';', '/', '=', * "'", '"', 'a' - 'z', 'A' - 'Z', and the whitespace characters ' ', tab, * carriage-return, newline, and form-feed. * * In practice, this includes almost every single-byte encoding as well as * UTF-8. Notably, however, it does not include UTF-16. If providing input * that's incompatible, then convert the encoding beforehand. * * @since 6.2.0 * @since 6.2.1 Fix: Support for various invalid comments; attribute updates are case-insensitive. * @since 6.3.2 Fix: Skip HTML-like content inside rawtext elements such as STYLE. * @since 6.5.0 Pauses processor when input ends in an incomplete syntax token. * Introduces "special" elements which act like void elements, e.g. TITLE, STYLE. * Allows scanning through all tokens and processing modifiable text, where applicable. */ class WP_HTML_Tag_Processor { /** * The maximum number of bookmarks allowed to exist at * any given time. * * @since 6.2.0 * @var int * * @see WP_HTML_Tag_Processor::set_bookmark() */ const MAX_BOOKMARKS = 10; /** * Maximum number of times seek() can be called. * Prevents accidental infinite loops. * * @since 6.2.0 * @var int * * @see WP_HTML_Tag_Processor::seek() */ const MAX_SEEK_OPS = 1000; /** * The HTML document to parse. * * @since 6.2.0 * @var string */ protected $html; /** * The last query passed to next_tag(). * * @since 6.2.0 * @var array|null */ private $last_query; /** * The tag name this processor currently scans for. * * @since 6.2.0 * @var string|null */ private $sought_tag_name; /** * The CSS class name this processor currently scans for. * * @since 6.2.0 * @var string|null */ private $sought_class_name; /** * The match offset this processor currently scans for. * * @since 6.2.0 * @var int|null */ private $sought_match_offset; /** * Whether to visit tag closers, e.g. </div>, when walking an input document. * * @since 6.2.0 * @var bool */ private $stop_on_tag_closers; /** * Specifies mode of operation of the parser at any given time. * * | State | Meaning | * | ----------------|----------------------------------------------------------------------| * | *Ready* | The parser is ready to run. | * | *Complete* | There is nothing left to parse. | * | *Incomplete* | The HTML ended in the middle of a token; nothing more can be parsed. | * | *Matched tag* | Found an HTML tag; it's possible to modify its attributes. | * | *Text node* | Found a #text node; this is plaintext and modifiable. | * | *CDATA node* | Found a CDATA section; this is modifiable. | * | *Comment* | Found a comment or bogus comment; this is modifiable. | * | *Presumptuous* | Found an empty tag closer: `</>`. | * | *Funky comment* | Found a tag closer with an invalid tag name; this is modifiable. | * * @since 6.5.0 * * @see WP_HTML_Tag_Processor::STATE_READY * @see WP_HTML_Tag_Processor::STATE_COMPLETE * @see WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT * @see WP_HTML_Tag_Processor::STATE_MATCHED_TAG * @see WP_HTML_Tag_Processor::STATE_TEXT_NODE * @see WP_HTML_Tag_Processor::STATE_CDATA_NODE * @see WP_HTML_Tag_Processor::STATE_COMMENT * @see WP_HTML_Tag_Processor::STATE_DOCTYPE * @see WP_HTML_Tag_Processor::STATE_PRESUMPTUOUS_TAG * @see WP_HTML_Tag_Processor::STATE_FUNKY_COMMENT * * @var string */ protected $parser_state = self::STATE_READY; /** * What kind of syntax token became an HTML comment. * * Since there are many ways in which HTML syntax can create an HTML comment, * this indicates which of those caused it. This allows the Tag Processor to * represent more from the original input document than would appear in the DOM. * * @since 6.5.0 * * @var string|null */ protected $comment_type = null; /** * How many bytes from the original HTML document have been read and parsed. * * This value points to the latest byte offset in the input document which * has been already parsed. It is the internal cursor for the Tag Processor * and updates while scanning through the HTML tokens. * * @since 6.2.0 * @var int */ private $bytes_already_parsed = 0; /** * Byte offset in input document where current token starts. * * Example: * * <div id="test">... * 01234 * - token starts at 0 * * @since 6.5.0 * * @var int|null */ private $token_starts_at; /** * Byte length of current token. * * Example: * * <div id="test">... * 012345678901234 * - token length is 14 - 0 = 14 * * a <!-- comment --> is a token. * 0123456789 123456789 123456789 * - token length is 17 - 2 = 15 * * @since 6.5.0 * * @var int|null */ private $token_length; /** * Byte offset in input document where current tag name starts. * * Example: * * <div id="test">... * 01234 * - tag name starts at 1 * * @since 6.2.0 * * @var int|null */ private $tag_name_starts_at; /** * Byte length of current tag name. * * Example: * * <div id="test">... * 01234 * --- tag name length is 3 * * @since 6.2.0 * * @var int|null */ private $tag_name_length; /** * Byte offset into input document where current modifiable text starts. * * @since 6.5.0 * * @var int */ private $text_starts_at; /** * Byte length of modifiable text. * * @since 6.5.0 * * @var string */ private $text_length; /** * Whether the current tag is an opening tag, e.g. <div>, or a closing tag, e.g. </div>. * * @var bool */ private $is_closing_tag; /** * Lazily-built index of attributes found within an HTML tag, keyed by the attribute name. * * Example: * * // Supposing the parser is working through this content * // and stops after recognizing the `id` attribute. * // <div id="test-4" class=outline title="data:text/plain;base64=asdk3nk1j3fo8"> * // ^ parsing will continue from this point. * $this->attributes = array( * 'id' => new WP_HTML_Attribute_Token( 'id', 9, 6, 5, 11, false ) * ); * * // When picking up parsing again, or when asking to find the * // `class` attribute we will continue and add to this array. * $this->attributes = array( * 'id' => new WP_HTML_Attribute_Token( 'id', 9, 6, 5, 11, false ), * 'class' => new WP_HTML_Attribute_Token( 'class', 23, 7, 17, 13, false ) * ); * * // Note that only the `class` attribute value is stored in the index. * // That's because it is the only value used by this class at the moment. * * @since 6.2.0 * @var WP_HTML_Attribute_Token[] */ private $attributes = array(); /** * Tracks spans of duplicate attributes on a given tag, used for removing * all copies of an attribute when calling `remove_attribute()`. * * @since 6.3.2 * * @var (WP_HTML_Span[])[]|null */ private $duplicate_attributes = null; /** * Which class names to add or remove from a tag. * * These are tracked separately from attribute updates because they are * semantically distinct, whereas this interface exists for the common * case of adding and removing class names while other attributes are * generally modified as with DOM `setAttribute` calls. * * When modifying an HTML document these will eventually be collapsed * into a single `set_attribute( 'class', $changes )` call. * * Example: * * // Add the `wp-block-group` class, remove the `wp-group` class. * $classname_updates = array( * // Indexed by a comparable class name. * 'wp-block-group' => WP_HTML_Tag_Processor::ADD_CLASS, * 'wp-group' => WP_HTML_Tag_Processor::REMOVE_CLASS * ); * * @since 6.2.0 * @var bool[] */ private $classname_updates = array(); /** * Tracks a semantic location in the original HTML which * shifts with updates as they are applied to the document. * * @since 6.2.0 * @var WP_HTML_Span[] */ protected $bookmarks = array(); const ADD_CLASS = true; const REMOVE_CLASS = false; const SKIP_CLASS = null; /** * Lexical replacements to apply to input HTML document. * * "Lexical" in this class refers to the part of this class which * operates on pure text _as text_ and not as HTML. There's a line * between the public interface, with HTML-semantic methods like * `set_attribute` and `add_class`, and an internal state that tracks * text offsets in the input document. * * When higher-level HTML methods are called, those have to transform their * operations (such as setting an attribute's value) into text diffing * operations (such as replacing the sub-string from indices A to B with * some given new string). These text-diffing operations are the lexical * updates. * * As new higher-level methods are added they need to collapse their * operations into these lower-level lexical updates since that's the * Tag Processor's internal language of change. Any code which creates * these lexical updates must ensure that they do not cross HTML syntax * boundaries, however, so these should never be exposed outside of this * class or any classes which intentionally expand its functionality. * * These are enqueued while editing the document instead of being immediately * applied to avoid processing overhead, string allocations, and string * copies when applying many updates to a single document. * * Example: * * // Replace an attribute stored with a new value, indices * // sourced from the lazily-parsed HTML recognizer. * $start = $attributes['src']->start; * $length = $attributes['src']->length; * $modifications[] = new WP_HTML_Text_Replacement( $start, $length, $new_value ); * * // Correspondingly, something like this will appear in this array. * $lexical_updates = array( * WP_HTML_Text_Replacement( 14, 28, 'https://my-site.my-domain/wp-content/uploads/2014/08/kittens.jpg' ) * ); * * @since 6.2.0 * @var WP_HTML_Text_Replacement[] */ protected $lexical_updates = array(); /** * Tracks and limits `seek()` calls to prevent accidental infinite loops. * * @since 6.2.0 * @var int * * @see WP_HTML_Tag_Processor::seek() */ protected $seek_count = 0; /** * Constructor. * * @since 6.2.0 * * @param string $html HTML to process. */ public function __construct( $html ) { $this->html = $html; } /** * Finds the next tag matching the $query. * * @since 6.2.0 * @since 6.5.0 No longer processes incomplete tokens at end of document; pauses the processor at start of token. * * @param array|string|null $query { * Optional. Which tag name to find, having which class, etc. Default is to find any tag. * * @type string|null $tag_name Which tag to find, or `null` for "any tag." * @type int|null $match_offset Find the Nth tag matching all search criteria. * 1 for "first" tag, 3 for "third," etc. * Defaults to first tag. * @type string|null $class_name Tag must contain this whole class name to match. * @type string|null $tag_closers "visit" or "skip": whether to stop on tag closers, e.g. </div>. * } * @return bool Whether a tag was matched. */ public function next_tag( $query = null ) { $this->parse_query( $query ); $already_found = 0; do { if ( false === $this->next_token() ) { return false; } if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { continue; } if ( $this->matches() ) { ++$already_found; } } while ( $already_found < $this->sought_match_offset ); return true; } /** * Finds the next token in the HTML document. * * An HTML document can be viewed as a stream of tokens, * where tokens are things like HTML tags, HTML comments, * text nodes, etc. This method finds the next token in * the HTML document and returns whether it found one. * * If it starts parsing a token and reaches the end of the * document then it will seek to the start of the last * token and pause, returning `false` to indicate that it * failed to find a complete token. * * Possible token types, based on the HTML specification: * * - an HTML tag, whether opening, closing, or void. * - a text node - the plaintext inside tags. * - an HTML comment. * - a DOCTYPE declaration. * - a processing instruction, e.g. `<?xml version="1.0" ?>`. * * The Tag Processor currently only supports the tag token. * * @since 6.5.0 * * @return bool Whether a token was parsed. */ public function next_token() { return $this->base_class_next_token(); } /** * Internal method which finds the next token in the HTML document. * * This method is a protected internal function which implements the logic for * finding the next token in a document. It exists so that the parser can update * its state without affecting the location of the cursor in the document and * without triggering subclass methods for things like `next_token()`, e.g. when * applying patches before searching for the next token. * * @since 6.5.0 * * @access private * * @return bool Whether a token was parsed. */ private function base_class_next_token() { $was_at = $this->bytes_already_parsed; $this->after_tag(); // Don't proceed if there's nothing more to scan. if ( self::STATE_COMPLETE === $this->parser_state || self::STATE_INCOMPLETE_INPUT === $this->parser_state ) { return false; } /* * The next step in the parsing loop determines the parsing state; * clear it so that state doesn't linger from the previous step. */ $this->parser_state = self::STATE_READY; if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { $this->parser_state = self::STATE_COMPLETE; return false; } // Find the next tag if it exists. if ( false === $this->parse_next_tag() ) { if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) { $this->bytes_already_parsed = $was_at; } return false; } /* * For legacy reasons the rest of this function handles tags and their * attributes. If the processor has reached the end of the document * or if it matched any other token then it should return here to avoid * attempting to process tag-specific syntax. */ if ( self::STATE_INCOMPLETE_INPUT !== $this->parser_state && self::STATE_COMPLETE !== $this->parser_state && self::STATE_MATCHED_TAG !== $this->parser_state ) { return true; } // Parse all of its attributes. while ( $this->parse_next_attribute() ) { continue; } // Ensure that the tag closes before the end of the document. if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state || $this->bytes_already_parsed >= strlen( $this->html ) ) { // Does this appropriately clear state (parsed attributes)? $this->parser_state = self::STATE_INCOMPLETE_INPUT; $this->bytes_already_parsed = $was_at; return false; } $tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed ); if ( false === $tag_ends_at ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; $this->bytes_already_parsed = $was_at; return false; } $this->parser_state = self::STATE_MATCHED_TAG; $this->bytes_already_parsed = $tag_ends_at + 1; $this->token_length = $this->bytes_already_parsed - $this->token_starts_at; /* * For non-DATA sections which might contain text that looks like HTML tags but * isn't, scan with the appropriate alternative mode. Looking at the first letter * of the tag name as a pre-check avoids a string allocation when it's not needed. */ $t = $this->html[ $this->tag_name_starts_at ]; if ( $this->is_closing_tag || ! ( 'i' === $t || 'I' === $t || 'n' === $t || 'N' === $t || 's' === $t || 'S' === $t || 't' === $t || 'T' === $t || 'x' === $t || 'X' === $t ) ) { return true; } $tag_name = $this->get_tag(); /* * Preserve the opening tag pointers, as these will be overwritten * when finding the closing tag. They will be reset after finding * the closing to tag to point to the opening of the special atomic * tag sequence. */ $tag_name_starts_at = $this->tag_name_starts_at; $tag_name_length = $this->tag_name_length; $tag_ends_at = $this->token_starts_at + $this->token_length; $attributes = $this->attributes; $duplicate_attributes = $this->duplicate_attributes; // Find the closing tag if necessary. $found_closer = false; switch ( $tag_name ) { case 'SCRIPT': $found_closer = $this->skip_script_data(); break; case 'TEXTAREA': case 'TITLE': $found_closer = $this->skip_rcdata( $tag_name ); break; /* * In the browser this list would include the NOSCRIPT element, * but the Tag Processor is an environment with the scripting * flag disabled, meaning that it needs to descend into the * NOSCRIPT element to be able to properly process what will be * sent to a browser. * * Note that this rule makes HTML5 syntax incompatible with XML, * because the parsing of this token depends on client application. * The NOSCRIPT element cannot be represented in the XHTML syntax. */ case 'IFRAME': case 'NOEMBED': case 'NOFRAMES': case 'STYLE': case 'XMP': $found_closer = $this->skip_rawtext( $tag_name ); break; // No other tags should be treated in their entirety here. default: return true; } if ( ! $found_closer ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; $this->bytes_already_parsed = $was_at; return false; } /* * The values here look like they reference the opening tag but they reference * the closing tag instead. This is why the opening tag values were stored * above in a variable. It reads confusingly here, but that's because the * functions that skip the contents have moved all the internal cursors past * the inner content of the tag. */ $this->token_starts_at = $was_at; $this->token_length = $this->bytes_already_parsed - $this->token_starts_at; $this->text_starts_at = $tag_ends_at; $this->text_length = $this->tag_name_starts_at - $this->text_starts_at; $this->tag_name_starts_at = $tag_name_starts_at; $this->tag_name_length = $tag_name_length; $this->attributes = $attributes; $this->duplicate_attributes = $duplicate_attributes; return true; } /** * Whether the processor paused because the input HTML document ended * in the middle of a syntax element, such as in the middle of a tag. * * Example: * * $processor = new WP_HTML_Tag_Processor( '<input type="text" value="Th' ); * false === $processor->get_next_tag(); * true === $processor->paused_at_incomplete_token(); * * @since 6.5.0 * * @return bool Whether the parse paused at the start of an incomplete token. */ public function paused_at_incomplete_token() { return self::STATE_INCOMPLETE_INPUT === $this->parser_state; } /** * Generator for a foreach loop to step through each class name for the matched tag. * * This generator function is designed to be used inside a "foreach" loop. * * Example: * * $p = new WP_HTML_Tag_Processor( "<div class='free <egg<\tlang-en'>" ); * $p->next_tag(); * foreach ( $p->class_list() as $class_name ) { * echo "{$class_name} "; * } * // Outputs: "free <egg> lang-en " * * @since 6.4.0 */ public function class_list() { if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { return; } /** @var string $class contains the string value of the class attribute, with character references decoded. */ $class = $this->get_attribute( 'class' ); if ( ! is_string( $class ) ) { return; } $seen = array(); $at = 0; while ( $at < strlen( $class ) ) { // Skip past any initial boundary characters. $at += strspn( $class, " \t\f\r\n", $at ); if ( $at >= strlen( $class ) ) { return; } // Find the byte length until the next boundary. $length = strcspn( $class, " \t\f\r\n", $at ); if ( 0 === $length ) { return; } /* * CSS class names are case-insensitive in the ASCII range. * * @see https://www.w3.org/TR/CSS2/syndata.html#x1 */ $name = strtolower( substr( $class, $at, $length ) ); $at += $length; /* * It's expected that the number of class names for a given tag is relatively small. * Given this, it is probably faster overall to scan an array for a value rather * than to use the class name as a key and check if it's a key of $seen. */ if ( in_array( $name, $seen, true ) ) { continue; } $seen[] = $name; yield $name; } } /** * Returns if a matched tag contains the given ASCII case-insensitive class name. * * @since 6.4.0 * * @param string $wanted_class Look for this CSS class name, ASCII case-insensitive. * @return bool|null Whether the matched tag contains the given class name, or null if not matched. */ public function has_class( $wanted_class ) { if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { return null; } $wanted_class = strtolower( $wanted_class ); foreach ( $this->class_list() as $class_name ) { if ( $class_name === $wanted_class ) { return true; } } return false; } /** * Sets a bookmark in the HTML document. * * Bookmarks represent specific places or tokens in the HTML * document, such as a tag opener or closer. When applying * edits to a document, such as setting an attribute, the * text offsets of that token may shift; the bookmark is * kept updated with those shifts and remains stable unless * the entire span of text in which the token sits is removed. * * Release bookmarks when they are no longer needed. * * Example: * * <main><h2>Surprising fact you may not know!</h2></main> * ^ ^ * \-|-- this `H2` opener bookmark tracks the token * * <main class="clickbait"><h2>Surprising fact you may no… * ^ ^ * \-|-- it shifts with edits * * Bookmarks provide the ability to seek to a previously-scanned * place in the HTML document. This avoids the need to re-scan * the entire document. * * Example: * * <ul><li>One</li><li>Two</li><li>Three</li></ul> * ^^^^ * want to note this last item * * $p = new WP_HTML_Tag_Processor( $html ); * $in_list = false; * while ( $p->next_tag( array( 'tag_closers' => $in_list ? 'visit' : 'skip' ) ) ) { * if ( 'UL' === $p->get_tag() ) { * if ( $p->is_tag_closer() ) { * $in_list = false; * $p->set_bookmark( 'resume' ); * if ( $p->seek( 'last-li' ) ) { * $p->add_class( 'last-li' ); * } * $p->seek( 'resume' ); * $p->release_bookmark( 'last-li' ); * $p->release_bookmark( 'resume' ); * } else { * $in_list = true; * } * } * * if ( 'LI' === $p->get_tag() ) { * $p->set_bookmark( 'last-li' ); * } * } * * Bookmarks intentionally hide the internal string offsets * to which they refer. They are maintained internally as * updates are applied to the HTML document and therefore * retain their "position" - the location to which they * originally pointed. The inability to use bookmarks with * functions like `substr` is therefore intentional to guard * against accidentally breaking the HTML. * * Because bookmarks allocate memory and require processing * for every applied update, they are limited and require * a name. They should not be created with programmatically-made * names, such as "li_{$index}" with some loop. As a general * rule they should only be created with string-literal names * like "start-of-section" or "last-paragraph". * * Bookmarks are a powerful tool to enable complicated behavior. * Consider double-checking that you need this tool if you are * reaching for it, as inappropriate use could lead to broken * HTML structure or unwanted processing overhead. * * @since 6.2.0 * * @param string $name Identifies this particular bookmark. * @return bool Whether the bookmark was successfully created. */ public function set_bookmark( $name ) { // It only makes sense to set a bookmark if the parser has paused on a concrete token. if ( self::STATE_COMPLETE === $this->parser_state || self::STATE_INCOMPLETE_INPUT === $this->parser_state ) { return false; } if ( ! array_key_exists( $name, $this->bookmarks ) && count( $this->bookmarks ) >= static::MAX_BOOKMARKS ) { _doing_it_wrong( __METHOD__, __( 'Too many bookmarks: cannot create any more.' ), '6.2.0' ); return false; } $this->bookmarks[ $name ] = new WP_HTML_Span( $this->token_starts_at, $this->token_length ); return true; } /** * Removes a bookmark that is no longer needed. * * Releasing a bookmark frees up the small * performance overhead it requires. * * @param string $name Name of the bookmark to remove. * @return bool Whether the bookmark already existed before removal. */ public function release_bookmark( $name ) { if ( ! array_key_exists( $name, $this->bookmarks ) ) { return false; } unset( $this->bookmarks[ $name ] ); return true; } /** * Skips contents of generic rawtext elements. * * @since 6.3.2 * * @see https://html.spec.whatwg.org/#generic-raw-text-element-parsing-algorithm * * @param string $tag_name The uppercase tag name which will close the RAWTEXT region. * @return bool Whether an end to the RAWTEXT region was found before the end of the document. */ private function skip_rawtext( $tag_name ) { /* * These two functions distinguish themselves on whether character references are * decoded, and since functionality to read the inner markup isn't supported, it's * not necessary to implement these two functions separately. */ return $this->skip_rcdata( $tag_name ); } /** * Skips contents of RCDATA elements, namely title and textarea tags. * * @since 6.2.0 * * @see https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state * * @param string $tag_name The uppercase tag name which will close the RCDATA region. * @return bool Whether an end to the RCDATA region was found before the end of the document. */ private function skip_rcdata( $tag_name ) { $html = $this->html; $doc_length = strlen( $html ); $tag_length = strlen( $tag_name ); $at = $this->bytes_already_parsed; while ( false !== $at && $at < $doc_length ) { $at = strpos( $this->html, '</', $at ); $this->tag_name_starts_at = $at; // Fail if there is no possible tag closer. if ( false === $at || ( $at + $tag_length ) >= $doc_length ) { return false; } $at += 2; /* * Find a case-insensitive match to the tag name. * * Because tag names are limited to US-ASCII there is no * need to perform any kind of Unicode normalization when * comparing; any character which could be impacted by such * normalization could not be part of a tag name. */ for ( $i = 0; $i < $tag_length; $i++ ) { $tag_char = $tag_name[ $i ]; $html_char = $html[ $at + $i ]; if ( $html_char !== $tag_char && strtoupper( $html_char ) !== $tag_char ) { $at += $i; continue 2; } } $at += $tag_length; $this->bytes_already_parsed = $at; if ( $at >= strlen( $html ) ) { return false; } /* * Ensure that the tag name terminates to avoid matching on * substrings of a longer tag name. For example, the sequence * "</textarearug" should not match for "</textarea" even * though "textarea" is found within the text. */ $c = $html[ $at ]; if ( ' ' !== $c && "\t" !== $c && "\r" !== $c && "\n" !== $c && '/' !== $c && '>' !== $c ) { continue; } while ( $this->parse_next_attribute() ) { continue; } $at = $this->bytes_already_parsed; if ( $at >= strlen( $this->html ) ) { return false; } if ( '>' === $html[ $at ] ) { $this->bytes_already_parsed = $at + 1; return true; } if ( $at + 1 >= strlen( $this->html ) ) { return false; } if ( '/' === $html[ $at ] && '>' === $html[ $at + 1 ] ) { $this->bytes_already_parsed = $at + 2; return true; } } return false; } /** * Skips contents of script tags. * * @since 6.2.0 * * @return bool Whether the script tag was closed before the end of the document. */ private function skip_script_data() { $state = 'unescaped'; $html = $this->html; $doc_length = strlen( $html ); $at = $this->bytes_already_parsed; while ( false !== $at && $at < $doc_length ) { $at += strcspn( $html, '-<', $at ); /* * For all script states a "-->" transitions * back into the normal unescaped script mode, * even if that's the current state. */ if ( $at + 2 < $doc_length && '-' === $html[ $at ] && '-' === $html[ $at + 1 ] && '>' === $html[ $at + 2 ] ) { $at += 3; $state = 'unescaped'; continue; } // Everything of interest past here starts with "<". if ( $at + 1 >= $doc_length || '<' !== $html[ $at++ ] ) { continue; } /* * Unlike with "-->", the "<!--" only transitions * into the escaped mode if not already there. * * Inside the escaped modes it will be ignored; and * should never break out of the double-escaped * mode and back into the escaped mode. * * While this requires a mode change, it does not * impact the parsing otherwise, so continue * parsing after updating the state. */ if ( $at + 2 < $doc_length && '!' === $html[ $at ] && '-' === $html[ $at + 1 ] && '-' === $html[ $at + 2 ] ) { $at += 3; $state = 'unescaped' === $state ? 'escaped' : $state; continue; } if ( '/' === $html[ $at ] ) { $closer_potentially_starts_at = $at - 1; $is_closing = true; ++$at; } else { $is_closing = false; } /* * At this point the only remaining state-changes occur with the * <script> and </script> tags; unless one of these appears next, * proceed scanning to the next potential token in the text. */ if ( ! ( $at + 6 < $doc_length && ( 's' === $html[ $at ] || 'S' === $html[ $at ] ) && ( 'c' === $html[ $at + 1 ] || 'C' === $html[ $at + 1 ] ) && ( 'r' === $html[ $at + 2 ] || 'R' === $html[ $at + 2 ] ) && ( 'i' === $html[ $at + 3 ] || 'I' === $html[ $at + 3 ] ) && ( 'p' === $html[ $at + 4 ] || 'P' === $html[ $at + 4 ] ) && ( 't' === $html[ $at + 5 ] || 'T' === $html[ $at + 5 ] ) ) ) { ++$at; continue; } /* * Ensure that the script tag terminates to avoid matching on * substrings of a non-match. For example, the sequence * "<script123" should not end a script region even though * "<script" is found within the text. */ if ( $at + 6 >= $doc_length ) { continue; } $at += 6; $c = $html[ $at ]; if ( ' ' !== $c && "\t" !== $c && "\r" !== $c && "\n" !== $c && '/' !== $c && '>' !== $c ) { ++$at; continue; } if ( 'escaped' === $state && ! $is_closing ) { $state = 'double-escaped'; continue; } if ( 'double-escaped' === $state && $is_closing ) { $state = 'escaped'; continue; } if ( $is_closing ) { $this->bytes_already_parsed = $closer_potentially_starts_at; $this->tag_name_starts_at = $closer_potentially_starts_at; if ( $this->bytes_already_parsed >= $doc_length ) { return false; } while ( $this->parse_next_attribute() ) { continue; } if ( $this->bytes_already_parsed >= $doc_length ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; return false; } if ( '>' === $html[ $this->bytes_already_parsed ] ) { ++$this->bytes_already_parsed; return true; } } ++$at; } return false; } /** * Parses the next tag. * * This will find and start parsing the next tag, including * the opening `<`, the potential closer `/`, and the tag * name. It does not parse the attributes or scan to the * closing `>`; these are left for other methods. * * @since 6.2.0 * @since 6.2.1 Support abruptly-closed comments, invalid-tag-closer-comments, and empty elements. * * @return bool Whether a tag was found before the end of the document. */ private function parse_next_tag() { $this->after_tag(); $html = $this->html; $doc_length = strlen( $html ); $was_at = $this->bytes_already_parsed; $at = $was_at; while ( false !== $at && $at < $doc_length ) { $at = strpos( $html, '<', $at ); /* * This does not imply an incomplete parse; it indicates that there * can be nothing left in the document other than a #text node. */ if ( false === $at ) { $this->parser_state = self::STATE_TEXT_NODE; $this->token_starts_at = $was_at; $this->token_length = strlen( $html ) - $was_at; $this->text_starts_at = $was_at; $this->text_length = $this->token_length; $this->bytes_already_parsed = strlen( $html ); return true; } if ( $at > $was_at ) { /* * A "<" normally starts a new HTML tag or syntax token, but in cases where the * following character can't produce a valid token, the "<" is instead treated * as plaintext and the parser should skip over it. This avoids a problem when * following earlier practices of typing emoji with text, e.g. "<3". This * should be a heart, not a tag. It's supposed to be rendered, not hidden. * * At this point the parser checks if this is one of those cases and if it is * will continue searching for the next "<" in search of a token boundary. * * @see https://html.spec.whatwg.org/#tag-open-state */ if ( strlen( $html ) > $at + 1 ) { $next_character = $html[ $at + 1 ]; $at_another_node = ( '!' === $next_character || '/' === $next_character || '?' === $next_character || ( 'A' <= $next_character && $next_character <= 'Z' ) || ( 'a' <= $next_character && $next_character <= 'z' ) ); if ( ! $at_another_node ) { ++$at; continue; } } $this->parser_state = self::STATE_TEXT_NODE; $this->token_starts_at = $was_at; $this->token_length = $at - $was_at; $this->text_starts_at = $was_at; $this->text_length = $this->token_length; $this->bytes_already_parsed = $at; return true; } $this->token_starts_at = $at; if ( $at + 1 < $doc_length && '/' === $this->html[ $at + 1 ] ) { $this->is_closing_tag = true; ++$at; } else { $this->is_closing_tag = false; } /* * HTML tag names must start with [a-zA-Z] otherwise they are not tags. * For example, "<3" is rendered as text, not a tag opener. If at least * one letter follows the "<" then _it is_ a tag, but if the following * character is anything else it _is not a tag_. * * It's not uncommon to find non-tags starting with `<` in an HTML * document, so it's good for performance to make this pre-check before * continuing to attempt to parse a tag name. * * Reference: * * https://html.spec.whatwg.org/multipage/parsing.html#data-state * * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state */ $tag_name_prefix_length = strspn( $html, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1 ); if ( $tag_name_prefix_length > 0 ) { ++$at; $this->parser_state = self::STATE_MATCHED_TAG; $this->tag_name_starts_at = $at; $this->tag_name_length = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length ); $this->bytes_already_parsed = $at + $this->tag_name_length; return true; } /* * Abort if no tag is found before the end of * the document. There is nothing left to parse. */ if ( $at + 1 >= $doc_length ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; return false; } /* * `<!` transitions to markup declaration open state * https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state */ if ( ! $this->is_closing_tag && '!' === $html[ $at + 1 ] ) { /* * `<!--` transitions to a comment state – apply further comment rules. * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state */ if ( $doc_length > $at + 3 && '-' === $html[ $at + 2 ] && '-' === $html[ $at + 3 ] ) { $closer_at = $at + 4; // If it's not possible to close the comment then there is nothing more to scan. if ( $doc_length <= $closer_at ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; return false; } // Abruptly-closed empty comments are a sequence of dashes followed by `>`. $span_of_dashes = strspn( $html, '-', $closer_at ); if ( '>' === $html[ $closer_at + $span_of_dashes ] ) { /* * @todo When implementing `set_modifiable_text()` ensure that updates to this token * don't break the syntax for short comments, e.g. `<!--->`. Unlike other comment * and bogus comment syntax, these leave no clear insertion point for text and * they need to be modified specially in order to contain text. E.g. to store * `?` as the modifiable text, the `<!--->` needs to become `<!--?-->`, which * involves inserting an additional `-` into the token after the modifiable text. */ $this->parser_state = self::STATE_COMMENT; $this->comment_type = self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT; $this->token_length = $closer_at + $span_of_dashes + 1 - $this->token_starts_at; // Only provide modifiable text if the token is long enough to contain it. if ( $span_of_dashes >= 2 ) { $this->comment_type = self::COMMENT_AS_HTML_COMMENT; $this->text_starts_at = $this->token_starts_at + 4; $this->text_length = $span_of_dashes - 2; } $this->bytes_already_parsed = $closer_at + $span_of_dashes + 1; return true; } /* * Comments may be closed by either a --> or an invalid --!>. * The first occurrence closes the comment. * * See https://html.spec.whatwg.org/#parse-error-incorrectly-closed-comment */ --$closer_at; // Pre-increment inside condition below reduces risk of accidental infinite looping. while ( ++$closer_at < $doc_length ) { $closer_at = strpos( $html, '--', $closer_at ); if ( false === $closer_at ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; return false; } if ( $closer_at + 2 < $doc_length && '>' === $html[ $closer_at + 2 ] ) { $this->parser_state = self::STATE_COMMENT; $this->comment_type = self::COMMENT_AS_HTML_COMMENT; $this->token_length = $closer_at + 3 - $this->token_starts_at; $this->text_starts_at = $this->token_starts_at + 4; $this->text_length = $closer_at - $this->text_starts_at; $this->bytes_already_parsed = $closer_at + 3; return true; } if ( $closer_at + 3 < $doc_length && '!' === $html[ $closer_at + 2 ] && '>' === $html[ $closer_at + 3 ] ) { $this->parser_state = self::STATE_COMMENT; $this->comment_type = self::COMMENT_AS_HTML_COMMENT; $this->token_length = $closer_at + 4 - $this->token_starts_at; $this->text_starts_at = $this->token_starts_at + 4; $this->text_length = $closer_at - $this->text_starts_at; $this->bytes_already_parsed = $closer_at + 4; return true; } } } /* * `<!DOCTYPE` transitions to DOCTYPE state – skip to the nearest > * These are ASCII-case-insensitive. * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state */ if ( $doc_length > $at + 8 && ( 'D' === $html[ $at + 2 ] || 'd' === $html[ $at + 2 ] ) && ( 'O' === $html[ $at + 3 ] || 'o' === $html[ $at + 3 ] ) && ( 'C' === $html[ $at + 4 ] || 'c' === $html[ $at + 4 ] ) && ( 'T' === $html[ $at + 5 ] || 't' === $html[ $at + 5 ] ) && ( 'Y' === $html[ $at + 6 ] || 'y' === $html[ $at + 6 ] ) && ( 'P' === $html[ $at + 7 ] || 'p' === $html[ $at + 7 ] ) && ( 'E' === $html[ $at + 8 ] || 'e' === $html[ $at + 8 ] ) ) { $closer_at = strpos( $html, '>', $at + 9 ); if ( false === $closer_at ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; return false; } $this->parser_state = self::STATE_DOCTYPE; $this->token_length = $closer_at + 1 - $this->token_starts_at; $this->text_starts_at = $this->token_starts_at + 9; $this->text_length = $closer_at - $this->text_starts_at; $this->bytes_already_parsed = $closer_at + 1; return true; } /* * Anything else here is an incorrectly-opened comment and transitions * to the bogus comment state - skip to the nearest >. If no closer is * found then the HTML was truncated inside the markup declaration. */ $closer_at = strpos( $html, '>', $at + 1 ); if ( false === $closer_at ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; return false; } $this->parser_state = self::STATE_COMMENT; $this->comment_type = self::COMMENT_AS_INVALID_HTML; $this->token_length = $closer_at + 1 - $this->token_starts_at; $this->text_starts_at = $this->token_starts_at + 2; $this->text_length = $closer_at - $this->text_starts_at; $this->bytes_already_parsed = $closer_at + 1; /* * Identify nodes that would be CDATA if HTML had CDATA sections. * * This section must occur after identifying the bogus comment end * because in an HTML parser it will span to the nearest `>`, even * if there's no `]]>` as would be required in an XML document. It * is therefore not possible to parse a CDATA section containing * a `>` in the HTML syntax. * * Inside foreign elements there is a discrepancy between browsers * and the specification on this. * * @todo Track whether the Tag Processor is inside a foreign element * and require the proper closing `]]>` in those cases. */ if ( $this->token_length >= 10 && '[' === $html[ $this->token_starts_at + 2 ] && 'C' === $html[ $this->token_starts_at + 3 ] && 'D' === $html[ $this->token_starts_at + 4 ] && 'A' === $html[ $this->token_starts_at + 5 ] && 'T' === $html[ $this->token_starts_at + 6 ] && 'A' === $html[ $this->token_starts_at + 7 ] && '[' === $html[ $this->token_starts_at + 8 ] && ']' === $html[ $closer_at - 1 ] && ']' === $html[ $closer_at - 2 ] ) { $this->parser_state = self::STATE_COMMENT; $this->comment_type = self::COMMENT_AS_CDATA_LOOKALIKE; $this->text_starts_at += 7; $this->text_length -= 9; } return true; } /* * </> is a missing end tag name, which is ignored. * * This was also known as the "presumptuous empty tag" * in early discussions as it was proposed to close * the nearest previous opening tag. * * See https://html.spec.whatwg.org/#parse-error-missing-end-tag-name */ if ( '>' === $html[ $at + 1 ] ) { // `<>` is interpreted as plaintext. if ( ! $this->is_closing_tag ) { ++$at; continue; } $this->parser_state = self::STATE_PRESUMPTUOUS_TAG; $this->token_length = $at + 2 - $this->token_starts_at; $this->bytes_already_parsed = $at + 2; return true; } /* * `<?` transitions to a bogus comment state – skip to the nearest > * See https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state */ if ( ! $this->is_closing_tag && '?' === $html[ $at + 1 ] ) { $closer_at = strpos( $html, '>', $at + 2 ); if ( false === $closer_at ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; return false; } $this->parser_state = self::STATE_COMMENT; $this->comment_type = self::COMMENT_AS_INVALID_HTML; $this->token_length = $closer_at + 1 - $this->token_starts_at; $this->text_starts_at = $this->token_starts_at + 2; $this->text_length = $closer_at - $this->text_starts_at; $this->bytes_already_parsed = $closer_at + 1; /* * Identify a Processing Instruction node were HTML to have them. * * This section must occur after identifying the bogus comment end * because in an HTML parser it will span to the nearest `>`, even * if there's no `?>` as would be required in an XML document. It * is therefore not possible to parse a Processing Instruction node * containing a `>` in the HTML syntax. * * XML allows for more target names, but this code only identifies * those with ASCII-representable target names. This means that it * may identify some Processing Instruction nodes as bogus comments, * but it will not misinterpret the HTML structure. By limiting the * identification to these target names the Tag Processor can avoid * the need to start parsing UTF-8 sequences. * * > NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | * [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | * [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | * [#x10000-#xEFFFF] * > NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] * * @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget */ if ( $this->token_length >= 5 && '?' === $html[ $closer_at - 1 ] ) { $comment_text = substr( $html, $this->token_starts_at + 2, $this->token_length - 4 ); $pi_target_length = strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ:_' ); if ( 0 < $pi_target_length ) { $pi_target_length += strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:_-.', $pi_target_length ); $this->comment_type = self::COMMENT_AS_PI_NODE_LOOKALIKE; $this->tag_name_starts_at = $this->token_starts_at + 2; $this->tag_name_length = $pi_target_length; $this->text_starts_at += $pi_target_length; $this->text_length -= $pi_target_length + 1; } } return true; } /* * If a non-alpha starts the tag name in a tag closer it's a comment. * Find the first `>`, which closes the comment. * * This parser classifies these particular comments as special "funky comments" * which are made available for further processing. * * See https://html.spec.whatwg.org/#parse-error-invalid-first-character-of-tag-name */ if ( $this->is_closing_tag ) { // No chance of finding a closer. if ( $at + 3 > $doc_length ) { return false; } $closer_at = strpos( $html, '>', $at + 2 ); if ( false === $closer_at ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; return false; } $this->parser_state = self::STATE_FUNKY_COMMENT; $this->token_length = $closer_at + 1 - $this->token_starts_at; $this->text_starts_at = $this->token_starts_at + 2; $this->text_length = $closer_at - $this->text_starts_at; $this->bytes_already_parsed = $closer_at + 1; return true; } ++$at; } return false; } /** * Parses the next attribute. * * @since 6.2.0 * * @return bool Whether an attribute was found before the end of the document. */ private function parse_next_attribute() { // Skip whitespace and slashes. $this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n/", $this->bytes_already_parsed ); if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; return false; } /* * Treat the equal sign as a part of the attribute * name if it is the first encountered byte. * * @see https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state */ $name_length = '=' === $this->html[ $this->bytes_already_parsed ] ? 1 + strcspn( $this->html, "=/> \t\f\r\n", $this->bytes_already_parsed + 1 ) : strcspn( $this->html, "=/> \t\f\r\n", $this->bytes_already_parsed ); // No attribute, just tag closer. if ( 0 === $name_length || $this->bytes_already_parsed + $name_length >= strlen( $this->html ) ) { return false; } $attribute_start = $this->bytes_already_parsed; $attribute_name = substr( $this->html, $attribute_start, $name_length ); $this->bytes_already_parsed += $name_length; if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; return false; } $this->skip_whitespace(); if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; return false; } $has_value = '=' === $this->html[ $this->bytes_already_parsed ]; if ( $has_value ) { ++$this->bytes_already_parsed; $this->skip_whitespace(); if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; return false; } switch ( $this->html[ $this->bytes_already_parsed ] ) { case "'": case '"': $quote = $this->html[ $this->bytes_already_parsed ]; $value_start = $this->bytes_already_parsed + 1; $value_length = strcspn( $this->html, $quote, $value_start ); $attribute_end = $value_start + $value_length + 1; $this->bytes_already_parsed = $attribute_end; break; default: $value_start = $this->bytes_already_parsed; $value_length = strcspn( $this->html, "> \t\f\r\n", $value_start ); $attribute_end = $value_start + $value_length; $this->bytes_already_parsed = $attribute_end; } } else { $value_start = $this->bytes_already_parsed; $value_length = 0; $attribute_end = $attribute_start + $name_length; } if ( $attribute_end >= strlen( $this->html ) ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; return false; } if ( $this->is_closing_tag ) { return true; } /* * > There must never be two or more attributes on * > the same start tag whose names are an ASCII * > case-insensitive match for each other. * - HTML 5 spec * * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive */ $comparable_name = strtolower( $attribute_name ); // If an attribute is listed many times, only use the first declaration and ignore the rest. if ( ! array_key_exists( $comparable_name, $this->attributes ) ) { $this->attributes[ $comparable_name ] = new WP_HTML_Attribute_Token( $attribute_name, $value_start, $value_length, $attribute_start, $attribute_end - $attribute_start, ! $has_value ); return true; } /* * Track the duplicate attributes so if we remove it, all disappear together. * * While `$this->duplicated_attributes` could always be stored as an `array()`, * which would simplify the logic here, storing a `null` and only allocating * an array when encountering duplicates avoids needless allocations in the * normative case of parsing tags with no duplicate attributes. */ $duplicate_span = new WP_HTML_Span( $attribute_start, $attribute_end - $attribute_start ); if ( null === $this->duplicate_attributes ) { $this->duplicate_attributes = array( $comparable_name => array( $duplicate_span ) ); } elseif ( ! array_key_exists( $comparable_name, $this->duplicate_attributes ) ) { $this->duplicate_attributes[ $comparable_name ] = array( $duplicate_span ); } else { $this->duplicate_attributes[ $comparable_name ][] = $duplicate_span; } return true; } /** * Move the internal cursor past any immediate successive whitespace. * * @since 6.2.0 */ private function skip_whitespace() { $this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n", $this->bytes_already_parsed ); } /** * Applies attribute updates and cleans up once a tag is fully parsed. * * @since 6.2.0 */ private function after_tag() { /* * There could be lexical updates enqueued for an attribute that * also exists on the next tag. In order to avoid conflating the * attributes across the two tags, lexical updates with names * need to be flushed to raw lexical updates. */ $this->class_name_updates_to_attributes_updates(); /* * Purge updates if there are too many. The actual count isn't * scientific, but a few values from 100 to a few thousand were * tests to find a practically-useful limit. * * If the update queue grows too big, then the Tag Processor * will spend more time iterating through them and lose the * efficiency gains of deferring applying them. */ if ( 1000 < count( $this->lexical_updates ) ) { $this->get_updated_html(); } foreach ( $this->lexical_updates as $name => $update ) { /* * Any updates appearing after the cursor should be applied * before proceeding, otherwise they may be overlooked. */ if ( $update->start >= $this->bytes_already_parsed ) { $this->get_updated_html(); break; } if ( is_int( $name ) ) { continue; } $this->lexical_updates[] = $update; unset( $this->lexical_updates[ $name ] ); } $this->token_starts_at = null; $this->token_length = null; $this->tag_name_starts_at = null; $this->tag_name_length = null; $this->text_starts_at = 0; $this->text_length = 0; $this->is_closing_tag = null; $this->attributes = array(); $this->comment_type = null; $this->duplicate_attributes = null; } /** * Converts class name updates into tag attributes updates * (they are accumulated in different data formats for performance). * * @since 6.2.0 * * @see WP_HTML_Tag_Processor::$lexical_updates * @see WP_HTML_Tag_Processor::$classname_updates */ private function class_name_updates_to_attributes_updates() { if ( count( $this->classname_updates ) === 0 ) { return; } $existing_class = $this->get_enqueued_attribute_value( 'class' ); if ( null === $existing_class || true === $existing_class ) { $existing_class = ''; } if ( false === $existing_class && isset( $this->attributes['class'] ) ) { $existing_class = substr( $this->html, $this->attributes['class']->value_starts_at, $this->attributes['class']->value_length ); } if ( false === $existing_class ) { $existing_class = ''; } /** * Updated "class" attribute value. * * This is incrementally built while scanning through the existing class * attribute, skipping removed classes on the way, and then appending * added classes at the end. Only when finished processing will the * value contain the final new value. * @var string $class */ $class = ''; /** * Tracks the cursor position in the existing * class attribute value while parsing. * * @var int $at */ $at = 0; /** * Indicates if there's any need to modify the existing class attribute. * * If a call to `add_class()` and `remove_class()` wouldn't impact * the `class` attribute value then there's no need to rebuild it. * For example, when adding a class that's already present or * removing one that isn't. * * This flag enables a performance optimization when none of the enqueued * class updates would impact the `class` attribute; namely, that the * processor can continue without modifying the input document, as if * none of the `add_class()` or `remove_class()` calls had been made. * * This flag is set upon the first change that requires a string update. * * @var bool $modified */ $modified = false; // Remove unwanted classes by only copying the new ones. $existing_class_length = strlen( $existing_class ); while ( $at < $existing_class_length ) { // Skip to the first non-whitespace character. $ws_at = $at; $ws_length = strspn( $existing_class, " \t\f\r\n", $ws_at ); $at += $ws_length; // Capture the class name – it's everything until the next whitespace. $name_length = strcspn( $existing_class, " \t\f\r\n", $at ); if ( 0 === $name_length ) { // If no more class names are found then that's the end. break; } $name = substr( $existing_class, $at, $name_length ); $at += $name_length; // If this class is marked for removal, start processing the next one. $remove_class = ( isset( $this->classname_updates[ $name ] ) && self::REMOVE_CLASS === $this->classname_updates[ $name ] ); // If a class has already been seen then skip it; it should not be added twice. if ( ! $remove_class ) { $this->classname_updates[ $name ] = self::SKIP_CLASS; } if ( $remove_class ) { $modified = true; continue; } /* * Otherwise, append it to the new "class" attribute value. * * There are options for handling whitespace between tags. * Preserving the existing whitespace produces fewer changes * to the HTML content and should clarify the before/after * content when debugging the modified output. * * This approach contrasts normalizing the inter-class * whitespace to a single space, which might appear cleaner * in the output HTML but produce a noisier change. */ $class .= substr( $existing_class, $ws_at, $ws_length ); $class .= $name; } // Add new classes by appending those which haven't already been seen. foreach ( $this->classname_updates as $name => $operation ) { if ( self::ADD_CLASS === $operation ) { $modified = true; $class .= strlen( $class ) > 0 ? ' ' : ''; $class .= $name; } } $this->classname_updates = array(); if ( ! $modified ) { return; } if ( strlen( $class ) > 0 ) { $this->set_attribute( 'class', $class ); } else { $this->remove_attribute( 'class' ); } } /** * Applies attribute updates to HTML document. * * @since 6.2.0 * @since 6.2.1 Accumulates shift for internal cursor and passed pointer. * @since 6.3.0 Invalidate any bookmarks whose targets are overwritten. * * @param int $shift_this_point Accumulate and return shift for this position. * @return int How many bytes the given pointer moved in response to the updates. */ private function apply_attributes_updates( $shift_this_point ) { if ( ! count( $this->lexical_updates ) ) { return 0; } $accumulated_shift_for_given_point = 0; /* * Attribute updates can be enqueued in any order but updates * to the document must occur in lexical order; that is, each * replacement must be made before all others which follow it * at later string indices in the input document. * * Sorting avoid making out-of-order replacements which * can lead to mangled output, partially-duplicated * attributes, and overwritten attributes. */ usort( $this->lexical_updates, array( self::class, 'sort_start_ascending' ) ); $bytes_already_copied = 0; $output_buffer = ''; foreach ( $this->lexical_updates as $diff ) { $shift = strlen( $diff->text ) - $diff->length; // Adjust the cursor position by however much an update affects it. if ( $diff->start < $this->bytes_already_parsed ) { $this->bytes_already_parsed += $shift; } // Accumulate shift of the given pointer within this function call. if ( $diff->start <= $shift_this_point ) { $accumulated_shift_for_given_point += $shift; } $output_buffer .= substr( $this->html, $bytes_already_copied, $diff->start - $bytes_already_copied ); $output_buffer .= $diff->text; $bytes_already_copied = $diff->start + $diff->length; } $this->html = $output_buffer . substr( $this->html, $bytes_already_copied ); /* * Adjust bookmark locations to account for how the text * replacements adjust offsets in the input document. */ foreach ( $this->bookmarks as $bookmark_name => $bookmark ) { $bookmark_end = $bookmark->start + $bookmark->length; /* * Each lexical update which appears before the bookmark's endpoints * might shift the offsets for those endpoints. Loop through each change * and accumulate the total shift for each bookmark, then apply that * shift after tallying the full delta. */ $head_delta = 0; $tail_delta = 0; foreach ( $this->lexical_updates as $diff ) { $diff_end = $diff->start + $diff->length; if ( $bookmark->start < $diff->start && $bookmark_end < $diff->start ) { break; } if ( $bookmark->start >= $diff->start && $bookmark_end < $diff_end ) { $this->release_bookmark( $bookmark_name ); continue 2; } $delta = strlen( $diff->text ) - $diff->length; if ( $bookmark->start >= $diff->start ) { $head_delta += $delta; } if ( $bookmark_end >= $diff_end ) { $tail_delta += $delta; } } $bookmark->start += $head_delta; $bookmark->length += $tail_delta - $head_delta; } $this->lexical_updates = array(); return $accumulated_shift_for_given_point; } /** * Checks whether a bookmark with the given name exists. * * @since 6.3.0 * * @param string $bookmark_name Name to identify a bookmark that potentially exists. * @return bool Whether that bookmark exists. */ public function has_bookmark( $bookmark_name ) { return array_key_exists( $bookmark_name, $this->bookmarks ); } /** * Move the internal cursor in the Tag Processor to a given bookmark's location. * * In order to prevent accidental infinite loops, there's a * maximum limit on the number of times seek() can be called. * * @since 6.2.0 * * @param string $bookmark_name Jump to the place in the document identified by this bookmark name. * @return bool Whether the internal cursor was successfully moved to the bookmark's location. */ public function seek( $bookmark_name ) { if ( ! array_key_exists( $bookmark_name, $this->bookmarks ) ) { _doing_it_wrong( __METHOD__, __( 'Unknown bookmark name.' ), '6.2.0' ); return false; } if ( ++$this->seek_count > static::MAX_SEEK_OPS ) { _doing_it_wrong( __METHOD__, __( 'Too many calls to seek() - this can lead to performance issues.' ), '6.2.0' ); return false; } // Flush out any pending updates to the document. $this->get_updated_html(); // Point this tag processor before the sought tag opener and consume it. $this->bytes_already_parsed = $this->bookmarks[ $bookmark_name ]->start; $this->parser_state = self::STATE_READY; return $this->next_token(); } /** * Compare two WP_HTML_Text_Replacement objects. * * @since 6.2.0 * * @param WP_HTML_Text_Replacement $a First attribute update. * @param WP_HTML_Text_Replacement $b Second attribute update. * @return int Comparison value for string order. */ private static function sort_start_ascending( $a, $b ) { $by_start = $a->start - $b->start; if ( 0 !== $by_start ) { return $by_start; } $by_text = isset( $a->text, $b->text ) ? strcmp( $a->text, $b->text ) : 0; if ( 0 !== $by_text ) { return $by_text; } /* * This code should be unreachable, because it implies the two replacements * start at the same location and contain the same text. */ return $a->length - $b->length; } /** * Return the enqueued value for a given attribute, if one exists. * * Enqueued updates can take different data types: * - If an update is enqueued and is boolean, the return will be `true` * - If an update is otherwise enqueued, the return will be the string value of that update. * - If an attribute is enqueued to be removed, the return will be `null` to indicate that. * - If no updates are enqueued, the return will be `false` to differentiate from "removed." * * @since 6.2.0 * * @param string $comparable_name The attribute name in its comparable form. * @return string|boolean|null Value of enqueued update if present, otherwise false. */ private function get_enqueued_attribute_value( $comparable_name ) { if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { return false; } if ( ! isset( $this->lexical_updates[ $comparable_name ] ) ) { return false; } $enqueued_text = $this->lexical_updates[ $comparable_name ]->text; // Removed attributes erase the entire span. if ( '' === $enqueued_text ) { return null; } /* * Boolean attribute updates are just the attribute name without a corresponding value. * * This value might differ from the given comparable name in that there could be leading * or trailing whitespace, and that the casing follows the name given in `set_attribute`. * * Example: * * $p->set_attribute( 'data-TEST-id', 'update' ); * 'update' === $p->get_enqueued_attribute_value( 'data-test-id' ); * * Detect this difference based on the absence of the `=`, which _must_ exist in any * attribute containing a value, e.g. `<input type="text" enabled />`. * ¹ ² * 1. Attribute with a string value. * 2. Boolean attribute whose value is `true`. */ $equals_at = strpos( $enqueued_text, '=' ); if ( false === $equals_at ) { return true; } /* * Finally, a normal update's value will appear after the `=` and * be double-quoted, as performed incidentally by `set_attribute`. * * e.g. `type="text"` * ¹² ³ * 1. Equals is here. * 2. Double-quoting starts one after the equals sign. * 3. Double-quoting ends at the last character in the update. */ $enqueued_value = substr( $enqueued_text, $equals_at + 2, -1 ); return WP_HTML_Decoder::decode_attribute( $enqueued_value ); } /** * Returns the value of a requested attribute from a matched tag opener if that attribute exists. * * Example: * * $p = new WP_HTML_Tag_Processor( '<div enabled class="test" data-test-id="14">Test</div>' ); * $p->next_tag( array( 'class_name' => 'test' ) ) === true; * $p->get_attribute( 'data-test-id' ) === '14'; * $p->get_attribute( 'enabled' ) === true; * $p->get_attribute( 'aria-label' ) === null; * * $p->next_tag() === false; * $p->get_attribute( 'class' ) === null; * * @since 6.2.0 * * @param string $name Name of attribute whose value is requested. * @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`. */ public function get_attribute( $name ) { if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { return null; } $comparable = strtolower( $name ); /* * For every attribute other than `class` it's possible to perform a quick check if * there's an enqueued lexical update whose value takes priority over what's found in * the input document. * * The `class` attribute is special though because of the exposed helpers `add_class` * and `remove_class`. These form a builder for the `class` attribute, so an additional * check for enqueued class changes is required in addition to the check for any enqueued * attribute values. If any exist, those enqueued class changes must first be flushed out * into an attribute value update. */ if ( 'class' === $name ) { $this->class_name_updates_to_attributes_updates(); } // Return any enqueued attribute value updates if they exist. $enqueued_value = $this->get_enqueued_attribute_value( $comparable ); if ( false !== $enqueued_value ) { return $enqueued_value; } if ( ! isset( $this->attributes[ $comparable ] ) ) { return null; } $attribute = $this->attributes[ $comparable ]; /* * This flag distinguishes an attribute with no value * from an attribute with an empty string value. For * unquoted attributes this could look very similar. * It refers to whether an `=` follows the name. * * e.g. <div boolean-attribute empty-attribute=></div> * ¹ ² * 1. Attribute `boolean-attribute` is `true`. * 2. Attribute `empty-attribute` is `""`. */ if ( true === $attribute->is_true ) { return true; } $raw_value = substr( $this->html, $attribute->value_starts_at, $attribute->value_length ); return WP_HTML_Decoder::decode_attribute( $raw_value ); } /** * Gets lowercase names of all attributes matching a given prefix in the current tag. * * Note that matching is case-insensitive. This is in accordance with the spec: * * > There must never be two or more attributes on * > the same start tag whose names are an ASCII * > case-insensitive match for each other. * - HTML 5 spec * * Example: * * $p = new WP_HTML_Tag_Processor( '<div data-ENABLED class="test" DATA-test-id="14">Test</div>' ); * $p->next_tag( array( 'class_name' => 'test' ) ) === true; * $p->get_attribute_names_with_prefix( 'data-' ) === array( 'data-enabled', 'data-test-id' ); * * $p->next_tag() === false; * $p->get_attribute_names_with_prefix( 'data-' ) === null; * * @since 6.2.0 * * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive * * @param string $prefix Prefix of requested attribute names. * @return array|null List of attribute names, or `null` when no tag opener is matched. */ public function get_attribute_names_with_prefix( $prefix ) { if ( self::STATE_MATCHED_TAG !== $this->parser_state || $this->is_closing_tag ) { return null; } $comparable = strtolower( $prefix ); $matches = array(); foreach ( array_keys( $this->attributes ) as $attr_name ) { if ( str_starts_with( $attr_name, $comparable ) ) { $matches[] = $attr_name; } } return $matches; } /** * Returns the uppercase name of the matched tag. * * Example: * * $p = new WP_HTML_Tag_Processor( '<div class="test">Test</div>' ); * $p->next_tag() === true; * $p->get_tag() === 'DIV'; * * $p->next_tag() === false; * $p->get_tag() === null; * * @since 6.2.0 * * @return string|null Name of currently matched tag in input HTML, or `null` if none found. */ public function get_tag() { if ( null === $this->tag_name_starts_at ) { return null; } $tag_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length ); if ( self::STATE_MATCHED_TAG === $this->parser_state ) { return strtoupper( $tag_name ); } if ( self::STATE_COMMENT === $this->parser_state && self::COMMENT_AS_PI_NODE_LOOKALIKE === $this->get_comment_type() ) { return $tag_name; } return null; } /** * Indicates if the currently matched tag contains the self-closing flag. * * No HTML elements ought to have the self-closing flag and for those, the self-closing * flag will be ignored. For void elements this is benign because they "self close" * automatically. For non-void HTML elements though problems will appear if someone * intends to use a self-closing element in place of that element with an empty body. * For HTML foreign elements and custom elements the self-closing flag determines if * they self-close or not. * * This function does not determine if a tag is self-closing, * but only if the self-closing flag is present in the syntax. * * @since 6.3.0 * * @return bool Whether the currently matched tag contains the self-closing flag. */ public function has_self_closing_flag() { if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { return false; } /* * The self-closing flag is the solidus at the _end_ of the tag, not the beginning. * * Example: * * <figure /> * ^ this appears one character before the end of the closing ">". */ return '/' === $this->html[ $this->token_starts_at + $this->token_length - 2 ]; } /** * Indicates if the current tag token is a tag closer. * * Example: * * $p = new WP_HTML_Tag_Processor( '<div></div>' ); * $p->next_tag( array( 'tag_name' => 'div', 'tag_closers' => 'visit' ) ); * $p->is_tag_closer() === false; * * $p->next_tag( array( 'tag_name' => 'div', 'tag_closers' => 'visit' ) ); * $p->is_tag_closer() === true; * * @since 6.2.0 * * @return bool Whether the current tag is a tag closer. */ public function is_tag_closer() { return ( self::STATE_MATCHED_TAG === $this->parser_state && $this->is_closing_tag ); } /** * Indicates the kind of matched token, if any. * * This differs from `get_token_name()` in that it always * returns a static string indicating the type, whereas * `get_token_name()` may return values derived from the * token itself, such as a tag name or processing * instruction tag. * * Possible values: * - `#tag` when matched on a tag. * - `#text` when matched on a text node. * - `#cdata-section` when matched on a CDATA node. * - `#comment` when matched on a comment. * - `#doctype` when matched on a DOCTYPE declaration. * - `#presumptuous-tag` when matched on an empty tag closer. * - `#funky-comment` when matched on a funky comment. * * @since 6.5.0 * * @return string|null What kind of token is matched, or null. */ public function get_token_type() { switch ( $this->parser_state ) { case self::STATE_MATCHED_TAG: return '#tag'; case self::STATE_DOCTYPE: return '#doctype'; default: return $this->get_token_name(); } } /** * Returns the node name represented by the token. * * This matches the DOM API value `nodeName`. Some values * are static, such as `#text` for a text node, while others * are dynamically generated from the token itself. * * Dynamic names: * - Uppercase tag name for tag matches. * - `html` for DOCTYPE declarations. * * Note that if the Tag Processor is not matched on a token * then this function will return `null`, either because it * hasn't yet found a token or because it reached the end * of the document without matching a token. * * @since 6.5.0 * * @return string|null Name of the matched token. */ public function get_token_name() { switch ( $this->parser_state ) { case self::STATE_MATCHED_TAG: return $this->get_tag(); case self::STATE_TEXT_NODE: return '#text'; case self::STATE_CDATA_NODE: return '#cdata-section'; case self::STATE_COMMENT: return '#comment'; case self::STATE_DOCTYPE: return 'html'; case self::STATE_PRESUMPTUOUS_TAG: return '#presumptuous-tag'; case self::STATE_FUNKY_COMMENT: return '#funky-comment'; } return null; } /** * Indicates what kind of comment produced the comment node. * * Because there are different kinds of HTML syntax which produce * comments, the Tag Processor tracks and exposes this as a type * for the comment. Nominally only regular HTML comments exist as * they are commonly known, but a number of unrelated syntax errors * also produce comments. * * @see self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT * @see self::COMMENT_AS_CDATA_LOOKALIKE * @see self::COMMENT_AS_INVALID_HTML * @see self::COMMENT_AS_HTML_COMMENT * @see self::COMMENT_AS_PI_NODE_LOOKALIKE * * @since 6.5.0 * * @return string|null */ public function get_comment_type() { if ( self::STATE_COMMENT !== $this->parser_state ) { return null; } return $this->comment_type; } /** * Returns the modifiable text for a matched token, or an empty string. * * Modifiable text is text content that may be read and changed without * changing the HTML structure of the document around it. This includes * the contents of `#text` nodes in the HTML as well as the inner * contents of HTML comments, Processing Instructions, and others, even * though these nodes aren't part of a parsed DOM tree. They also contain * the contents of SCRIPT and STYLE tags, of TEXTAREA tags, and of any * other section in an HTML document which cannot contain HTML markup (DATA). * * If a token has no modifiable text then an empty string is returned to * avoid needless crashing or type errors. An empty string does not mean * that a token has modifiable text, and a token with modifiable text may * have an empty string (e.g. a comment with no contents). * * @since 6.5.0 * * @return string */ public function get_modifiable_text() { if ( null === $this->text_starts_at ) { return ''; } $text = substr( $this->html, $this->text_starts_at, $this->text_length ); // Comment data is not decoded. if ( self::STATE_CDATA_NODE === $this->parser_state || self::STATE_COMMENT === $this->parser_state || self::STATE_DOCTYPE === $this->parser_state || self::STATE_FUNKY_COMMENT === $this->parser_state ) { return $text; } $tag_name = $this->get_tag(); if ( // Script data is not decoded. 'SCRIPT' === $tag_name || // RAWTEXT data is not decoded. 'IFRAME' === $tag_name || 'NOEMBED' === $tag_name || 'NOFRAMES' === $tag_name || 'STYLE' === $tag_name || 'XMP' === $tag_name ) { return $text; } $decoded = WP_HTML_Decoder::decode_text_node( $text ); /* * TEXTAREA skips a leading newline, but this newline may appear not only as the * literal character `\n`, but also as a character reference, such as in the * following markup: `<textarea>
Content</textarea>`. * * For these cases it's important to first decode the text content before checking * for a leading newline and removing it. */ if ( self::STATE_MATCHED_TAG === $this->parser_state && 'TEXTAREA' === $tag_name && strlen( $decoded ) > 0 && "\n" === $decoded[0] ) { return substr( $decoded, 1 ); } return $decoded; } /** * Updates or creates a new attribute on the currently matched tag with the passed value. * * For boolean attributes special handling is provided: * - When `true` is passed as the value, then only the attribute name is added to the tag. * - When `false` is passed, the attribute gets removed if it existed before. * * For string attributes, the value is escaped using the `esc_attr` function. * * @since 6.2.0 * @since 6.2.1 Fix: Only create a single update for multiple calls with case-variant attribute names. * * @param string $name The attribute name to target. * @param string|bool $value The new attribute value. * @return bool Whether an attribute value was set. */ public function set_attribute( $name, $value ) { if ( self::STATE_MATCHED_TAG !== $this->parser_state || $this->is_closing_tag ) { return false; } /* * WordPress rejects more characters than are strictly forbidden * in HTML5. This is to prevent additional security risks deeper * in the WordPress and plugin stack. Specifically the * less-than (<) greater-than (>) and ampersand (&) aren't allowed. * * The use of a PCRE match enables looking for specific Unicode * code points without writing a UTF-8 decoder. Whereas scanning * for one-byte characters is trivial (with `strcspn`), scanning * for the longer byte sequences would be more complicated. Given * that this shouldn't be in the hot path for execution, it's a * reasonable compromise in efficiency without introducing a * noticeable impact on the overall system. * * @see https://html.spec.whatwg.org/#attributes-2 * * @todo As the only regex pattern maybe we should take it out? * Are Unicode patterns available broadly in Core? */ if ( preg_match( '~[' . // Syntax-like characters. '"\'>&</ =' . // Control characters. '\x{00}-\x{1F}' . // HTML noncharacters. '\x{FDD0}-\x{FDEF}' . '\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}' . '\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}' . '\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}' . '\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}' . '\x{10FFFE}\x{10FFFF}' . ']~Ssu', $name ) ) { _doing_it_wrong( __METHOD__, __( 'Invalid attribute name.' ), '6.2.0' ); return false; } /* * > The values "true" and "false" are not allowed on boolean attributes. * > To represent a false value, the attribute has to be omitted altogether. * - HTML5 spec, https://html.spec.whatwg.org/#boolean-attributes */ if ( false === $value ) { return $this->remove_attribute( $name ); } if ( true === $value ) { $updated_attribute = $name; } else { $comparable_name = strtolower( $name ); /* * Escape URL attributes. * * @see https://html.spec.whatwg.org/#attributes-3 */ $escaped_new_value = in_array( $comparable_name, wp_kses_uri_attributes() ) ? esc_url( $value ) : esc_attr( $value ); // If the escaping functions wiped out the update, reject it and indicate it was rejected. if ( '' === $escaped_new_value && '' !== $value ) { return false; } $updated_attribute = "{$name}=\"{$escaped_new_value}\""; } /* * > There must never be two or more attributes on * > the same start tag whose names are an ASCII * > case-insensitive match for each other. * - HTML 5 spec * * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive */ $comparable_name = strtolower( $name ); if ( isset( $this->attributes[ $comparable_name ] ) ) { /* * Update an existing attribute. * * Example – set attribute id to "new" in <div id="initial_id" />: * * <div id="initial_id"/> * ^-------------^ * start end * replacement: `id="new"` * * Result: <div id="new"/> */ $existing_attribute = $this->attributes[ $comparable_name ]; $this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement( $existing_attribute->start, $existing_attribute->length, $updated_attribute ); } else { /* * Create a new attribute at the tag's name end. * * Example – add attribute id="new" to <div />: * * <div/> * ^ * start and end * replacement: ` id="new"` * * Result: <div id="new"/> */ $this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement( $this->tag_name_starts_at + $this->tag_name_length, 0, ' ' . $updated_attribute ); } /* * Any calls to update the `class` attribute directly should wipe out any * enqueued class changes from `add_class` and `remove_class`. */ if ( 'class' === $comparable_name && ! empty( $this->classname_updates ) ) { $this->classname_updates = array(); } return true; } /** * Remove an attribute from the currently-matched tag. * * @since 6.2.0 * * @param string $name The attribute name to remove. * @return bool Whether an attribute was removed. */ public function remove_attribute( $name ) { if ( self::STATE_MATCHED_TAG !== $this->parser_state || $this->is_closing_tag ) { return false; } /* * > There must never be two or more attributes on * > the same start tag whose names are an ASCII * > case-insensitive match for each other. * - HTML 5 spec * * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive */ $name = strtolower( $name ); /* * Any calls to update the `class` attribute directly should wipe out any * enqueued class changes from `add_class` and `remove_class`. */ if ( 'class' === $name && count( $this->classname_updates ) !== 0 ) { $this->classname_updates = array(); } /* * If updating an attribute that didn't exist in the input * document, then remove the enqueued update and move on. * * For example, this might occur when calling `remove_attribute()` * after calling `set_attribute()` for the same attribute * and when that attribute wasn't originally present. */ if ( ! isset( $this->attributes[ $name ] ) ) { if ( isset( $this->lexical_updates[ $name ] ) ) { unset( $this->lexical_updates[ $name ] ); } return false; } /* * Removes an existing tag attribute. * * Example – remove the attribute id from <div id="main"/>: * <div id="initial_id"/> * ^-------------^ * start end * replacement: `` * * Result: <div /> */ $this->lexical_updates[ $name ] = new WP_HTML_Text_Replacement( $this->attributes[ $name ]->start, $this->attributes[ $name ]->length, '' ); // Removes any duplicated attributes if they were also present. if ( null !== $this->duplicate_attributes && array_key_exists( $name, $this->duplicate_attributes ) ) { foreach ( $this->duplicate_attributes[ $name ] as $attribute_token ) { $this->lexical_updates[] = new WP_HTML_Text_Replacement( $attribute_token->start, $attribute_token->length, '' ); } } return true; } /** * Adds a new class name to the currently matched tag. * * @since 6.2.0 * * @param string $class_name The class name to add. * @return bool Whether the class was set to be added. */ public function add_class( $class_name ) { if ( self::STATE_MATCHED_TAG !== $this->parser_state || $this->is_closing_tag ) { return false; } $this->classname_updates[ $class_name ] = self::ADD_CLASS; return true; } /** * Removes a class name from the currently matched tag. * * @since 6.2.0 * * @param string $class_name The class name to remove. * @return bool Whether the class was set to be removed. */ public function remove_class( $class_name ) { if ( self::STATE_MATCHED_TAG !== $this->parser_state || $this->is_closing_tag ) { return false; } if ( null !== $this->tag_name_starts_at ) { $this->classname_updates[ $class_name ] = self::REMOVE_CLASS; } return true; } /** * Returns the string representation of the HTML Tag Processor. * * @since 6.2.0 * * @see WP_HTML_Tag_Processor::get_updated_html() * * @return string The processed HTML. */ public function __toString() { return $this->get_updated_html(); } /** * Returns the string representation of the HTML Tag Processor. * * @since 6.2.0 * @since 6.2.1 Shifts the internal cursor corresponding to the applied updates. * @since 6.4.0 No longer calls subclass method `next_tag()` after updating HTML. * * @return string The processed HTML. */ public function get_updated_html() { $requires_no_updating = 0 === count( $this->classname_updates ) && 0 === count( $this->lexical_updates ); /* * When there is nothing more to update and nothing has already been * updated, return the original document and avoid a string copy. */ if ( $requires_no_updating ) { return $this->html; } /* * Keep track of the position right before the current tag. This will * be necessary for reparsing the current tag after updating the HTML. */ $before_current_tag = $this->token_starts_at ?? 0; /* * 1. Apply the enqueued edits and update all the pointers to reflect those changes. */ $this->class_name_updates_to_attributes_updates(); $before_current_tag += $this->apply_attributes_updates( $before_current_tag ); /* * 2. Rewind to before the current tag and reparse to get updated attributes. * * At this point the internal cursor points to the end of the tag name. * Rewind before the tag name starts so that it's as if the cursor didn't * move; a call to `next_tag()` will reparse the recently-updated attributes * and additional calls to modify the attributes will apply at this same * location, but in order to avoid issues with subclasses that might add * behaviors to `next_tag()`, the internal methods should be called here * instead. * * It's important to note that in this specific place there will be no change * because the processor was already at a tag when this was called and it's * rewinding only to the beginning of this very tag before reprocessing it * and its attributes. * * <p>Previous HTML<em>More HTML</em></p> * ↑ │ back up by the length of the tag name plus the opening < * └←─┘ back up by strlen("em") + 1 ==> 3 */ $this->bytes_already_parsed = $before_current_tag; $this->base_class_next_token(); return $this->html; } /** * Parses tag query input into internal search criteria. * * @since 6.2.0 * * @param array|string|null $query { * Optional. Which tag name to find, having which class, etc. Default is to find any tag. * * @type string|null $tag_name Which tag to find, or `null` for "any tag." * @type int|null $match_offset Find the Nth tag matching all search criteria. * 1 for "first" tag, 3 for "third," etc. * Defaults to first tag. * @type string|null $class_name Tag must contain this class name to match. * @type string $tag_closers "visit" or "skip": whether to stop on tag closers, e.g. </div>. * } */ private function parse_query( $query ) { if ( null !== $query && $query === $this->last_query ) { return; } $this->last_query = $query; $this->sought_tag_name = null; $this->sought_class_name = null; $this->sought_match_offset = 1; $this->stop_on_tag_closers = false; // A single string value means "find the tag of this name". if ( is_string( $query ) ) { $this->sought_tag_name = $query; return; } // An empty query parameter applies no restrictions on the search. if ( null === $query ) { return; } // If not using the string interface, an associative array is required. if ( ! is_array( $query ) ) { _doing_it_wrong( __METHOD__, __( 'The query argument must be an array or a tag name.' ), '6.2.0' ); return; } if ( isset( $query['tag_name'] ) && is_string( $query['tag_name'] ) ) { $this->sought_tag_name = $query['tag_name']; } if ( isset( $query['class_name'] ) && is_string( $query['class_name'] ) ) { $this->sought_class_name = $query['class_name']; } if ( isset( $query['match_offset'] ) && is_int( $query['match_offset'] ) && 0 < $query['match_offset'] ) { $this->sought_match_offset = $query['match_offset']; } if ( isset( $query['tag_closers'] ) ) { $this->stop_on_tag_closers = 'visit' === $query['tag_closers']; } } /** * Checks whether a given tag and its attributes match the search criteria. * * @since 6.2.0 * * @return bool Whether the given tag and its attribute match the search criteria. */ private function matches() { if ( $this->is_closing_tag && ! $this->stop_on_tag_closers ) { return false; } // Does the tag name match the requested tag name in a case-insensitive manner? if ( null !== $this->sought_tag_name ) { /* * String (byte) length lookup is fast. If they aren't the * same length then they can't be the same string values. */ if ( strlen( $this->sought_tag_name ) !== $this->tag_name_length ) { return false; } /* * Check each character to determine if they are the same. * Defer calls to `strtoupper()` to avoid them when possible. * Calling `strcasecmp()` here tested slowed than comparing each * character, so unless benchmarks show otherwise, it should * not be used. * * It's expected that most of the time that this runs, a * lower-case tag name will be supplied and the input will * contain lower-case tag names, thus normally bypassing * the case comparison code. */ for ( $i = 0; $i < $this->tag_name_length; $i++ ) { $html_char = $this->html[ $this->tag_name_starts_at + $i ]; $tag_char = $this->sought_tag_name[ $i ]; if ( $html_char !== $tag_char && strtoupper( $html_char ) !== $tag_char ) { return false; } } } if ( null !== $this->sought_class_name && ! $this->has_class( $this->sought_class_name ) ) { return false; } return true; } /** * Parser Ready State. * * Indicates that the parser is ready to run and waiting for a state transition. * It may not have started yet, or it may have just finished parsing a token and * is ready to find the next one. * * @since 6.5.0 * * @access private */ const STATE_READY = 'STATE_READY'; /** * Parser Complete State. * * Indicates that the parser has reached the end of the document and there is * nothing left to scan. It finished parsing the last token completely. * * @since 6.5.0 * * @access private */ const STATE_COMPLETE = 'STATE_COMPLETE'; /** * Parser Incomplete Input State. * * Indicates that the parser has reached the end of the document before finishing * a token. It started parsing a token but there is a possibility that the input * HTML document was truncated in the middle of a token. * * The parser is reset at the start of the incomplete token and has paused. There * is nothing more than can be scanned unless provided a more complete document. * * @since 6.5.0 * * @access private */ const STATE_INCOMPLETE_INPUT = 'STATE_INCOMPLETE_INPUT'; /** * Parser Matched Tag State. * * Indicates that the parser has found an HTML tag and it's possible to get * the tag name and read or modify its attributes (if it's not a closing tag). * * @since 6.5.0 * * @access private */ const STATE_MATCHED_TAG = 'STATE_MATCHED_TAG'; /** * Parser Text Node State. * * Indicates that the parser has found a text node and it's possible * to read and modify that text. * * @since 6.5.0 * * @access private */ const STATE_TEXT_NODE = 'STATE_TEXT_NODE'; /** * Parser CDATA Node State. * * Indicates that the parser has found a CDATA node and it's possible * to read and modify its modifiable text. Note that in HTML there are * no CDATA nodes outside of foreign content (SVG and MathML). Outside * of foreign content, they are treated as HTML comments. * * @since 6.5.0 * * @access private */ const STATE_CDATA_NODE = 'STATE_CDATA_NODE'; /** * Indicates that the parser has found an HTML comment and it's * possible to read and modify its modifiable text. * * @since 6.5.0 * * @access private */ const STATE_COMMENT = 'STATE_COMMENT'; /** * Indicates that the parser has found a DOCTYPE node and it's * possible to read and modify its modifiable text. * * @since 6.5.0 * * @access private */ const STATE_DOCTYPE = 'STATE_DOCTYPE'; /** * Indicates that the parser has found an empty tag closer `</>`. * * Note that in HTML there are no empty tag closers, and they * are ignored. Nonetheless, the Tag Processor still * recognizes them as they appear in the HTML stream. * * These were historically discussed as a "presumptuous tag * closer," which would close the nearest open tag, but were * dismissed in favor of explicitly-closing tags. * * @since 6.5.0 * * @access private */ const STATE_PRESUMPTUOUS_TAG = 'STATE_PRESUMPTUOUS_TAG'; /** * Indicates that the parser has found a "funky comment" * and it's possible to read and modify its modifiable text. * * Example: * * </%url> * </{"wp-bit":"query/post-author"}> * </2> * * Funky comments are tag closers with invalid tag names. Note * that in HTML these are turn into bogus comments. Nonetheless, * the Tag Processor recognizes them in a stream of HTML and * exposes them for inspection and modification. * * @since 6.5.0 * * @access private */ const STATE_FUNKY_COMMENT = 'STATE_WP_FUNKY'; /** * Indicates that a comment was created when encountering abruptly-closed HTML comment. * * Example: * * <!--> * <!---> * * @since 6.5.0 */ const COMMENT_AS_ABRUPTLY_CLOSED_COMMENT = 'COMMENT_AS_ABRUPTLY_CLOSED_COMMENT'; /** * Indicates that a comment would be parsed as a CDATA node, * were HTML to allow CDATA nodes outside of foreign content. * * Example: * * <![CDATA[This is a CDATA node.]]> * * This is an HTML comment, but it looks like a CDATA node. * * @since 6.5.0 */ const COMMENT_AS_CDATA_LOOKALIKE = 'COMMENT_AS_CDATA_LOOKALIKE'; /** * Indicates that a comment was created when encountering * normative HTML comment syntax. * * Example: * * <!-- this is a comment --> * * @since 6.5.0 */ const COMMENT_AS_HTML_COMMENT = 'COMMENT_AS_HTML_COMMENT'; /** * Indicates that a comment would be parsed as a Processing * Instruction node, were they to exist within HTML. * * Example: * * <?wp __( 'Like' ) ?> * * This is an HTML comment, but it looks like a CDATA node. * * @since 6.5.0 */ const COMMENT_AS_PI_NODE_LOOKALIKE = 'COMMENT_AS_PI_NODE_LOOKALIKE'; /** * Indicates that a comment was created when encountering invalid * HTML input, a so-called "bogus comment." * * Example: * * <?nothing special> * <!{nothing special}> * * @since 6.5.0 */ const COMMENT_AS_INVALID_HTML = 'COMMENT_AS_INVALID_HTML'; } PK +]VYP�&D D '