diff --git a/e2e/fixtures/wxr-base-url-rewriting.xml b/e2e/fixtures/wxr-base-url-rewriting.xml index 765924d4..4b48fcb4 100644 --- a/e2e/fixtures/wxr-base-url-rewriting.xml +++ b/e2e/fixtures/wxr-base-url-rewriting.xml @@ -69,7 +69,7 @@

- + ]]> diff --git a/e2e/import-wxr.spec.js b/e2e/import-wxr.spec.js index d458d044..8037cd1e 100644 --- a/e2e/import-wxr.spec.js +++ b/e2e/import-wxr.spec.js @@ -8,7 +8,7 @@ const http = require('http'); const fs = require('fs'); // Define available parsers -const PARSERS = ['simplexml', 'xml', 'regex', 'xmlprocessor']; +const PARSERS = process.env.PARSER ? [process.env.PARSER] : ['simplexml', 'xml', 'regex', 'xmlprocessor']; let PLAYGROUND_URL = ''; // Run tests for each parser PARSERS.forEach((parser) => { @@ -116,7 +116,7 @@ https://playground.internal/path-not-taken was the second best choice.

- + `; expect(normalizeBlockMarkup(normalized.rawContent)).toContain( diff --git a/src/php-toolkit/DataLiberation/BlockMarkup/class-blockmarkupprocessor.php b/src/php-toolkit/DataLiberation/BlockMarkup/class-blockmarkupprocessor.php index d65d236b..9e0f9267 100644 --- a/src/php-toolkit/DataLiberation/BlockMarkup/class-blockmarkupprocessor.php +++ b/src/php-toolkit/DataLiberation/BlockMarkup/class-blockmarkupprocessor.php @@ -527,6 +527,14 @@ public function next_block_attribute() { return isset( $this->block_attribute_paths[ $this->block_attribute_index ] ); } + protected function get_block_attribute_path() { + if ( null === $this->block_attribute_paths || ! isset( $this->block_attribute_paths[ $this->block_attribute_index ] ) ) { + return false; + } + + return $this->block_attribute_paths[ $this->block_attribute_index ]; + } + /** * Gets the key of the currently matched block attribute. * diff --git a/src/php-toolkit/DataLiberation/BlockMarkup/class-blockmarkupurlprocessor.php b/src/php-toolkit/DataLiberation/BlockMarkup/class-blockmarkupurlprocessor.php index cedb23c5..4ea5c778 100644 --- a/src/php-toolkit/DataLiberation/BlockMarkup/class-blockmarkupurlprocessor.php +++ b/src/php-toolkit/DataLiberation/BlockMarkup/class-blockmarkupurlprocessor.php @@ -132,7 +132,7 @@ private function next_url_in_text_node() { private function next_url_attribute() { $tag = $this->get_tag(); - if ( ! array_key_exists( $tag, self::URL_ATTRIBUTES ) ) { + if ( ! array_key_exists( $tag, self::HTML_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM ) ) { return false; } @@ -142,7 +142,7 @@ private function next_url_attribute() { * for the current token. The last element is the attribute we'll * inspect in the while() loop below. */ - $this->inspecting_html_attributes = self::URL_ATTRIBUTES[ $tag ]; + $this->inspecting_html_attributes = self::HTML_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM[ $tag ]; } else { /** * Forget the attribute we've inspected on the previous call to @@ -184,22 +184,75 @@ private function next_url_attribute() { private function next_url_block_attribute() { while ( $this->next_block_attribute() ) { $url_maybe = $this->get_block_attribute_value(); - /* - * Do not use base URL for block attributes. to avoid false positives. - * When a base URL is present, any word is a valid URL relative to the - * base URL. - * When a base URL is missing, the string must start with a protocol to - * be considered a URL. + if ( ! is_string( $url_maybe ) || + count( $this->get_block_attribute_path() ) > 1 + ) { + // @TODO: support arrays, objects, and other non-string data structures. + continue; + } + + /** + * Decide whether the current block attribute holds a URL. + * + * Known URL attributes can be assumed to hold a URL and be + * parsed with the base URL. For example, a "/about-us" value + * in a wp:navigation-link block's `url` attribute is a + * relative URL to the `/about-us` page. + * + * Other attributes may or may not contain URLs, but we cannot assume + * they do. A value `/about-us` could be a relative URL or a class name. + * In those cases, we'll let go of relative URLs and only detect + * absolute URLs to avoid treating every string as a URL. This requires + * parsing without a base URL. + */ + $is_relative_url_block_attribute = ( + isset( self::BLOCK_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM[ $this->get_block_name() ] ) && + in_array( $this->get_block_attribute_key(), self::BLOCK_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM[ $this->get_block_name() ], true ) + ); + + /** + * Filters whether a block attribute is known to contain a relative URL. + * + * This filter allows extending the list of block attributes that are + * recognized as containing URLs. When a block attribute is marked as + * a known URL attribute, it will be parsed with the base URL, allowing + * relative URLs to be properly resolved. + * + * @since 6.8.0 + * + * @param bool $is_relative_url_block_attribute Whether the block attribute is known to contain a relative URL. + * @param array $context { + * Context information about the block attribute. + * + * @type string $block_name The name of the block (e.g., 'wp:image', 'wp:button'). + * @type string $attribute_name The name of the attribute (e.g., 'url', 'href'). + * } */ - if ( is_string( $url_maybe ) ) { + $is_relative_url_block_attribute = apply_filters( + 'url_processor_is_relative_url_block_attribute', + $is_relative_url_block_attribute, + array( + 'block_name' => $this->get_block_name(), + 'attribute_name' => $this->get_block_attribute_key(), + ) + ); + + $parsed_url = false; + if ( $is_relative_url_block_attribute ) { + // Known relative URL attribute – let's parse with the base URL. + $parsed_url = WPURL::parse( $url_maybe, $this->base_url_string ); + } else { + // Other attributes – let's parse without a base URL (and only detect absolute URLs). $parsed_url = WPURL::parse( $url_maybe ); - if ( false !== $parsed_url ) { - $this->raw_url = $url_maybe; - $this->parsed_url = $parsed_url; + } - return true; - } + if ( false === $parsed_url ) { + continue; } + + $this->raw_url = $url_maybe; + $this->parsed_url = $parsed_url; + return true; } return false; @@ -362,6 +415,26 @@ public function get_inspected_attribute_name() { return $this->inspecting_html_attributes[ count( $this->inspecting_html_attributes ) - 1 ]; } + /** + * A list of block attributes that are known to contain URLs. + * + * It covers WordPress core blocks as of WordPress version 6.9. It can be + * extended by plugins and themes via the "url_processor_is_relative_url_block_attribute" + * filter. + * + * @var array + */ + public const BLOCK_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM = array( + 'wp:button' => array( 'url', 'linkTarget' ), + 'wp:cover' => array( 'url' ), + 'wp:embed' => array( 'url' ), + 'wp:gallery' => array( 'url', 'fullUrl' ), + 'wp:image' => array( 'url', 'src', 'href' ), + 'wp:media-text' => array( 'mediaUrl', 'href' ), + 'wp:navigation-link' => array( 'url' ), + 'wp:navigation-submenu' => array( 'url' ), + 'wp:rss' => array( 'feedURL' ), + ); /** * A list of HTML attributes meant to contain URLs, as defined in the HTML specification. @@ -370,7 +443,7 @@ public function get_inspected_attribute_name() { * See https://html.spec.whatwg.org/multipage/indices.html#attributes-1. * See https://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value. */ - public const URL_ATTRIBUTES = array( + public const HTML_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM = array( 'A' => array( 'href' ), 'APPLET' => array( 'codebase', 'archive' ), 'AREA' => array( 'href' ), @@ -405,7 +478,7 @@ public function get_inspected_attribute_name() { * @TODO: Either explicitly support these attributes, or explicitly drop support for * handling their subsyntax. A generic URL matcher might be good enough. */ - public const URL_ATTRIBUTES_WITH_SUBSYNTAX = array( + public const HTML_ATTRIBUTES_WITH_SUBSYNTAX_TO_ACCEPT_RELATIVE_URLS_FROM = array( '*' => array( 'style' ), // background(), background-image(). 'APPLET' => array( 'archive' ), 'IMG' => array( 'srcset' ), @@ -425,7 +498,7 @@ public function get_inspected_attribute_name() { * @TODO: Either explicitly support these tags, or explicitly drop support for * handling their subsyntax. A generic URL matcher might be good enough. */ - public const URL_CONTAINING_TAGS_WITH_SUBSYNTAX = array( + public const HTML_TAGS_WITH_SUBSYNTAX_TO_ACCEPT_RELATIVE_URLS_FROM = array( 'STYLE', 'SCRIPT', );