diff --git a/e2e/fixtures/wxr-base-url-rewriting.xml b/e2e/fixtures/wxr-base-url-rewriting.xml index 765924d4..4b48fcb4 100644 --- a/e2e/fixtures/wxr-base-url-rewriting.xml +++ b/e2e/fixtures/wxr-base-url-rewriting.xml @@ -69,7 +69,7 @@
- +
]]>
diff --git a/e2e/import-wxr.spec.js b/e2e/import-wxr.spec.js
index d458d044..8037cd1e 100644
--- a/e2e/import-wxr.spec.js
+++ b/e2e/import-wxr.spec.js
@@ -8,7 +8,7 @@ const http = require('http');
const fs = require('fs');
// Define available parsers
-const PARSERS = ['simplexml', 'xml', 'regex', 'xmlprocessor'];
+// Set the PARSER environment variable to run the suite for a single parser;
+// when it is unset or empty, every parser listed below is exercised.
+const { PARSER: requestedParser } = process.env;
+const PARSERS = requestedParser ? [requestedParser] : ['simplexml', 'xml', 'regex', 'xmlprocessor'];
let PLAYGROUND_URL = '';
// Run tests for each parser
PARSERS.forEach((parser) => {
@@ -116,7 +116,7 @@ https://playground.internal/path-not-taken was the second best choice.
-
+
`;
expect(normalizeBlockMarkup(normalized.rawContent)).toContain(
diff --git a/src/php-toolkit/DataLiberation/BlockMarkup/class-blockmarkupprocessor.php b/src/php-toolkit/DataLiberation/BlockMarkup/class-blockmarkupprocessor.php
index d65d236b..9e0f9267 100644
--- a/src/php-toolkit/DataLiberation/BlockMarkup/class-blockmarkupprocessor.php
+++ b/src/php-toolkit/DataLiberation/BlockMarkup/class-blockmarkupprocessor.php
@@ -527,6 +527,14 @@ public function next_block_attribute() {
return isset( $this->block_attribute_paths[ $this->block_attribute_index ] );
}
+ /**
+  * Gets the path entry of the currently matched block attribute.
+  *
+  * @return mixed The entry stored in $this->block_attribute_paths at
+  *               $this->block_attribute_index, or false when the paths are
+  *               null or no entry is set at that index. Presumably an array
+  *               of path segments (the URL processor passes it to count());
+  *               verify against where block_attribute_paths is populated.
+  */
+ protected function get_block_attribute_path() {
+     $paths = $this->block_attribute_paths;
+     $index = $this->block_attribute_index;
+
+     // isset() also reports false when the stored entry itself is null.
+     if ( null !== $paths && isset( $paths[ $index ] ) ) {
+         return $paths[ $index ];
+     }
+
+     return false;
+ }
+
/**
* Gets the key of the currently matched block attribute.
*
diff --git a/src/php-toolkit/DataLiberation/BlockMarkup/class-blockmarkupurlprocessor.php b/src/php-toolkit/DataLiberation/BlockMarkup/class-blockmarkupurlprocessor.php
index cedb23c5..4ea5c778 100644
--- a/src/php-toolkit/DataLiberation/BlockMarkup/class-blockmarkupurlprocessor.php
+++ b/src/php-toolkit/DataLiberation/BlockMarkup/class-blockmarkupurlprocessor.php
@@ -132,7 +132,7 @@ private function next_url_in_text_node() {
private function next_url_attribute() {
$tag = $this->get_tag();
- if ( ! array_key_exists( $tag, self::URL_ATTRIBUTES ) ) {
+ if ( ! array_key_exists( $tag, self::HTML_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM ) ) {
return false;
}
@@ -142,7 +142,7 @@ private function next_url_attribute() {
* for the current token. The last element is the attribute we'll
* inspect in the while() loop below.
*/
- $this->inspecting_html_attributes = self::URL_ATTRIBUTES[ $tag ];
+ $this->inspecting_html_attributes = self::HTML_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM[ $tag ];
} else {
/**
* Forget the attribute we've inspected on the previous call to
@@ -184,22 +184,75 @@ private function next_url_attribute() {
private function next_url_block_attribute() {
while ( $this->next_block_attribute() ) {
$url_maybe = $this->get_block_attribute_value();
- /*
- * Do not use base URL for block attributes. to avoid false positives.
- * When a base URL is present, any word is a valid URL relative to the
- * base URL.
- * When a base URL is missing, the string must start with a protocol to
- * be considered a URL.
+ if ( ! is_string( $url_maybe ) ||
+ count( $this->get_block_attribute_path() ) > 1
+ ) {
+ // @TODO: support arrays, objects, and other non-string data structures.
+ continue;
+ }
+
+ /**
+ * Decide whether the current block attribute holds a URL.
+ *
+ * Known URL attributes can be assumed to hold a URL and be
+ * parsed with the base URL. For example, a "/about-us" value
+ * in a wp:navigation-link block's `url` attribute is a
+ * relative URL to the `/about-us` page.
+ *
+ * Other attributes may or may not contain URLs, and we cannot assume
+ * they do: a value like `/about-us` could be a relative URL or a class
+ * name. For those attributes, we ignore relative URLs and only detect
+ * absolute URLs, to avoid treating every string as a URL. This requires
+ * parsing without a base URL.
+ */
+ $is_relative_url_block_attribute = (
+ isset( self::BLOCK_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM[ $this->get_block_name() ] ) &&
+ in_array( $this->get_block_attribute_key(), self::BLOCK_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM[ $this->get_block_name() ], true )
+ );
+
+ /**
+ * Filters whether a block attribute is known to contain a relative URL.
+ *
+ * This filter allows extending the list of block attributes that are
+ * recognized as containing URLs. When a block attribute is marked as
+ * a known URL attribute, it will be parsed with the base URL, allowing
+ * relative URLs to be properly resolved.
+ *
+ * @since 6.8.0
+ *
+ * @param bool $is_relative_url_block_attribute Whether the block attribute is known to contain a relative URL.
+ * @param array $context {
+ * Context information about the block attribute.
+ *
+ * @type string $block_name The name of the block (e.g., 'wp:image', 'wp:button').
+ * @type string $attribute_name The name of the attribute (e.g., 'url', 'href').
+ * }
*/
- if ( is_string( $url_maybe ) ) {
+ $is_relative_url_block_attribute = apply_filters(
+ 'url_processor_is_relative_url_block_attribute',
+ $is_relative_url_block_attribute,
+ array(
+ 'block_name' => $this->get_block_name(),
+ 'attribute_name' => $this->get_block_attribute_key(),
+ )
+ );
+
+ $parsed_url = false;
+ if ( $is_relative_url_block_attribute ) {
+ // Known relative URL attribute – let's parse with the base URL.
+ $parsed_url = WPURL::parse( $url_maybe, $this->base_url_string );
+ } else {
+ // Other attributes – let's parse without a base URL (and only detect absolute URLs).
$parsed_url = WPURL::parse( $url_maybe );
- if ( false !== $parsed_url ) {
- $this->raw_url = $url_maybe;
- $this->parsed_url = $parsed_url;
+ }
- return true;
- }
+ if ( false === $parsed_url ) {
+ continue;
}
+
+ $this->raw_url = $url_maybe;
+ $this->parsed_url = $parsed_url;
+ return true;
}
return false;
@@ -362,6 +415,26 @@ public function get_inspected_attribute_name() {
return $this->inspecting_html_attributes[ count( $this->inspecting_html_attributes ) - 1 ];
}
+ /**
+ * A list of block attributes whose values are always URLs.
+ *
+ * Values of these attributes are parsed against the base URL, so relative
+ * URLs such as "/about-us" are recognized in them. Only list attributes
+ * that never hold anything but a URL: with a base URL, nearly any string
+ * parses as a relative URL. For example, the `linkTarget` attribute of
+ * wp:button holds a link target such as "_blank", so it is not listed.
+ *
+ * It covers WordPress core blocks as of WordPress version 6.9. It can be
+ * extended by plugins and themes via the "url_processor_is_relative_url_block_attribute"
+ * filter.
+ *
+ * @var array
+ */
+ public const BLOCK_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM = array(
+     'wp:button'             => array( 'url' ),
+     'wp:cover'              => array( 'url' ),
+     'wp:embed'              => array( 'url' ),
+     'wp:gallery'            => array( 'url', 'fullUrl' ),
+     'wp:image'              => array( 'url', 'src', 'href' ),
+     'wp:media-text'         => array( 'mediaUrl', 'href' ),
+     'wp:navigation-link'    => array( 'url' ),
+     'wp:navigation-submenu' => array( 'url' ),
+     'wp:rss'                => array( 'feedURL' ),
+ );
/**
* A list of HTML attributes meant to contain URLs, as defined in the HTML specification.
@@ -370,7 +443,7 @@ public function get_inspected_attribute_name() {
* See https://html.spec.whatwg.org/multipage/indices.html#attributes-1.
* See https://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value.
*/
- public const URL_ATTRIBUTES = array(
+ public const HTML_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM = array(
'A' => array( 'href' ),
'APPLET' => array( 'codebase', 'archive' ),
'AREA' => array( 'href' ),
@@ -405,7 +478,7 @@ public function get_inspected_attribute_name() {
* @TODO: Either explicitly support these attributes, or explicitly drop support for
* handling their subsyntax. A generic URL matcher might be good enough.
*/
- public const URL_ATTRIBUTES_WITH_SUBSYNTAX = array(
+ public const HTML_ATTRIBUTES_WITH_SUBSYNTAX_TO_ACCEPT_RELATIVE_URLS_FROM = array(
'*' => array( 'style' ), // background(), background-image().
'APPLET' => array( 'archive' ),
'IMG' => array( 'srcset' ),
@@ -425,7 +498,7 @@ public function get_inspected_attribute_name() {
* @TODO: Either explicitly support these tags, or explicitly drop support for
* handling their subsyntax. A generic URL matcher might be good enough.
*/
- public const URL_CONTAINING_TAGS_WITH_SUBSYNTAX = array(
+ public const HTML_TAGS_WITH_SUBSYNTAX_TO_ACCEPT_RELATIVE_URLS_FROM = array(
'STYLE',
'SCRIPT',
);