Skip to content

Commit 5dcd1d9

Browse files
committed
Replace replacedurl with convertedurl
1 parent c417f2f commit 5dcd1d9

File tree

3 files changed

+254
-66
lines changed

3 files changed

+254
-66
lines changed

components/DataLiberation/BlockMarkup/class-blockmarkupurlprocessor.php

Lines changed: 17 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
use Rowbot\URL\URL;
66
use WordPress\DataLiberation\URL\URLInTextProcessor;
77
use WordPress\DataLiberation\URL\WPURL;
8+
use WordPress\DataLiberation\URL\ConvertedUrl;
89

910
use function WordPress\DataLiberation\URL\urldecode_n;
1011

@@ -256,87 +257,37 @@ public function set_url( $raw_url, $parsed_url ) {
256257
* by this WPURL_In_Text_Processor class so maybe the two do go hand in hand?
257258
*/
258259
public function replace_base_url( URL $to_url, ?URL $base_url = null ) {
259-
$updated_url = clone $this->get_parsed_url();
260-
261-
$updated_url->hostname = $to_url->hostname;
262-
$updated_url->protocol = $to_url->protocol;
263-
$updated_url->port = $to_url->port;
264-
265-
// Update the pathname if needed.
266-
$from_url = $this->get_parsed_url();
267-
$from_pathname = $from_url->pathname;
268-
$to_pathname = $to_url->pathname;
269-
270260
$base_url = $base_url ?? $this->base_url_object;
271-
if ( $base_url->pathname !== $to_pathname ) {
272-
$base_pathname_with_trailing_slash = rtrim( $base_url->pathname, '/' ) . '/';
273-
$decoded_matched_pathname = urldecode_n(
274-
$from_pathname,
275-
strlen( $base_pathname_with_trailing_slash )
276-
);
277-
$to_pathname_with_trailing_slash = rtrim( $to_pathname, '/' ) . '/';
278-
$remaining_pathname =
279-
substr(
280-
$decoded_matched_pathname,
281-
strlen( $base_pathname_with_trailing_slash )
282-
);
283-
284-
$updated_url->pathname = $to_pathname_with_trailing_slash . $remaining_pathname;
261+
// Neither an explicit base URL nor a default one on this processor:
// there is nothing to rebase against, so report failure.
if ( ! $base_url ) {
262+
return false;
285263
}
286264

287-
/*
288-
* Stylistic choice – if the updated URL has no trailing slash,
289-
* do not add it to the new URL. The WHATWG URL parser will
290-
* add one automatically if the path is empty, so we have to
291-
* explicitly remove it.
292-
*/
293-
$new_raw_url = $updated_url->toString();
294265
if (
295-
'/' !== $from_url->pathname[ strlen( $from_url->pathname ) - 1 ] &&
296-
'/' !== $from_url->pathname &&
297-
'' === $from_url->search &&
298-
'' === $from_url->hash
266+
'#text' === $this->get_token_type() &&
267+
! WPURL::can_parse( $this->get_raw_url() )
299268
) {
300-
$new_raw_url = rtrim( $new_raw_url, '/' );
301-
}
302-
if ( ! $new_raw_url ) {
303-
// @TODO: When does this happen? Let's add the test coverage and
304-
// doubly verify the logic.
269+
// In text nodes, only convert absolute URLs.
305270
return false;
306271
}
307272

308-
if ( ! $this->is_url_relative() ) {
309-
$this->set_url( $new_raw_url, $updated_url );
310-
311-
return true;
312-
}
273+
// Delegate the rewrite to WPURL::replace_base_url(). The raw URL is passed
// along so that it can tell whether the URL was originally written as a
// relative one.
$result = WPURL::replace_base_url(
274+
$this->get_parsed_url(),
275+
array(
276+
'old_base_url' => $base_url,
277+
'new_base_url' => $to_url,
278+
'raw_url' => $this->get_raw_url(),
279+
)
280+
);
313281

314-
$new_relative_url = $updated_url->pathname;
315-
if ( '' !== $updated_url->search ) {
316-
$new_relative_url .= $updated_url->search;
317-
}
318-
if ( '' !== $updated_url->hash ) {
319-
$new_relative_url .= $updated_url->hash;
282+
// WPURL::replace_base_url() returns false when parsing or replacement
// cannot be performed.
if ( false === $result ) {
283+
return false;
320284
}
321285

322-
$this->set_url( $new_relative_url, $updated_url );
286+
// Casting the ConvertedUrl result to a string yields the relative form when
// the original URL was relative and the absolute form otherwise.
$this->set_url( (string) $result, $result->getConvertedUrl() );
323287

324288
return true;
325289
}
326290

327-
/**
328-
* Returns true if the currently matched URL is relative.
*
* A URL is treated as relative when WPURL::can_parse() rejects the raw URL
* string on its own (no base URL is supplied) and the current token is not
* a `#text` node.
329-
*
330-
* @return bool Whether the currently matched URL is relative.
331-
*/
332-
public function is_url_relative() {
333-
return (
334-
! WPURL::can_parse( $this->get_raw_url() ) &&
335-
// only absolute URLs are detected in text nodes.
336-
'#text' !== $this->get_token_type()
337-
);
338-
}
339-
340291
/**
341292
* Returns true if the currently matched URL is absolute.
342293
*
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
<?php
2+
3+
namespace WordPress\DataLiberation\URL;
4+
5+
use Rowbot\URL\URL;
6+
7+
/**
 * Result object produced by WPURL::replace_base_url().
 *
 * Carries the rewritten URL as a parsed object together with its string forms.
 * Casting an instance to a string gives the form that matches how the original
 * URL was written: the relative string when the input was relative, the
 * absolute string otherwise.
 */
class ConvertedUrl {

	/** @var URL Parsed URL after the base replacement. */
	private $url;

	/** @var string Absolute string form of the updated URL. */
	private $string;

	/** @var string|null Relative string form, or null when none was supplied. */
	private $relative_string;

	/** @var bool Whether the input URL was relative. */
	private $was_relative;

	/**
	 * @param URL         $url             Parsed updated URL.
	 * @param string      $string          Absolute updated URL string.
	 * @param string|null $relative_string Relative updated URL string, if already known.
	 * @param bool        $was_relative    Whether the input URL was relative.
	 */
	public function __construct( URL $url, string $string, ?string $relative_string, bool $was_relative ) {
		$this->was_relative    = $was_relative;
		$this->relative_string = $relative_string;
		$this->string          = $string;
		$this->url             = $url;
	}

	/**
	 * String form of the updated URL: relative if the input was relative,
	 * absolute otherwise.
	 */
	public function __toString(): string {
		return $this->was_relative
			? $this->getRelativeString()
			: $this->getString();
	}

	/**
	 * The updated URL as a parsed URL object.
	 */
	public function getConvertedUrl(): URL {
		return $this->url;
	}

	/**
	 * Tells whether the input URL was relative.
	 */
	public function wasRelative(): bool {
		return $this->was_relative;
	}

	/**
	 * The updated URL as an absolute string.
	 */
	public function getString(): string {
		return $this->string;
	}

	/**
	 * The updated URL as a relative string.
	 *
	 * Uses the relative string given to the constructor when there is one;
	 * otherwise joins the pathname, search and hash of the parsed URL,
	 * skipping the search and hash components when they are empty.
	 */
	public function getRelativeString(): ?string {
		if ( null === $this->relative_string ) {
			$pieces = array( $this->url->pathname );
			foreach ( array( 'search', 'hash' ) as $component ) {
				$value = $this->url->{$component};
				if ( '' !== $value ) {
					$pieces[] = $value;
				}
			}

			return implode( '', $pieces );
		}

		return $this->relative_string;
	}
}

components/DataLiberation/URL/class-wpurl.php

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,159 @@ public static function can_parse( $url, $base = null ) {
2525
return URL::canParse( $url, $base );
2626
}
2727

28+
/**
 * Replaces the "base" of a URL — scheme, host (and port), and the portion of the path that
 * belongs to the old base — with a new base while keeping the remainder of the URL intact.
 *
 * This is intended for content migrations, where URLs embedded in block markup, HTML attributes,
 * or inline text must be moved from one site root to another without losing the rest of the path,
 * query, or fragment. It handles simple domain swaps, ports, and deep path bases. When the old
 * base includes path segments, only that matched prefix is substituted and the unmatched tail is
 * carried over to the target base.
 *
 * For example:
 * * URL:      https://example.com/a/b/c/d/e/f/g/h/i/j/page/
 * * Old base: https://example.com/a/b/c/d/e/f/
 * * New base: https://example.org/docs/
 * * Result:   https://example.org/docs/g/h/i/j/page/
 *
 * ## Trailing slash handling
 *
 * Trailing slash style is preserved from the original URL. If it has no trailing slash, the
 * result will also omit the trailing slash and vice versa.
 *
 * For example, here the final result has no trailing slash:
 * * URL:      https://example.com/uploads/file.txt
 * * Old base: https://example.com/uploads/
 * * New base: https://example.org/docs/
 * * Result:   https://example.org/docs/file.txt
 *
 * And here it does:
 * * URL:      https://example.com/uploads/2018/
 * * Old base: https://example.com/uploads/
 * * New base: https://example.org/docs/
 * * Result:   https://example.org/docs/2018/
 *
 * ## URL-encoded path segments
 *
 * URL-encoded path segments are respected and not inadvertently decoded or re-encoded. Only the
 * matched base prefix is considered for alignment, so inputs that contain percent-encoded content
 * keep that content exactly as-is in the output. This prevents data corruption in tricky cases such
 * as "/~jappleseed/1997.10.1/%2561-reasons-to-migrate-data/" where the "%2561" must remain
 * double-escaped after the move.
 *
 * ## Relative URLs
 *
 * This method can preserve the relative nature of the original URL. Say you are processing a markup
 * that contains `<a href="/uploads/file.txt">`. The original URL string is "/uploads/file.txt",
 * and the URL actually resolves to "https://example.com/uploads/file.txt". If you want to replace
 * the base URL from "https://example.com/uploads/" to "https://example.org/files/" but keep the
 * URL relative, you can pass the raw URL string via the "raw_url" option.
 *
 * For example:
 * * URL:      https://example.com/uploads/file.txt
 * * Raw URL:  /uploads/file.txt
 * * Old base: https://example.com/uploads/
 * * New base: https://example.org/files/
 * * Result:   /files/file.txt
 *
 * When "raw_url" is a string, the URL is considered relative if that string cannot be parsed
 * as a URL on its own. You may instead assert relativity explicitly with the "is_relative"
 * option, which takes precedence over the inference from "raw_url". When neither option is
 * given, the URL is treated as absolute.
 *
 * @param string|URL $url     The URL to replace the base of.
 * @param array      $options {
 *     Associative options.
 *
 *     @type string|URL $old_base_url Required. The base the URL currently lives under.
 *     @type string|URL $new_base_url Required. The base to move the URL to.
 *     @type string     $raw_url      Optional. The URL exactly as written in the source; used
 *                                    to infer whether the original was relative.
 *     @type bool       $is_relative  Optional. Explicitly states whether the original URL was
 *                                    relative; overrides the inference from "raw_url".
 * }
 * @return ConvertedUrl|false Returns a ConvertedUrl value object on success, or false when parsing
 *                            or replacement cannot be performed.
 */
public static function replace_base_url( $url, $options ) {
	if ( ! is_array( $options ) ) {
		return false;
	}

	// Both bases are mandatory; a missing key and an explicit null are treated alike.
	foreach ( array( 'old_base_url', 'new_base_url' ) as $required ) {
		if ( ! isset( $options[ $required ] ) ) {
			return false;
		}
	}

	$old_base_url = self::parse( $options['old_base_url'] );
	$new_base_url = self::parse( $options['new_base_url'] );
	// The URL is parsed against the old base so that relative inputs resolve.
	$url = self::parse( $url, $old_base_url ? $old_base_url->toString() : null );

	if ( false === $old_base_url || false === $new_base_url || false === $url ) {
		return false;
	}

	$updated_url = clone $url;

	$updated_url->hostname = $new_base_url->hostname;
	$updated_url->protocol = $new_base_url->protocol;
	$updated_url->port     = $new_base_url->port;

	$from_pathname = $url->pathname;
	$to_pathname   = $new_base_url->pathname;
	$base_pathname = $old_base_url->pathname;

	/*
	 * NOTE(review): nothing here verifies that $url actually lives under
	 * $old_base_url (same host, path starting with the base path). The prefix
	 * is cut off by length alone, so callers are presumably expected to have
	 * matched the URL against the old base beforehand — confirm.
	 */
	if ( $base_pathname !== $to_pathname ) {
		$base_pathname_with_trailing_slash = rtrim( $base_pathname, '/' ) . '/';
		$base_prefix_length                = strlen( $base_pathname_with_trailing_slash );

		// urldecode_n() is defined elsewhere; presumably it decodes only as much of
		// the path as is needed to cover the base prefix — TODO confirm.
		$decoded_matched_pathname = urldecode_n( $from_pathname, $base_prefix_length );

		$to_pathname_with_trailing_slash = rtrim( $to_pathname, '/' ) . '/';
		$remaining_pathname              = substr( $decoded_matched_pathname, $base_prefix_length );

		$updated_url->pathname = $to_pathname_with_trailing_slash . $remaining_pathname;
	}

	/*
	 * Stylistic choice – if the original URL has no trailing slash,
	 * do not add it to the new URL. The WHATWG URL parser will
	 * add one automatically if the path is empty, so we have to
	 * explicitly remove it.
	 *
	 * A pathname of exactly "/" ends with a slash, so it is already
	 * excluded by the last-character check.
	 */
	$new_raw_url                = $updated_url->toString();
	$should_trim_trailing_slash = (
		'' !== $from_pathname &&
		'/' !== substr( $from_pathname, -1 ) &&
		'' === $url->search &&
		'' === $url->hash
	);
	if ( $should_trim_trailing_slash ) {
		$new_raw_url = rtrim( $new_raw_url, '/' );
	}
	if ( ! $new_raw_url ) {
		return false;
	}

	// Decide whether the original was relative: an explicit "is_relative" wins,
	// otherwise infer from "raw_url", otherwise assume an absolute URL.
	if ( isset( $options['is_relative'] ) ) {
		$was_relative = (bool) $options['is_relative'];
	} elseif ( isset( $options['raw_url'] ) && is_string( $options['raw_url'] ) ) {
		$was_relative = ! self::can_parse( $options['raw_url'] );
	} else {
		$was_relative = false;
	}

	// For relative inputs, precompute the path-absolute form (path + query + fragment).
	$relative_url = null;
	if ( $was_relative ) {
		$relative_url = $updated_url->pathname;
		if ( '' !== $updated_url->search ) {
			$relative_url .= $updated_url->search;
		}
		if ( '' !== $updated_url->hash ) {
			$relative_url .= $updated_url->hash;
		}
	}

	return new ConvertedUrl( $updated_url, $new_raw_url, $relative_url, $was_relative );
}
180+
28181
/**
29182
* Prepends a protocol to any matched URL without the double slash.
30183
*

0 commit comments

Comments
 (0)