1
+ use percent_encoding:: { utf8_percent_encode, NON_ALPHANUMERIC } ;
2
+
1
3
use crate :: types:: { CrawledDocument , CrawledMetadata } ;
2
4
3
5
pub fn split_llms_content ( content : & str , base_url : & str ) -> Vec < CrawledDocument > {
@@ -13,10 +15,13 @@ pub fn split_llms_content(content: &str, base_url: &str) -> Vec<CrawledDocument>
13
15
// If we already have a section in progress, finalize it.
14
16
if let Some ( title) = current_title. take ( ) {
15
17
// Use the URL from the section if available; otherwise, fallback to base_url.
16
- let url = current_url. take ( ) . unwrap_or_else ( || base_url. to_owned ( ) ) ;
18
+ let base_url_str = current_url. take ( ) . unwrap_or_else ( || base_url. to_owned ( ) ) ;
19
+ // URL-encode the title and append it as a fragment
20
+ let encoded_title = utf8_percent_encode ( & title, NON_ALPHANUMERIC ) . to_string ( ) ;
21
+ let url = format ! ( "{base_url_str}#{encoded_title}" ) ;
17
22
let metadata = CrawledMetadata {
18
23
title : title. into ( ) ,
19
- description : url . clone ( ) . into ( ) ,
24
+ description : base_url_str . into ( ) ,
20
25
} ;
21
26
docs. push ( CrawledDocument :: new (
22
27
url,
@@ -39,10 +44,13 @@ pub fn split_llms_content(content: &str, base_url: &str) -> Vec<CrawledDocument>
39
44
40
45
// Finalize the last section if any.
41
46
if let Some ( title) = current_title {
42
- let url = current_url. unwrap_or_else ( || base_url. to_owned ( ) ) ;
47
+ let base_url_str = current_url. unwrap_or_else ( || base_url. to_owned ( ) ) ;
48
+ // URL-encode the title and append it as a fragment
49
+ let encoded_title = utf8_percent_encode ( & title, NON_ALPHANUMERIC ) . to_string ( ) ;
50
+ let url = format ! ( "{base_url_str}#{encoded_title}" ) ;
43
51
let metadata = CrawledMetadata {
44
52
title : title. into ( ) ,
45
- description : url . clone ( ) . into ( ) ,
53
+ description : base_url_str . into ( ) ,
46
54
} ;
47
55
docs. push ( CrawledDocument :: new (
48
56
url,
@@ -74,8 +82,11 @@ More text on the same section.
74
82
let doc = & docs[ 0 ] ;
75
83
// The title is taken from the heading.
76
84
assert_eq ! ( doc. metadata. title, Some ( "Test Title with URL" . to_string( ) ) ) ;
77
- // The URL should be extracted from the URL: line.
78
- assert_eq ! ( doc. url, "https://developers.cloudflare.com" ) ;
85
+ // The URL should be extracted from the URL: line with encoded title appended.
86
+ assert_eq ! (
87
+ doc. url,
88
+ "https://developers.cloudflare.com#Test%20Title%20with%20URL"
89
+ ) ;
79
90
// The body should contain the text after the URL line.
80
91
assert_eq ! (
81
92
doc. markdown,
@@ -101,8 +112,11 @@ Line two of body.
101
112
doc. metadata. title,
102
113
Some ( "Test Title with Source" . to_string( ) )
103
114
) ;
104
- // The URL should be extracted from the Source: line.
105
- assert_eq ! ( doc. url, "https://docs.perplexity.ai" ) ;
115
+ // The URL should be extracted from the Source: line with encoded title appended.
116
+ assert_eq ! (
117
+ doc. url,
118
+ "https://docs.perplexity.ai#Test%20Title%20with%20Source"
119
+ ) ;
106
120
assert_eq ! (
107
121
doc. markdown,
108
122
"This is another test body.\n Line two of body."
@@ -126,8 +140,11 @@ Additional content line.
126
140
doc. metadata. title,
127
141
Some ( "Test Title without URL or Source" . to_string( ) )
128
142
) ;
129
- // Fallback to the provided base_url.
130
- assert_eq ! ( doc. url, "example.com" ) ;
143
+ // Fallback to the provided base_url with encoded title appended.
144
+ assert_eq ! (
145
+ doc. url,
146
+ "example.com#Test%20Title%20without%20URL%20or%20Source"
147
+ ) ;
131
148
assert_eq ! (
132
149
doc. markdown,
133
150
"This is test body with no explicit URL.\n Additional content line."
@@ -156,22 +173,59 @@ Content for section three with no metadata.
156
173
// Section One.
157
174
let doc1 = & docs[ 0 ] ;
158
175
assert_eq ! ( doc1. metadata. title, Some ( "Section One" . to_string( ) ) ) ;
159
- assert_eq ! ( doc1. url, "https://developers.cloudflare.com" ) ;
176
+ assert_eq ! ( doc1. url, "https://developers.cloudflare.com#Section%20One" ) ;
160
177
assert ! ( doc1. markdown. contains( "Content for section one." ) ) ;
161
178
162
179
// Section Two.
163
180
let doc2 = & docs[ 1 ] ;
164
181
assert_eq ! ( doc2. metadata. title, Some ( "Section Two" . to_string( ) ) ) ;
165
- assert_eq ! ( doc2. url, "https://docs.perplexity.ai" ) ;
182
+ assert_eq ! ( doc2. url, "https://docs.perplexity.ai#Section%20Two" ) ;
166
183
assert ! ( doc2. markdown. contains( "Content for section two." ) ) ;
167
184
168
185
// Section Three.
169
186
let doc3 = & docs[ 2 ] ;
170
187
assert_eq ! ( doc3. metadata. title, Some ( "Section Three" . to_string( ) ) ) ;
171
- // Since no URL/Source is provided, fallback to base_url.
172
- assert_eq ! ( doc3. url, "example.com" ) ;
188
+ // Since no URL/Source is provided, fallback to base_url with encoded title appended.
189
+ assert_eq ! ( doc3. url, "example.com#Section%20Three" ) ;
173
190
assert ! ( doc3
174
191
. markdown
175
192
. contains( "Content for section three with no metadata." ) ) ;
176
193
}
194
+
195
+ #[ test]
196
+ fn test_base_url_only_generates_unique_fragment_links ( ) {
197
+ // With no URL:/Source: line in any section, every document falls back to base_url, so the percent-encoded title fragment is the only thing that makes each link unique.
198
+ let content = "\
199
+ # Getting Started
200
+ This is the getting started guide.
201
+
202
+ # Configuration
203
+ This explains how to configure the system.
204
+
205
+ # Advanced Usage
206
+ Advanced usage scenarios.
207
+ " ;
208
+ let base_url = "https://docs.example.com" ;
209
+ let docs = split_llms_content ( content, base_url) ;
210
+ assert_eq ! ( docs. len( ) , 3 , "Should produce three documents" ) ;
211
+
212
+ let doc1 = & docs[ 0 ] ;
213
+ let doc2 = & docs[ 1 ] ;
214
+ let doc3 = & docs[ 2 ] ;
215
+
216
+ // Each URL is base_url followed by '#' and the percent-encoded title (spaces become %20; a one-word title is unchanged).
217
+ assert_eq ! ( doc1. url, "https://docs.example.com#Getting%20Started" ) ;
218
+ assert_eq ! ( doc2. url, "https://docs.example.com#Configuration" ) ;
219
+ assert_eq ! ( doc3. url, "https://docs.example.com#Advanced%20Usage" ) ;
220
+
221
+ // The three URLs must be pairwise distinct even though they share the same base URL.
222
+ assert_ne ! ( doc1. url, doc2. url) ;
223
+ assert_ne ! ( doc2. url, doc3. url) ;
224
+ assert_ne ! ( doc1. url, doc3. url) ;
225
+
226
+ // The description carries the bare base URL, without the title fragment, so it is identical for all three documents.
227
+ assert_eq ! ( doc1. metadata. description, Some ( base_url. to_string( ) ) ) ;
228
+ assert_eq ! ( doc2. metadata. description, Some ( base_url. to_string( ) ) ) ;
229
+ assert_eq ! ( doc3. metadata. description, Some ( base_url. to_string( ) ) ) ;
230
+ }
177
231
}
0 commit comments