Skip to content

Commit 5a31215

Browse files
zwpaper, pochi-agent, autofix-ci[bot]
authored
fix(crawler): append URL-encoded section titles as fragments to llms.txt URLs (#4338)
* feat(crawler): append URL-encoded section titles as fragments to llms.txt URLs This enhancement ensures that each section in llms.txt files gets a unique URL by appending the URL-encoded section title as a fragment. This prevents URL collisions when multiple sections share the same base URL and improves navigation within crawled documents. Changes: - Add percent-encoding dependency for URL encoding - Modify split_llms_content to append encoded titles as URL fragments - Update metadata description to use base URL instead of full URL - Add comprehensive tests for the new fragment URL behavior 🤖 Generated with [Pochi](https://getpochi.com) Co-Authored-By: Pochi <[email protected]> * [autofix.ci] apply automated fixes --------- Co-authored-by: Pochi <[email protected]> Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
1 parent a44447d commit 5a31215

File tree

3 files changed

+70
-14
lines changed

3 files changed

+70
-14
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/tabby-crawler/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ tokio = { workspace = true, features = ["io-util", "process", "rt"] }
1010
anyhow.workspace = true
1111
tracing.workspace = true
1212
url.workspace = true
13+
percent-encoding = "2.3"
1314
readable-readability = "0.4.0"
1415
futures.workspace = true
1516
async-stream.workspace = true

crates/tabby-crawler/src/llms_txt_parser.rs

Lines changed: 68 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};
2+
13
use crate::types::{CrawledDocument, CrawledMetadata};
24

35
pub fn split_llms_content(content: &str, base_url: &str) -> Vec<CrawledDocument> {
@@ -13,10 +15,13 @@ pub fn split_llms_content(content: &str, base_url: &str) -> Vec<CrawledDocument>
1315
// If we already have a section in progress, finalize it.
1416
if let Some(title) = current_title.take() {
1517
// Use the URL from the section if available; otherwise, fallback to base_url.
16-
let url = current_url.take().unwrap_or_else(|| base_url.to_owned());
18+
let base_url_str = current_url.take().unwrap_or_else(|| base_url.to_owned());
19+
// URL-encode the title and append it as a fragment
20+
let encoded_title = utf8_percent_encode(&title, NON_ALPHANUMERIC).to_string();
21+
let url = format!("{base_url_str}#{encoded_title}");
1722
let metadata = CrawledMetadata {
1823
title: title.into(),
19-
description: url.clone().into(),
24+
description: base_url_str.into(),
2025
};
2126
docs.push(CrawledDocument::new(
2227
url,
@@ -39,10 +44,13 @@ pub fn split_llms_content(content: &str, base_url: &str) -> Vec<CrawledDocument>
3944

4045
// Finalize the last section if any.
4146
if let Some(title) = current_title {
42-
let url = current_url.unwrap_or_else(|| base_url.to_owned());
47+
let base_url_str = current_url.unwrap_or_else(|| base_url.to_owned());
48+
// URL-encode the title and append it as a fragment
49+
let encoded_title = utf8_percent_encode(&title, NON_ALPHANUMERIC).to_string();
50+
let url = format!("{base_url_str}#{encoded_title}");
4351
let metadata = CrawledMetadata {
4452
title: title.into(),
45-
description: url.clone().into(),
53+
description: base_url_str.into(),
4654
};
4755
docs.push(CrawledDocument::new(
4856
url,
@@ -74,8 +82,11 @@ More text on the same section.
7482
let doc = &docs[0];
7583
// The title is taken from the heading.
7684
assert_eq!(doc.metadata.title, Some("Test Title with URL".to_string()));
77-
// The URL should be extracted from the URL: line.
78-
assert_eq!(doc.url, "https://developers.cloudflare.com");
85+
// The URL should be extracted from the URL: line with encoded title appended.
86+
assert_eq!(
87+
doc.url,
88+
"https://developers.cloudflare.com#Test%20Title%20with%20URL"
89+
);
7990
// The body should contain the text after the URL line.
8091
assert_eq!(
8192
doc.markdown,
@@ -101,8 +112,11 @@ Line two of body.
101112
doc.metadata.title,
102113
Some("Test Title with Source".to_string())
103114
);
104-
// The URL should be extracted from the Source: line.
105-
assert_eq!(doc.url, "https://docs.perplexity.ai");
115+
// The URL should be extracted from the Source: line with encoded title appended.
116+
assert_eq!(
117+
doc.url,
118+
"https://docs.perplexity.ai#Test%20Title%20with%20Source"
119+
);
106120
assert_eq!(
107121
doc.markdown,
108122
"This is another test body.\nLine two of body."
@@ -126,8 +140,11 @@ Additional content line.
126140
doc.metadata.title,
127141
Some("Test Title without URL or Source".to_string())
128142
);
129-
// Fallback to the provided base_url.
130-
assert_eq!(doc.url, "example.com");
143+
// Fallback to the provided base_url with encoded title appended.
144+
assert_eq!(
145+
doc.url,
146+
"example.com#Test%20Title%20without%20URL%20or%20Source"
147+
);
131148
assert_eq!(
132149
doc.markdown,
133150
"This is test body with no explicit URL.\nAdditional content line."
@@ -156,22 +173,59 @@ Content for section three with no metadata.
156173
// Section One.
157174
let doc1 = &docs[0];
158175
assert_eq!(doc1.metadata.title, Some("Section One".to_string()));
159-
assert_eq!(doc1.url, "https://developers.cloudflare.com");
176+
assert_eq!(doc1.url, "https://developers.cloudflare.com#Section%20One");
160177
assert!(doc1.markdown.contains("Content for section one."));
161178

162179
// Section Two.
163180
let doc2 = &docs[1];
164181
assert_eq!(doc2.metadata.title, Some("Section Two".to_string()));
165-
assert_eq!(doc2.url, "https://docs.perplexity.ai");
182+
assert_eq!(doc2.url, "https://docs.perplexity.ai#Section%20Two");
166183
assert!(doc2.markdown.contains("Content for section two."));
167184

168185
// Section Three.
169186
let doc3 = &docs[2];
170187
assert_eq!(doc3.metadata.title, Some("Section Three".to_string()));
171-
// Since no URL/Source is provided, fallback to base_url.
172-
assert_eq!(doc3.url, "example.com");
188+
// Since no URL/Source is provided, fallback to base_url with encoded title appended.
189+
assert_eq!(doc3.url, "example.com#Section%20Three");
173190
assert!(doc3
174191
.markdown
175192
.contains("Content for section three with no metadata."));
176193
}
194+
195+
#[test]
196+
fn test_base_url_only_generates_unique_fragment_links() {
197+
// Test that when only base URL is available, each section gets a unique link with fragment
198+
let content = "\
199+
# Getting Started
200+
This is the getting started guide.
201+
202+
# Configuration
203+
This explains how to configure the system.
204+
205+
# Advanced Usage
206+
Advanced usage scenarios.
207+
";
208+
let base_url = "https://docs.example.com";
209+
let docs = split_llms_content(content, base_url);
210+
assert_eq!(docs.len(), 3, "Should produce three documents");
211+
212+
let doc1 = &docs[0];
213+
let doc2 = &docs[1];
214+
let doc3 = &docs[2];
215+
216+
// All should use base URL with encoded title fragments
217+
assert_eq!(doc1.url, "https://docs.example.com#Getting%20Started");
218+
assert_eq!(doc2.url, "https://docs.example.com#Configuration");
219+
assert_eq!(doc3.url, "https://docs.example.com#Advanced%20Usage");
220+
221+
// All URLs should be different despite same base URL
222+
assert_ne!(doc1.url, doc2.url);
223+
assert_ne!(doc2.url, doc3.url);
224+
assert_ne!(doc1.url, doc3.url);
225+
226+
// All should have the same description (base URL)
227+
assert_eq!(doc1.metadata.description, Some(base_url.to_string()));
228+
assert_eq!(doc2.metadata.description, Some(base_url.to_string()));
229+
assert_eq!(doc3.metadata.description, Some(base_url.to_string()));
230+
}
177231
}

0 commit comments

Comments (0)