Skip to content

Commit 5a31215

Browse files
zwpaper, pochi-agent, autofix-ci[bot]
authored
fix(crawler): append URL-encoded section titles as fragments to llms.txt URLs (#4338)
* feat(crawler): append URL-encoded section titles as fragments to llms.txt URLs This enhancement ensures that each section in llms.txt files gets a unique URL by appending the URL-encoded section title as a fragment. This prevents URL collisions when multiple sections share the same base URL and improves navigation within crawled documents. Changes: - Add percent-encoding dependency for URL encoding - Modify split_llms_content to append encoded titles as URL fragments - Update metadata description to use base URL instead of full URL - Add comprehensive tests for the new fragment URL behavior 🤖 Generated with [Pochi](https://getpochi.com) Co-Authored-By: Pochi <[email protected]> * [autofix.ci] apply automated fixes --------- Co-authored-by: Pochi <[email protected]> Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
1 parent a44447d commit 5a31215

File tree

3 files changed

+70
-14
lines changed

3 files changed

+70
-14
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/tabby-crawler/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ tokio = { workspace = true, features = ["io-util", "process", "rt"] }
1010
anyhow.workspace = true
1111
tracing.workspace = true
1212
url.workspace = true
13+
percent-encoding = "2.3"
1314
readable-readability = "0.4.0"
1415
futures.workspace = true
1516
async-stream.workspace = true

crates/tabby-crawler/src/llms_txt_parser.rs

Lines changed: 68 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};
2+
13
use crate::types::{CrawledDocument, CrawledMetadata};
24

35
pub fn split_llms_content(content: &str, base_url: &str) -> Vec<CrawledDocument> {
@@ -13,10 +15,13 @@ pub fn split_llms_content(content: &str, base_url: &str) -> Vec<CrawledDocument>
1315
// If we already have a section in progress, finalize it.
1416
if let Some(title) = current_title.take() {
1517
// Use the URL from the section if available; otherwise, fallback to base_url.
16-
let url = current_url.take().unwrap_or_else(|| base_url.to_owned());
18+
let base_url_str = current_url.take().unwrap_or_else(|| base_url.to_owned());
19+
// URL-encode the title and append it as a fragment
20+
let encoded_title = utf8_percent_encode(&title, NON_ALPHANUMERIC).to_string();
21+
let url = format!("{base_url_str}#{encoded_title}");
1722
let metadata = CrawledMetadata {
1823
title: title.into(),
19-
description: url.clone().into(),
24+
description: base_url_str.into(),
2025
};
2126
docs.push(CrawledDocument::new(
2227
url,
@@ -39,10 +44,13 @@ pub fn split_llms_content(content: &str, base_url: &str) -> Vec<CrawledDocument>
3944

4045
// Finalize the last section if any.
4146
if let Some(title) = current_title {
42-
let url = current_url.unwrap_or_else(|| base_url.to_owned());
47+
let base_url_str = current_url.unwrap_or_else(|| base_url.to_owned());
48+
// URL-encode the title and append it as a fragment
49+
let encoded_title = utf8_percent_encode(&title, NON_ALPHANUMERIC).to_string();
50+
let url = format!("{base_url_str}#{encoded_title}");
4351
let metadata = CrawledMetadata {
4452
title: title.into(),
45-
description: url.clone().into(),
53+
description: base_url_str.into(),
4654
};
4755
docs.push(CrawledDocument::new(
4856
url,
@@ -74,8 +82,11 @@ More text on the same section.
7482
let doc = &docs[0];
7583
// The title is taken from the heading.
7684
assert_eq!(doc.metadata.title, Some("Test Title with URL".to_string()));
77-
// The URL should be extracted from the URL: line.
78-
assert_eq!(doc.url, "https://developers.cloudflare.com");
85+
// The URL should be extracted from the URL: line with encoded title appended.
86+
assert_eq!(
87+
doc.url,
88+
"https://developers.cloudflare.com#Test%20Title%20with%20URL"
89+
);
7990
// The body should contain the text after the URL line.
8091
assert_eq!(
8192
doc.markdown,
@@ -101,8 +112,11 @@ Line two of body.
101112
doc.metadata.title,
102113
Some("Test Title with Source".to_string())
103114
);
104-
// The URL should be extracted from the Source: line.
105-
assert_eq!(doc.url, "https://docs.perplexity.ai");
115+
// The URL should be extracted from the Source: line with encoded title appended.
116+
assert_eq!(
117+
doc.url,
118+
"https://docs.perplexity.ai#Test%20Title%20with%20Source"
119+
);
106120
assert_eq!(
107121
doc.markdown,
108122
"This is another test body.\nLine two of body."
@@ -126,8 +140,11 @@ Additional content line.
126140
doc.metadata.title,
127141
Some("Test Title without URL or Source".to_string())
128142
);
129-
// Fallback to the provided base_url.
130-
assert_eq!(doc.url, "example.com");
143+
// Fallback to the provided base_url with encoded title appended.
144+
assert_eq!(
145+
doc.url,
146+
"example.com#Test%20Title%20without%20URL%20or%20Source"
147+
);
131148
assert_eq!(
132149
doc.markdown,
133150
"This is test body with no explicit URL.\nAdditional content line."
@@ -156,22 +173,59 @@ Content for section three with no metadata.
156173
// Section One.
157174
let doc1 = &docs[0];
158175
assert_eq!(doc1.metadata.title, Some("Section One".to_string()));
159-
assert_eq!(doc1.url, "https://developers.cloudflare.com");
176+
assert_eq!(doc1.url, "https://developers.cloudflare.com#Section%20One");
160177
assert!(doc1.markdown.contains("Content for section one."));
161178

162179
// Section Two.
163180
let doc2 = &docs[1];
164181
assert_eq!(doc2.metadata.title, Some("Section Two".to_string()));
165-
assert_eq!(doc2.url, "https://docs.perplexity.ai");
182+
assert_eq!(doc2.url, "https://docs.perplexity.ai#Section%20Two");
166183
assert!(doc2.markdown.contains("Content for section two."));
167184

168185
// Section Three.
169186
let doc3 = &docs[2];
170187
assert_eq!(doc3.metadata.title, Some("Section Three".to_string()));
171-
// Since no URL/Source is provided, fallback to base_url.
172-
assert_eq!(doc3.url, "example.com");
188+
// Since no URL/Source is provided, fallback to base_url with encoded title appended.
189+
assert_eq!(doc3.url, "example.com#Section%20Three");
173190
assert!(doc3
174191
.markdown
175192
.contains("Content for section three with no metadata."));
176193
}
194+
195+
#[test]
196+
fn test_base_url_only_generates_unique_fragment_links() {
197+
// Test that when only base URL is available, each section gets a unique link with fragment
198+
let content = "\
199+
# Getting Started
200+
This is the getting started guide.
201+
202+
# Configuration
203+
This explains how to configure the system.
204+
205+
# Advanced Usage
206+
Advanced usage scenarios.
207+
";
208+
let base_url = "https://docs.example.com";
209+
let docs = split_llms_content(content, base_url);
210+
assert_eq!(docs.len(), 3, "Should produce three documents");
211+
212+
let doc1 = &docs[0];
213+
let doc2 = &docs[1];
214+
let doc3 = &docs[2];
215+
216+
// All should use base URL with encoded title fragments
217+
assert_eq!(doc1.url, "https://docs.example.com#Getting%20Started");
218+
assert_eq!(doc2.url, "https://docs.example.com#Configuration");
219+
assert_eq!(doc3.url, "https://docs.example.com#Advanced%20Usage");
220+
221+
// All URLs should be different despite same base URL
222+
assert_ne!(doc1.url, doc2.url);
223+
assert_ne!(doc2.url, doc3.url);
224+
assert_ne!(doc1.url, doc3.url);
225+
226+
// All should have the same description (base URL)
227+
assert_eq!(doc1.metadata.description, Some(base_url.to_string()));
228+
assert_eq!(doc2.metadata.description, Some(base_url.to_string()));
229+
assert_eq!(doc3.metadata.description, Some(base_url.to_string()));
230+
}
177231
}

0 commit comments

Comments (0)