Skip to content

Commit 1643509

Browse files
author
Ubuntu
committed
Vendor cocoindex_ops_text
1 parent 6971ed3 commit 1643509

10 files changed

Lines changed: 2211 additions & 3 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ edition = "2021"
77
name = "coco_rs"
88

99
[dependencies]
10-
# 核心引擎依赖 (本地路径)
11-
cocoindex_ops_text = { path = "../cocoindex/rust/ops_text" }
10+
# Vendored syntax-aware text chunking crate
11+
cocoindex_ops_text = { path = "vendor/cocoindex_ops_text" }
1212

1313
# 异步与 IO
1414
tokio = { version = "1.48", features = ["full"] }
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
[package]
2+
name = "cocoindex_ops_text"
3+
version = "0.1.0"
4+
edition = "2021"
5+
license = "Apache-2.0"
6+
7+
[dependencies]
8+
anyhow = { version = "1.0", features = ["std"] }
9+
globset = "0.4"
10+
regex = "1.12.2"
11+
unicase = "2.8.1"
12+
13+
tree-sitter = "0.25.10"
14+
tree-sitter-language = "0.1.7"
15+
# Per language tree-sitter parsers
16+
tree-sitter-c = "0.24.1"
17+
tree-sitter-cpp = "0.23.4"
18+
tree-sitter-c-sharp = "0.23.1"
19+
tree-sitter-css = "0.23.2"
20+
tree-sitter-fortran = "0.5.1"
21+
tree-sitter-go = "0.23.4"
22+
tree-sitter-html = "0.23.2"
23+
tree-sitter-java = "0.23.5"
24+
tree-sitter-javascript = "0.23.1"
25+
tree-sitter-json = "0.24.8"
26+
# The other more popular crate tree-sitter-kotlin requires tree-sitter < 0.23 for now
27+
tree-sitter-kotlin-ng = "1.1.0"
28+
tree-sitter-md = "0.5.3"
29+
tree-sitter-pascal = "0.10.2"
30+
tree-sitter-php = "0.23.11"
31+
tree-sitter-python = "0.23.6"
32+
tree-sitter-r = "1.2.0"
33+
tree-sitter-ruby = "0.23.1"
34+
tree-sitter-rust = "0.24.0"
35+
tree-sitter-scala = "0.24.0"
36+
tree-sitter-sequel = "0.3.11"
37+
tree-sitter-swift = "0.7.1"
38+
tree-sitter-toml-ng = "0.7.0"
39+
tree-sitter-typescript = "0.23.2"
40+
tree-sitter-xml = "0.7.0"
41+
tree-sitter-yaml = "0.7.2"
42+
tree-sitter-solidity = "1.2.13"
43+
44+
[dev-dependencies]
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
//! Extra text processing utilities for CocoIndex.
2+
//!
3+
//! This crate provides text processing functionality including:
4+
//! - Programming language detection and tree-sitter support
5+
//! - Text splitting by separators
6+
//! - Recursive text chunking with syntax awareness
7+
8+
pub(crate) mod output_positions;
9+
pub mod pattern_matcher;
10+
pub mod prog_langs;
11+
pub mod split;
Lines changed: 322 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,322 @@
1+
//! Internal module for computing output positions from byte offsets.
2+
3+
/// A half-open byte range `[start, end)` into a piece of text.
///
/// The constructor does not enforce `start <= end`. A range whose `start` is
/// at or past its `end` is treated as empty by both [`TextRange::len`] and
/// [`TextRange::is_empty`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TextRange {
    /// Start byte offset (inclusive).
    pub start: usize,
    /// End byte offset (exclusive).
    pub end: usize,
}

impl TextRange {
    /// Create a new text range covering the bytes `start..end`.
    pub fn new(start: usize, end: usize) -> Self {
        Self { start, end }
    }

    /// Get the length of the range in bytes.
    ///
    /// Returns `0` for an inverted range (`start > end`) instead of
    /// underflowing, so the result always agrees with [`TextRange::is_empty`].
    pub fn len(&self) -> usize {
        // `end - start` would panic in debug builds (and wrap in release
        // builds) when `start > end`; saturate so such a range has length 0.
        self.end.saturating_sub(self.start)
    }

    /// Check if the range is empty, i.e. `start` is at or past `end`.
    pub fn is_empty(&self) -> bool {
        self.start >= self.end
    }
}
28+
29+
/// Output position information with character offset and line/column.
///
/// All three values are counted in Unicode scalar values (`char`s), not bytes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct OutputPosition {
    /// Character (not byte) offset from the start of the text.
    pub char_offset: usize,
    /// 1-based line number; advances after each `'\n'`.
    pub line: u32,
    /// 1-based column number, counted in characters since the start of the line.
    pub column: u32,
}

/// Position tracking helper that converts byte offsets to character positions.
pub(crate) struct Position {
    /// The byte offset in the text.
    pub byte_offset: usize,
    /// Computed output position (populated by `set_output_positions`).
    pub output: Option<OutputPosition>,
}

impl Position {
    /// Create a new, not-yet-resolved position for the given byte offset.
    pub fn new(byte_offset: usize) -> Self {
        Self {
            byte_offset,
            output: None,
        }
    }
}

/// Fill in `output` for every requested byte offset.
///
/// Character offsets, line numbers and column numbers are computed for all
/// positions in a single pass over `text`. The positions may be supplied in
/// any order; they are sorted by byte offset internally.
///
/// Handling of unusual offsets:
/// - An offset that falls inside a multi-byte character resolves to the
///   position of the next character boundary. It does not prevent later
///   offsets from being resolved correctly.
/// - An offset at or beyond `text.len()` resolves to the end-of-text position.
pub(crate) fn set_output_positions<'a>(
    text: &str,
    positions: impl Iterator<Item = &'a mut Position>,
) {
    let mut positions = positions.collect::<Vec<_>>();
    // Positions with equal offsets receive identical output, so their relative
    // order is irrelevant and an unstable sort is sufficient.
    positions.sort_unstable_by_key(|p| p.byte_offset);
    let mut pending = positions.into_iter().peekable();

    let mut char_offset: usize = 0;
    let mut line: u32 = 1;
    let mut column: u32 = 1;
    for (byte_offset, ch) in text.char_indices() {
        // Nothing left to resolve: skip scanning the rest of the text.
        if pending.peek().is_none() {
            return;
        }
        // `<=` rather than `==`: an offset that is not on a char boundary would
        // otherwise never match and would stall every position behind it.
        while let Some(position) = pending.next_if(|p| p.byte_offset <= byte_offset) {
            position.output = Some(OutputPosition {
                char_offset,
                line,
                column,
            });
        }
        char_offset += 1;
        if ch == '\n' {
            line += 1;
            column = 1;
        } else {
            column += 1;
        }
    }

    // Whatever is left points at or past the end of the text.
    for position in pending {
        position.output = Some(OutputPosition {
            char_offset,
            line,
            column,
        });
    }
}
112+
113+
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for the expected `output` of a resolved position.
    fn at(char_offset: usize, line: u32, column: u32) -> Option<OutputPosition> {
        Some(OutputPosition {
            char_offset,
            line,
            column,
        })
    }

    #[test]
    fn test_set_output_positions_simple() {
        let text = "abc";
        let mut start = Position::new(0);
        let mut end = Position::new(3);

        set_output_positions(text, vec![&mut start, &mut end].into_iter());

        assert_eq!(start.output, at(0, 1, 1));
        assert_eq!(end.output, at(3, 1, 4));
    }

    #[test]
    fn test_set_output_positions_with_newlines() {
        let text = "ab\ncd\nef";
        let mut pos1 = Position::new(0);
        let mut pos2 = Position::new(3); // 'c'
        let mut pos3 = Position::new(6); // 'e'
        let mut pos4 = Position::new(8); // end

        set_output_positions(
            text,
            vec![&mut pos1, &mut pos2, &mut pos3, &mut pos4].into_iter(),
        );

        assert_eq!(pos1.output, at(0, 1, 1));
        assert_eq!(pos2.output, at(3, 2, 1));
        assert_eq!(pos3.output, at(6, 3, 1));
        assert_eq!(pos4.output, at(8, 3, 3));
    }

    #[test]
    fn test_set_output_positions_multibyte() {
        // Test with emoji (4-byte UTF-8 character)
        let text = "abc\u{1F604}def"; // abc + emoji (4 bytes) + def
        let mut start = Position::new(0);
        let mut before_emoji = Position::new(3);
        let mut after_emoji = Position::new(7); // byte position after emoji
        let mut end = Position::new(10);

        set_output_positions(
            text,
            vec![&mut start, &mut before_emoji, &mut after_emoji, &mut end].into_iter(),
        );

        assert_eq!(start.output, at(0, 1, 1));
        assert_eq!(before_emoji.output, at(3, 1, 4));
        assert_eq!(after_emoji.output, at(4, 1, 5)); // 3 chars + 1 emoji
        assert_eq!(end.output, at(7, 1, 8)); // 3 + 1 + 3
    }

    #[test]
    fn test_translate_bytes_to_chars_detailed() {
        // Comprehensive test moved from cocoindex: duplicate offsets and
        // adjacent ranges that share a boundary.
        let text = "abc\u{1F604}def";
        let mut start1 = Position::new(0);
        let mut end1 = Position::new(3);
        let mut start2 = Position::new(3);
        let mut end2 = Position::new(7);
        let mut start3 = Position::new(7);
        let mut end3 = Position::new(10);
        let mut end_full = Position::new(text.len());

        let offsets = vec![
            &mut start1,
            &mut end1,
            &mut start2,
            &mut end2,
            &mut start3,
            &mut end3,
            &mut end_full,
        ];

        set_output_positions(text, offsets.into_iter());

        assert_eq!(start1.output, at(0, 1, 1));
        assert_eq!(end1.output, at(3, 1, 4));
        assert_eq!(start2.output, at(3, 1, 4));
        assert_eq!(end2.output, at(4, 1, 5));
        // `start3` shares byte offset 7 with `end2`, so it must resolve to the
        // same position.
        assert_eq!(start3.output, at(4, 1, 5));
        assert_eq!(end3.output, at(7, 1, 8));
        assert_eq!(end_full.output, at(7, 1, 8));
    }

    #[test]
    fn test_text_range() {
        let range = TextRange::new(0, 10);
        assert_eq!(range.len(), 10);
        assert!(!range.is_empty());

        let empty = TextRange::new(5, 5);
        assert_eq!(empty.len(), 0);
        assert!(empty.is_empty());
    }
}

0 commit comments

Comments
 (0)