Skip to content

Commit 79cd65d

Browse files
authored
Merge pull request #779 from Martin005/sourcepos_chars
feat: Add parse option for char-based columns in Sourcepos
2 parents 0d4a9ca + 583d092 commit 79cd65d

7 files changed

Lines changed: 994 additions & 4 deletions

File tree

fuzz/fuzz_targets/all_options.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ struct FuzzParseOptions {
134134
default_info_string: bool,
135135
broken_link_callback: bool,
136136
escaped_char_spans: bool,
137+
sourcepos_chars: bool,
137138
}
138139

139140
impl FuzzParseOptions {
@@ -159,6 +160,7 @@ impl FuzzParseOptions {
159160
None
160161
},
161162
escaped_char_spans: self.escaped_char_spans,
163+
sourcepos_chars: self.sourcepos_chars,
162164
}
163165
}
164166
}

src/main.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,11 @@ struct Cli {
167167
#[arg(long)]
168168
ignore_empty_links: bool,
169169

170+
/// Report column positions in sourcepos as a Unicode character count
171+
/// rather than UTF-8 byte offsets
172+
#[arg(long)]
173+
sourcepos_chars: bool,
174+
170175
/// Minimise escapes in CommonMark output using a trial-and-error algorithm
171176
#[arg(long)]
172177
experimental_minimize_commonmark: bool,
@@ -323,6 +328,7 @@ fn main() -> Result<(), Box<dyn Error>> {
323328
.relaxed_tasklist_matching(cli.relaxed_tasklist_character)
324329
.relaxed_autolinks(cli.relaxed_autolinks)
325330
.ignore_setext(cli.ignore_setext)
331+
.sourcepos_chars(cli.sourcepos_chars)
326332
.build();
327333

328334
let render = options::Render::builder()

src/nodes.rs

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -806,16 +806,23 @@ impl From<(usize, usize, usize, usize)> for Sourcepos {
806806

807807
/// Represents the 1-based line and column positions of a given character.
808808
///
809-
/// The `column` value is measured in UTF-8 byte offsets (1-based),
809+
/// By default, the `column` value is measured in UTF-8 byte offsets (1-based),
810810
/// matching cmark behavior. This means multi-byte UTF-8 characters
811811
/// (for example, `ö` or `好`) increase the column count by their byte
812812
/// length rather than by Rust's `char` count.
813+
///
814+
/// Enable [`parse.sourcepos_chars`][crate::options::Parse#structfield.sourcepos_chars] to have
815+
/// column values reported as a Unicode character count instead.
813816
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
814817
pub struct LineColumn {
815818
/// The 1-based line number of the character.
816819
pub line: usize,
817-
/// The 1-based column number of the character, counted as UTF-8 bytes.
818-
/// For example, a 3-byte UTF-8 character increments the column by 3.
820+
/// The 1-based column number of the character.
821+
///
822+
/// By default this is counted in UTF-8 bytes (so a 3-byte character
823+
/// increments the column by 3). Enable
824+
/// [`parse.sourcepos_chars`][crate::options::Parse#structfield.sourcepos_chars] to have
825+
/// it reported as a Unicode character count instead.
819826
pub column: usize,
820827
}
821828

src/parser/mod.rs

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,49 @@ pub fn parse_document<'a>(arena: &'a Arena<'a>, md: &str, options: &Options) ->
5353
}
5454
.into(),
5555
);
56-
Parser::new(arena, root, options).parse(md)
56+
let document = Parser::new(arena, root, options).parse(md);
57+
if options.parse.sourcepos_chars {
58+
convert_sourcepos_columns_to_chars(document, md);
59+
}
60+
document
61+
}
62+
63+
/// Convert all byte-based column values in the AST's sourcepos to char-based.
64+
fn convert_sourcepos_columns_to_chars(document: Node<'_>, md: &str) {
65+
let lines: Vec<&str> = md.lines().collect();
66+
67+
let convert = |lc: &mut nodes::LineColumn| {
68+
if lc.column == 0 {
69+
return;
70+
}
71+
if let Some(line) = lines.get(lc.line.wrapping_sub(1)) {
72+
lc.column = byte_col_to_char_col(line, lc.column);
73+
}
74+
};
75+
76+
for node in document.descendants() {
77+
let mut ast = node.data_mut();
78+
convert(&mut ast.sourcepos.start);
79+
convert(&mut ast.sourcepos.end);
80+
}
81+
}
82+
83+
/// Convert a 1-based byte column index to a 1-based char column index for the given line.
///
/// * `line` - the text of the line, without its line terminator.
/// * `byte_col` - 1-based column counted in UTF-8 bytes.
///
/// Returns the 1-based index of the character that contains the addressed
/// byte. A byte column past the end of the line maps to one past the last
/// character. A `byte_col` of `0` is not a valid 1-based column and is
/// returned unchanged as `0` rather than underflowing.
fn byte_col_to_char_col(line: &str, byte_col: usize) -> usize {
    // `checked_sub` guards the 1-based -> 0-based conversion: a column of 0
    // would otherwise underflow (panic in debug builds, wrap in release).
    let Some(byte_idx) = byte_col.checked_sub(1) else {
        return 0;
    };

    // If the byte column points past the end of the line (e.g. the position
    // after the final byte, such as softbreak/newline positions), map it to
    // the position after the last Unicode character.
    if byte_idx >= line.len() {
        return line.chars().count() + 1;
    }

    // `char_indices()` yields (byte_offset_of_char_start, char), so counting
    // the chars that start at or before `byte_idx` gives the 1-based index of
    // the char containing that byte.
    line.char_indices()
        .take_while(|&(start, _)| start <= byte_idx)
        .count()
}
58100

59101
/// Return whether the byte at the given offset passes the callback.

src/parser/options.rs

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -861,6 +861,37 @@ pub struct Parse<'c> {
861861
/// this option to be enabled.
862862
#[cfg_attr(feature = "bon", builder(default))]
863863
pub escaped_char_spans: bool,
864+
865+
/// When enabled, the [`column`][crate::nodes::LineColumn::column] values in
866+
/// [`Sourcepos`][crate::nodes::Sourcepos] are counted as Unicode characters
867+
/// (i.e. `char`s) rather than as UTF-8 bytes.
868+
///
869+
/// By default, column values follow cmark behaviour: each byte of a
870+
/// multi-byte UTF-8 character counts as a separate column. Enabling this
871+
/// option converts those byte-based columns to character-based columns after
872+
/// parsing, so that a 3-byte character such as `好` occupies only one
873+
/// column position instead of three.
874+
///
875+
/// ```rust
876+
/// # use comrak::{Arena, parse_document, Options};
877+
/// let arena = Arena::new();
878+
/// let mut options = Options::default();
879+
///
880+
/// // Default (byte-based): "好" spans columns 1-3
881+
/// let root = parse_document(&arena, "好", &options);
882+
/// let sp = root.first_child().unwrap().data().sourcepos;
883+
/// assert_eq!(sp.start.column, 1);
884+
/// assert_eq!(sp.end.column, 3);
885+
///
886+
/// // Char-based: "好" occupies only column 1
887+
/// options.parse.sourcepos_chars = true;
888+
/// let root = parse_document(&arena, "好", &options);
889+
/// let sp = root.first_child().unwrap().data().sourcepos;
890+
/// assert_eq!(sp.start.column, 1);
891+
/// assert_eq!(sp.end.column, 1);
892+
/// ```
893+
#[cfg_attr(feature = "bon", builder(default))]
894+
pub sourcepos_chars: bool,
864895
}
865896

866897
/// The type of the callback used when a reference link is encountered with no

src/tests.rs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ mod rewriter;
4040
mod shortcodes;
4141
#[path = "tests/sourcepos.rs"]
4242
mod sourcepos_;
43+
mod sourcepos_chars;
4344
mod spoiler;
4445
mod strikethrough;
4546
mod subscript;
@@ -339,6 +340,17 @@ where
339340
}
340341
}
341342

343+
/// Applies a single option override to an options value.
///
/// `target; group.field` sets `target.group.field` to `true`, while
/// `target; group.field = value` assigns `value` to it.
macro_rules! assert_ast_match_set_opt_single {
    // Bare option name: shorthand for switching the option on.
    ($target:ident; $group:ident.$field:ident) => {
        $target.$group.$field = true;
    };
    // Explicit `= value` form: assign the given expression.
    ($target:ident; $group:ident.$field:ident = $value:expr_2021) => {
        $target.$group.$field = $value;
    };
}
351+
352+
pub(crate) use assert_ast_match_set_opt_single;
353+
342354
macro_rules! assert_ast_match {
343355
([ $( $optclass:ident.$optname:ident ),* ], $( $md:literal )+, $amt:tt,) => {
344356
assert_ast_match!(
@@ -354,6 +366,13 @@ macro_rules! assert_ast_match {
354366
|#[allow(unused_variables)] opts| {$(opts.$optclass.$optname = $val;)*},
355367
);
356368
};
369+
([ $( $optclass:ident.$optname:ident $(= $val:expr_2021)? ),* ], $( $md:literal )+, $amt:tt) => {
370+
crate::tests::assert_ast_match_i(
371+
concat!( $( $md ),+ ),
372+
ast!($amt),
373+
|#[allow(unused_variables)] opts| { $( assert_ast_match_set_opt_single!(opts; $optclass.$optname $(= $val)? ); )* },
374+
);
375+
};
357376
([ $( $optclass:ident.$optname:ident ),* ], $( $md:literal )+, $amt:tt) => {
358377
assert_ast_match!(
359378
[ $( $optclass.$optname = true),* ],
@@ -397,6 +416,15 @@ impl AstMatchTree {
397416
assert_eq!(text, &ncb.literal, "CodeBlock literal should match");
398417
asserted_text = true;
399418
}
419+
NodeValue::Code(ref nc) => {
420+
assert_eq!(text, &nc.literal, "Code literal should match");
421+
asserted_text = true;
422+
}
423+
#[cfg(feature = "shortcodes")]
424+
NodeValue::ShortCode(ref nsc) => {
425+
assert_eq!(text, &nsc.code, "Shortcode code should match");
426+
asserted_text = true;
427+
}
400428
NodeValue::HtmlBlock(ref nhb) => {
401429
assert_eq!(text, &nhb.literal, "HtmlBlock literal should match");
402430
asserted_text = true;

0 commit comments

Comments
 (0)