From 657abf14c68efe4fc49476dc78caa9c860b1baa9 Mon Sep 17 00:00:00 2001 From: DongYun Kang Date: Tue, 18 Nov 2025 11:05:07 -0500 Subject: [PATCH 01/20] check: Work on byte --- crates/swc_common/src/input.rs | 124 ++++++++------- .../swc_ecma_lexer/src/common/lexer/char.rs | 20 ++- crates/swc_ecma_lexer/src/common/lexer/mod.rs | 11 +- crates/swc_ecma_lexer/src/lexer/table.rs | 16 +- crates/swc_ecma_parser/src/lexer/char_ext.rs | 20 ++- crates/swc_ecma_parser/src/lexer/mod.rs | 143 ++++++++++-------- crates/swc_ecma_parser/src/lexer/state.rs | 73 +++++---- crates/swc_ecma_parser/src/lexer/table.rs | 18 ++- .../swc_ecma_parser/src/lexer/whitespace.rs | 32 ++-- 9 files changed, 270 insertions(+), 187 deletions(-) diff --git a/crates/swc_common/src/input.rs b/crates/swc_common/src/input.rs index 6cd4d8e35e83..86465b956932 100644 --- a/crates/swc_common/src/input.rs +++ b/crates/swc_common/src/input.rs @@ -10,8 +10,8 @@ pub type SourceFileInput<'a> = StringInput<'a>; #[derive(Clone)] pub struct StringInput<'a> { last_pos: BytePos, - /// Current cursor - iter: str::Chars<'a>, + /// Remaining input as str - we slice this as we consume bytes + remaining: &'a str, orig: &'a str, /// Original start position. orig_start: BytePos, @@ -33,7 +33,7 @@ impl<'a> StringInput<'a> { StringInput { last_pos: start, orig: src, - iter: src.chars(), + remaining: src, orig_start: start, orig_end: end, } @@ -41,7 +41,7 @@ impl<'a> StringInput<'a> { #[inline(always)] pub fn as_str(&self) -> &str { - self.iter.as_str() + self.remaining } #[inline(always)] @@ -68,21 +68,22 @@ impl<'a> StringInput<'a> { let ret = unsafe { s.get_unchecked(start_idx..end_idx) }; - self.iter = unsafe { s.get_unchecked(end_idx..) }.chars(); + self.remaining = unsafe { s.get_unchecked(end_idx..) }; ret } #[inline] pub fn bump_bytes(&mut self, n: usize) { - let s = self.iter.as_str(); - self.iter = unsafe { s.get_unchecked(n..) }.chars(); + debug_assert!(n <= self.remaining.len()); + self.remaining = unsafe { self.remaining.get_unchecked(n..) }; self.last_pos.0 += n as u32; } #[inline] pub fn bump_one(&mut self) { - if self.iter.next().is_some() { + if !self.remaining.is_empty() { + self.remaining = unsafe { self.remaining.get_unchecked(1..) }; self.last_pos.0 += 1; } else { unsafe { @@ -114,41 +115,49 @@ impl<'a> From<&'a SourceFile> for StringInput<'a> { impl<'a> Input<'a> for StringInput<'a> { #[inline] - fn cur(&self) -> Option { - self.iter.clone().next() + fn cur(&self) -> Option { + self.remaining.as_bytes().first().copied() } #[inline] - fn peek(&self) -> Option { - let mut iter = self.iter.clone(); - // https://github.com/rust-lang/rust/blob/1.86.0/compiler/rustc_lexer/src/cursor.rs#L56 say `next` is faster. - iter.next(); - iter.next() + fn peek(&self) -> Option { + self.remaining.as_bytes().get(1).copied() } #[inline] - fn peek_ahead(&self) -> Option { - let mut iter = self.iter.clone(); - // https://github.com/rust-lang/rust/blob/1.86.0/compiler/rustc_lexer/src/cursor.rs#L56 say `next` is faster - iter.next(); - iter.next(); - iter.next() + fn peek_ahead(&self) -> Option { + self.remaining.as_bytes().get(2).copied() } #[inline] unsafe fn bump(&mut self) { - if let Some(c) = self.iter.next() { - self.last_pos = self.last_pos + BytePos((c.len_utf8()) as u32); - } else { + let bytes = self.remaining.as_bytes(); + if bytes.is_empty() { unsafe { debug_unreachable!("bump should not be called when cur() == None"); } } + + let first_byte = unsafe { *bytes.get_unchecked(0) }; + + // Calculate the number of bytes in this UTF-8 character + let len = if first_byte < 0x80 { + 1 // ASCII + } else if first_byte < 0xe0 { + 2 // 2-byte UTF-8 + } else if first_byte < 0xf0 { + 3 // 3-byte UTF-8 + } else { + 4 // 4-byte UTF-8 + }; + + self.remaining = unsafe { self.remaining.get_unchecked(len..) }; + self.last_pos = self.last_pos + BytePos(len as u32); } #[inline] fn cur_as_ascii(&self) -> Option { - let first_byte = *self.as_str().as_bytes().first()?; + let first_byte = *self.remaining.as_bytes().first()?; if first_byte <= 0x7f { Some(first_byte) } else { @@ -156,6 +165,11 @@ impl<'a> Input<'a> for StringInput<'a> { } } + #[inline] + fn cur_as_char(&self) -> Option { + self.remaining.chars().next() + } + #[inline] fn is_at_start(&self) -> bool { self.orig_start == self.last_pos @@ -184,7 +198,7 @@ impl<'a> Input<'a> for StringInput<'a> { let ret = unsafe { s.get_unchecked(start_idx..end_idx) }; - self.iter = unsafe { s.get_unchecked(end_idx..) }.chars(); + self.remaining = unsafe { s.get_unchecked(end_idx..) }; self.last_pos = end; ret @@ -197,7 +211,7 @@ impl<'a> Input<'a> for StringInput<'a> { { let last = { let mut last = 0; - for c in self.iter.clone() { + for c in self.remaining.chars() { if pred(c) { last += c.len_utf8(); } else { @@ -207,12 +221,11 @@ impl<'a> Input<'a> for StringInput<'a> { last }; - let s = self.iter.as_str(); - debug_assert!(last <= s.len()); - let ret = unsafe { s.get_unchecked(..last) }; + debug_assert!(last <= self.remaining.len()); + let ret = unsafe { self.remaining.get_unchecked(..last) }; self.last_pos = self.last_pos + BytePos(last as _); - self.iter = unsafe { s.get_unchecked(last..) }.chars(); + self.remaining = unsafe { self.remaining.get_unchecked(last..) }; ret } @@ -228,15 +241,13 @@ impl<'a> Input<'a> for StringInput<'a> { let idx = (to - self.orig_start).0 as usize; debug_assert!(idx <= orig.len()); - let s = unsafe { orig.get_unchecked(idx..) }; - self.iter = s.chars(); + self.remaining = unsafe { orig.get_unchecked(idx..) }; self.last_pos = to; } #[inline] fn is_byte(&self, c: u8) -> bool { - self.iter - .as_str() + self.remaining .as_bytes() .first() .map(|b| *b == c) @@ -245,13 +256,13 @@ impl<'a> Input<'a> for StringInput<'a> { #[inline] fn is_str(&self, s: &str) -> bool { - self.as_str().starts_with(s) + self.remaining.starts_with(s) } #[inline] fn eat_byte(&mut self, c: u8) -> bool { if self.is_byte(c) { - self.iter.next(); + self.remaining = unsafe { self.remaining.get_unchecked(1..) }; self.last_pos = self.last_pos + BytePos(1_u32); true } else { @@ -261,9 +272,14 @@ impl<'a> Input<'a> for StringInput<'a> { } pub trait Input<'a>: Clone { - fn cur(&self) -> Option; - fn peek(&self) -> Option; - fn peek_ahead(&self) -> Option; + /// Returns the current byte. Returns [None] if at end of input. + fn cur(&self) -> Option; + + /// Returns the next byte without consuming the current byte. + fn peek(&self) -> Option; + + /// Returns the byte after the next byte without consuming anything. + fn peek_ahead(&self) -> Option; /// # Safety /// @@ -271,18 +287,20 @@ pub trait Input<'a>: Clone { /// when the Input is not empty. unsafe fn bump(&mut self); - /// Returns [None] if it's end of input **or** current character is not an - /// ascii character. + /// Returns the current byte as ASCII if it's valid ASCII (0x00-0x7F). + /// Returns [None] if it's end of input or if the byte is not ASCII. #[inline] fn cur_as_ascii(&self) -> Option { - self.cur().and_then(|i| { - if i.is_ascii() { - return Some(i as u8); - } - None - }) + self.cur() + .and_then(|b| if b <= 0x7f { Some(b) } else { None }) } + /// Returns the current position as a UTF-8 char for cases where we need + /// full character processing (identifiers, strings, etc). + /// Returns [None] if at end of input or if the bytes don't form valid + /// UTF-8. + fn cur_as_char(&self) -> Option; + fn is_at_start(&self) -> bool; fn cur_pos(&self) -> BytePos; @@ -306,16 +324,12 @@ pub trait Input<'a>: Clone { /// - `to` be in the valid range of input. unsafe fn reset_to(&mut self, to: BytePos); - /// Implementors can override the method to make it faster. - /// - /// `c` must be ASCII. + /// Check if the current byte equals the given byte. + /// `c` should typically be an ASCII byte for performance. #[inline] #[allow(clippy::wrong_self_convention)] fn is_byte(&self, c: u8) -> bool { - match self.cur() { - Some(ch) => ch == c as char, - _ => false, - } + self.cur() == Some(c) } /// Implementors can override the method to make it faster. diff --git a/crates/swc_ecma_lexer/src/common/lexer/char.rs b/crates/swc_ecma_lexer/src/common/lexer/char.rs index 705a3fd05f70..62f4e4d08a49 100644 --- a/crates/swc_ecma_lexer/src/common/lexer/char.rs +++ b/crates/swc_ecma_lexer/src/common/lexer/char.rs @@ -1,8 +1,8 @@ -/// Implemented for `char`. +/// Implemented for `u8` - operates on bytes for performance. pub trait CharExt: Copy { fn to_char(self) -> Option; - /// Test whether a given character code starts an identifier. + /// Test whether a given byte/character starts an identifier. /// /// https://tc39.github.io/ecma262/#prod-IdentifierStart #[inline] @@ -14,7 +14,7 @@ pub trait CharExt: Copy { swc_ecma_ast::Ident::is_valid_start(c) } - /// Test whether a given character is part of an identifier. + /// Test whether a given byte/character is part of an identifier. #[inline] fn is_ident_part(self) -> bool { let c = match self.to_char() { @@ -65,6 +65,20 @@ pub trait CharExt: Copy { } } +impl CharExt for u8 { + #[inline(always)] + fn to_char(self) -> Option { + // For ASCII bytes, this is a fast path + if self <= 0x7f { + Some(self as char) + } else { + // For non-ASCII bytes, we can't convert a single byte to a char + // The caller should use cur_as_char() on the Input trait instead + None + } + } +} + impl CharExt for char { #[inline(always)] fn to_char(self) -> Option { diff --git a/crates/swc_ecma_lexer/src/common/lexer/mod.rs b/crates/swc_ecma_lexer/src/common/lexer/mod.rs index 852877860f3f..a81ee56cd384 100644 --- a/crates/swc_ecma_lexer/src/common/lexer/mod.rs +++ b/crates/swc_ecma_lexer/src/common/lexer/mod.rs @@ -175,20 +175,25 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { } #[inline(always)] - fn cur(&self) -> Option { + fn cur(&self) -> Option { self.input().cur() } #[inline(always)] - fn peek(&self) -> Option { + fn peek(&self) -> Option { self.input().peek() } #[inline(always)] - fn peek_ahead(&self) -> Option { + fn peek_ahead(&self) -> Option { self.input().peek_ahead() } + #[inline(always)] + fn cur_as_char(&self) -> Option { + self.input().cur_as_char() + } + #[inline(always)] fn cur_pos(&self) -> BytePos { self.input().cur_pos() diff --git a/crates/swc_ecma_lexer/src/lexer/table.rs b/crates/swc_ecma_lexer/src/lexer/table.rs index 798c5194a371..f6c347bba074 100644 --- a/crates/swc_ecma_lexer/src/lexer/table.rs +++ b/crates/swc_ecma_lexer/src/lexer/table.rs @@ -48,13 +48,14 @@ const EOF: ByteHandler = Some(|lexer| { const ERR: ByteHandler = Some(|lexer| { let c = unsafe { - // Safety: Byte handler is only called for non-last chracters - lexer.input.cur().unwrap_unchecked() + // Safety: Byte handler is only called for non-last characters + // Get the char representation for error messages + lexer.cur_as_char().unwrap_unchecked() }; let start = lexer.cur_pos(); unsafe { - // Safety: Byte handler is only called for non-last chracters + // Safety: Byte handler is only called for non-last characters lexer.input.bump(); } lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })? @@ -361,11 +362,12 @@ const DIG: ByteHandler = Some(|lexer| { /// String literals with `'` or `"` const QOT: ByteHandler = Some(|lexer| lexer.read_str_lit()); -/// Unicode +/// Unicode - handles multi-byte UTF-8 sequences const UNI: ByteHandler = Some(|lexer| { let c = unsafe { - // Safety: Byte handler is only called for non-last chracters - lexer.input.cur().unwrap_unchecked() + // Safety: Byte handler is only called for non-last characters + // For non-ASCII bytes, we need the full char + lexer.cur_as_char().unwrap_unchecked() }; // Identifier or keyword. '\uXXXX' sequences are allowed in @@ -376,7 +378,7 @@ const UNI: ByteHandler = Some(|lexer| { let start = lexer.cur_pos(); unsafe { - // Safety: Byte handler is only called for non-last chracters + // Safety: Byte handler is only called for non-last characters lexer.input.bump(); } lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })? diff --git a/crates/swc_ecma_parser/src/lexer/char_ext.rs b/crates/swc_ecma_parser/src/lexer/char_ext.rs index 8f7dc37c2573..873dc36f94d0 100644 --- a/crates/swc_ecma_parser/src/lexer/char_ext.rs +++ b/crates/swc_ecma_parser/src/lexer/char_ext.rs @@ -1,8 +1,8 @@ -/// Implemented for `char`. +/// Implemented for `u8` and `char` - operates on bytes for performance. pub trait CharExt: Copy { fn to_char(self) -> Option; - /// Test whether a given character code starts an identifier. + /// Test whether a given byte/character starts an identifier. /// /// https://tc39.github.io/ecma262/#prod-IdentifierStart #[inline] @@ -14,7 +14,7 @@ pub trait CharExt: Copy { swc_ecma_ast::Ident::is_valid_start(c) } - /// Test whether a given character is part of an identifier. + /// Test whether a given byte/character is part of an identifier. #[inline] fn is_ident_part(self) -> bool { let c = match self.to_char() { @@ -35,6 +35,20 @@ pub trait CharExt: Copy { } } +impl CharExt for u8 { + #[inline(always)] + fn to_char(self) -> Option { + // For ASCII bytes, this is a fast path + if self <= 0x7f { + Some(self as char) + } else { + // For non-ASCII bytes, we can't convert a single byte to a char + // The caller should use cur_as_char() on the Input trait instead + None + } + } +} + impl CharExt for char { #[inline(always)] fn to_char(self) -> Option { diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index ec2ca65313cb..a802304c98eb 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -250,7 +250,7 @@ impl<'a> Lexer<'a> { } // '++', '--' - Ok(if self.input.cur() == Some(C as char) { + Ok(if self.input.cur() == Some(C) { unsafe { // Safety: cur() is Some(c) self.input.bump(); @@ -344,7 +344,10 @@ impl Lexer<'_> { } // XML style comment. `")); } // U+0021 EXCLAMATION MARK (!) // Switch to the comment end bang state. - Some('!') => { + Some(b'!') => { self.state = State::CommentEndBang; } // U+002D HYPHEN-MINUS (-) // Append a U+002D HYPHEN-MINUS character (-) to the comment token's data. - Some(c @ '-') => { - self.append_to_comment_token(c, c); + Some(c @ b'-') => { + self.append_to_comment_token(c as char, c as char); } // EOF // This is an eof-in-comment parse error. Emit the current comment token. @@ -3275,16 +3276,16 @@ where // Append two U+002D HYPHEN-MINUS characters (-) and a U+0021 EXCLAMATION // MARK character (!) to the comment token's data. Switch to the comment end // dash state. - Some(c @ '-') => { - self.append_to_comment_token(c, c); - self.append_to_comment_token('-', '-'); + Some(c @ b'-') => { + self.append_to_comment_token(c as char, c as char); + self.append_to_comment_token(c as char, c as char); self.append_to_comment_token('!', '!'); self.state = State::CommentEndDash; } // U+003E GREATER-THAN SIGN (>) // This is an incorrectly-closed-comment parse error. Switch to the data // state. Emit the current comment token. - Some('>') => { + Some(b'>') => { self.emit_error(ErrorKind::IncorrectlyClosedComment); self.state = State::Data; self.emit_comment_token(Some(">")); @@ -3326,7 +3327,7 @@ where } // U+003E GREATER-THAN SIGN (>) // Reconsume in the before DOCTYPE name state. - Some('>') => { + Some(b'>') => { self.reconsume_in_state(State::BeforeDoctypeName); } // EOF @@ -3370,14 +3371,14 @@ where Some(c) if is_ascii_upper_alpha(c) => { self.append_raw_to_doctype_token(c); self.create_doctype_token(); - self.set_doctype_token_name(c.to_ascii_lowercase()); + self.set_doctype_token_name(c.to_ascii_lowercase() as char); self.state = State::DoctypeName; } // U+0000 NULL // This is an unexpected-null-character parse error. Create a new DOCTYPE // token. Set the token's name to a U+FFFD REPLACEMENT CHARACTER character. // Switch to the DOCTYPE name state. - Some(c @ '\x00') => { + Some(c @ b'\x00') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::UnexpectedNullCharacter); self.create_doctype_token(); @@ -3388,7 +3389,7 @@ where // This is a missing-doctype-name parse error. Create a new DOCTYPE token. // Set its force-quirks flag to on. Switch to the data state. Emit the // current token. - Some(c @ '>') => { + Some(c @ b'>') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::MissingDoctypeName); self.create_doctype_token(); @@ -3416,7 +3417,7 @@ where self.validate_input_stream_character(c); self.append_raw_to_doctype_token(c); self.create_doctype_token(); - self.set_doctype_token_name(c); + self.set_doctype_token_name(c as char); self.state = State::DoctypeName; } } @@ -3437,7 +3438,7 @@ where } // U+003E GREATER-THAN SIGN (>) // Switch to the data state. Emit the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.append_raw_to_doctype_token(c); self.finish_doctype_token_name(); self.state = State::Data; @@ -3447,15 +3448,15 @@ where // Append the lowercase version of the current input character (add 0x0020 // to the character's code point) to the current DOCTYPE token's name. Some(c) if is_ascii_upper_alpha(c) => { - self.consume_and_append_to_doctype_token_name(c, is_ascii_upper_alpha); + self.consume_and_append_to_doctype_token_name(c, is_ascii_upper_alpha_char); } // U+0000 NULL // This is an unexpected-null-character parse error. Append a U+FFFD // REPLACEMENT CHARACTER character to the current DOCTYPE token's name. - Some(c @ '\x00') => { + Some(c @ b'\x00') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::UnexpectedNullCharacter); - self.append_to_doctype_token(Some(REPLACEMENT_CHARACTER), None, None); + self.append_to_doctype_token(Some(REPLACEMENT_CHARACTER as u8), None, None); } // EOF // This is an eof-in-doctype parse error. Set the current DOCTYPE token's @@ -3474,12 +3475,14 @@ where // Append the current input character to the current DOCTYPE token's name. Some(c) => { self.validate_input_stream_character(c); - self.consume_and_append_to_doctype_token_name(c, |c| { - if !is_allowed_character(c) { + self.consume_and_append_to_doctype_token_name(c, |ch| { + if !is_allowed_character(ch) { return false; } - !is_spacy(c) && !matches!(c, '>' | '\x00') && !is_ascii_upper_alpha(c) + !is_spacy_char(ch) + && !matches!(ch, '>' | '\x00') + && !is_ascii_upper_alpha_char(ch) }); } } @@ -3500,7 +3503,7 @@ where } // U+003E GREATER-THAN SIGN (>) // Switch to the data state. Emit the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.append_raw_to_doctype_token(c); self.state = State::Data; self.emit_doctype_token(); @@ -3534,12 +3537,12 @@ where let b = self.buf.clone(); let mut buf = b.borrow_mut(); - buf.push(c); + buf.push(c as char); for _ in 0..5 { match self.consume_next_char() { Some(c) => { - buf.push(c); + buf.push(c as char); } _ => { break; @@ -3602,7 +3605,7 @@ where // Set the current DOCTYPE token's public identifier to the empty string // (not missing), then switch to the DOCTYPE public identifier // (double-quoted) state. - Some(c @ '"') => { + Some(c @ b'"') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypePublicKeyword); self.set_doctype_token_public_id(); @@ -3613,7 +3616,7 @@ where // Set the current DOCTYPE token's public identifier to the empty string // (not missing), then switch to the DOCTYPE public identifier // (single-quoted) state. - Some(c @ '\'') => { + Some(c @ b'\'') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypePublicKeyword); self.set_doctype_token_public_id(); @@ -3623,7 +3626,7 @@ where // This is a missing-doctype-public-identifier parse error. Set the current // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit // the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::MissingDoctypePublicIdentifier); self.set_doctype_token_force_quirks(); @@ -3669,7 +3672,7 @@ where // Set the current DOCTYPE token's public identifier to the empty string // (not missing), then switch to the DOCTYPE public identifier // (double-quoted) state. - Some(c @ '"') => { + Some(c @ b'"') => { self.append_raw_to_doctype_token(c); self.set_doctype_token_public_id(); self.state = State::DoctypePublicIdentifierDoubleQuoted; @@ -3678,7 +3681,7 @@ where // Set the current DOCTYPE token's public identifier to the empty string // (not missing), then switch to the DOCTYPE public identifier // (single-quoted) state. - Some(c @ '\'') => { + Some(c @ b'\'') => { self.append_raw_to_doctype_token(c); self.set_doctype_token_public_id(); self.state = State::DoctypePublicIdentifierSingleQuoted; @@ -3687,7 +3690,7 @@ where // This is a missing-doctype-public-identifier parse error. Set the current // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit // the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::MissingDoctypePublicIdentifier); self.set_doctype_token_force_quirks(); @@ -3723,7 +3726,7 @@ where match self.consume_next_char() { // U+0022 QUOTATION MARK (") // Switch to the after DOCTYPE public identifier state. - Some(c @ '"') => { + Some(c @ b'"') => { self.append_raw_to_doctype_token(c); self.finish_doctype_token_public_id(); self.state = State::AfterDoctypePublicIdentifier; @@ -3732,16 +3735,16 @@ where // This is an unexpected-null-character parse error. Append a U+FFFD // REPLACEMENT CHARACTER character to the current DOCTYPE token's public // identifier. - Some(c @ '\x00') => { + Some(c @ b'\x00') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::UnexpectedNullCharacter); - self.append_to_doctype_token(None, Some(REPLACEMENT_CHARACTER), None); + self.append_to_doctype_token(None, Some(REPLACEMENT_CHARACTER as u8), None); } // U+003E GREATER-THAN SIGN (>) // This is an abrupt-doctype-public-identifier parse error. Set the current // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit // the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.append_raw_to_doctype_token(c); self.finish_doctype_token_public_id(); self.emit_error(ErrorKind::AbruptDoctypePublicIdentifier); @@ -3767,12 +3770,12 @@ where // identifier. Some(c) => { self.validate_input_stream_character(c); - self.consume_and_append_to_doctype_token_public_id(c, |c| { - if !is_allowed_character(c) { + self.consume_and_append_to_doctype_token_public_id(c, |ch| { + if !is_allowed_character(ch) { return false; } - !matches!(c, '"' | '\x00' | '>' | '\r') + !matches!(ch, '"' | '\x00' | '>' | '\r') }); } } @@ -3783,7 +3786,7 @@ where match self.consume_next_char() { // U+0027 APOSTROPHE (') // Switch to the after DOCTYPE public identifier state. - Some(c @ '\'') => { + Some(c @ b'\'') => { self.finish_doctype_token_public_id(); self.append_raw_to_doctype_token(c); self.state = State::AfterDoctypePublicIdentifier; @@ -3792,16 +3795,16 @@ where // This is an unexpected-null-character parse error. Append a U+FFFD // REPLACEMENT CHARACTER character to the current DOCTYPE token's public // identifier. - Some(c @ '\x00') => { + Some(c @ b'\x00') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::UnexpectedNullCharacter); - self.append_to_doctype_token(None, Some(REPLACEMENT_CHARACTER), None); + self.append_to_doctype_token(None, Some(REPLACEMENT_CHARACTER as u8), None); } // U+003E GREATER-THAN SIGN (>) // This is an abrupt-doctype-public-identifier parse error. Set the current // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit // the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.finish_doctype_token_public_id(); self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::AbruptDoctypePublicIdentifier); @@ -3827,12 +3830,12 @@ where // identifier. Some(c) => { self.validate_input_stream_character(c); - self.consume_and_append_to_doctype_token_public_id(c, |c| { - if !is_allowed_character(c) { + self.consume_and_append_to_doctype_token_public_id(c, |ch| { + if !is_allowed_character(ch) { return false; } - !matches!(c, '\'' | '\x00' | '>' | '\r') + !matches!(ch, '\'' | '\x00' | '>' | '\r') }); } } @@ -3852,7 +3855,7 @@ where } // U+003E GREATER-THAN SIGN (>) // Switch to the data state. Emit the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.append_raw_to_doctype_token(c); self.state = State::Data; self.emit_doctype_token(); @@ -3862,7 +3865,7 @@ where // parse error. Set the current DOCTYPE token's system // identifier to the empty string (not missing), then switch // to the DOCTYPE system identifier (double-quoted) state. - Some(c @ '"') => { + Some(c @ b'"') => { self.append_raw_to_doctype_token(c); self.emit_error( ErrorKind::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers, @@ -3875,7 +3878,7 @@ where // parse error. Set the current DOCTYPE token's system // identifier to the empty string (not missing), then switch // to the DOCTYPE system identifier (single-quoted) state. - Some(c @ '\'') => { + Some(c @ b'\'') => { self.append_raw_to_doctype_token(c); self.emit_error( ErrorKind::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers, @@ -3920,7 +3923,7 @@ where } // U+003E GREATER-THAN SIGN (>) // Switch to the data state. Emit the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.append_raw_to_doctype_token(c); self.state = State::Data; self.emit_doctype_token(); @@ -3929,7 +3932,7 @@ where // Set the current DOCTYPE token's system identifier to the empty string // (not missing), then switch to the DOCTYPE system identifier // (double-quoted) state. - Some(c @ '"') => { + Some(c @ b'"') => { self.append_raw_to_doctype_token(c); self.set_doctype_token_system_id(); self.state = State::DoctypeSystemIdentifierDoubleQuoted; @@ -3938,7 +3941,7 @@ where // Set the current DOCTYPE token's system identifier to the empty string // (not missing), then switch to the DOCTYPE system identifier // (single-quoted) state. - Some(c @ '\'') => { + Some(c @ b'\'') => { self.append_raw_to_doctype_token(c); self.set_doctype_token_system_id(); self.state = State::DoctypeSystemIdentifierSingleQuoted; @@ -3984,7 +3987,7 @@ where // Set the current DOCTYPE token's system identifier to the empty string // (not missing), then switch to the DOCTYPE system identifier // (double-quoted) state. - Some(c @ '"') => { + Some(c @ b'"') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypeSystemKeyword); self.set_doctype_token_system_id(); @@ -3995,7 +3998,7 @@ where // Set the current DOCTYPE token's system identifier to the empty string // (not missing), then switch to the DOCTYPE system identifier // (single-quoted) state. - Some(c @ '\'') => { + Some(c @ b'\'') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypeSystemKeyword); self.set_doctype_token_system_id(); @@ -4005,7 +4008,7 @@ where // This is a missing-doctype-system-identifier parse error. Set the current // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit // the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::MissingDoctypeSystemIdentifier); self.set_doctype_token_force_quirks(); @@ -4051,7 +4054,7 @@ where // Set the current DOCTYPE token's system identifier to the empty string // (not missing), then switch to the DOCTYPE system identifier // (double-quoted) state. - Some(c @ '"') => { + Some(c @ b'"') => { self.append_raw_to_doctype_token(c); self.set_doctype_token_system_id(); self.state = State::DoctypeSystemIdentifierDoubleQuoted; @@ -4060,7 +4063,7 @@ where // Set the current DOCTYPE token's system identifier to the empty string // (not missing), then switch to the DOCTYPE system identifier // (single-quoted) state. - Some(c @ '\'') => { + Some(c @ b'\'') => { self.append_raw_to_doctype_token(c); self.set_doctype_token_system_id(); self.state = State::DoctypeSystemIdentifierSingleQuoted; @@ -4069,7 +4072,7 @@ where // This is a missing-doctype-system-identifier parse error. Set the current // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit // the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::EofInDoctype); self.set_doctype_token_force_quirks(); @@ -4105,7 +4108,7 @@ where match self.consume_next_char() { // U+0027 APOSTROPHE (') // Switch to the after DOCTYPE system identifier state. - Some(c @ '"') => { + Some(c @ b'"') => { self.finish_doctype_token_system_id(); self.append_raw_to_doctype_token(c); self.state = State::AfterDoctypeSystemIdentifier; @@ -4114,16 +4117,16 @@ where // This is an unexpected-null-character parse error. Append a U+FFFD // REPLACEMENT CHARACTER character to the current DOCTYPE token's system // identifier. - Some(c @ '\x00') => { + Some(c @ b'\x00') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::UnexpectedNullCharacter); - self.append_to_doctype_token(None, None, Some(REPLACEMENT_CHARACTER)); + self.append_to_doctype_token(None, None, Some(REPLACEMENT_CHARACTER as u8)); } // U+003E GREATER-THAN SIGN (>) // This is an abrupt-doctype-system-identifier parse error. Set the current // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit // the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.finish_doctype_token_system_id(); self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::AbruptDoctypeSystemIdentifier); @@ -4149,12 +4152,12 @@ where // identifier. Some(c) => { self.validate_input_stream_character(c); - self.consume_and_append_to_doctype_token_system_id(c, |c| { - if !is_allowed_character(c) { + self.consume_and_append_to_doctype_token_system_id(c, |ch| { + if !is_allowed_character(ch) { return false; } - !matches!(c, '"' | '\x00' | '>' | '\r') + !matches!(ch, '"' | '\x00' | '>' | '\r') }); } } @@ -4165,7 +4168,7 @@ where match self.consume_next_char() { // U+0027 APOSTROPHE (') // Switch to the after DOCTYPE system identifier state. - Some(c @ '\'') => { + Some(c @ b'\'') => { self.finish_doctype_token_system_id(); self.append_raw_to_doctype_token(c); self.state = State::AfterDoctypeSystemIdentifier; @@ -4174,16 +4177,16 @@ where // This is an unexpected-null-character parse error. Append a U+FFFD // REPLACEMENT CHARACTER character to the current DOCTYPE token's system // identifier. - Some(c @ '\x00') => { + Some(c @ b'\x00') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::UnexpectedNullCharacter); - self.append_to_doctype_token(None, None, Some(REPLACEMENT_CHARACTER)); + self.append_to_doctype_token(None, None, Some(REPLACEMENT_CHARACTER as u8)); } // U+003E GREATER-THAN SIGN (>) // This is an abrupt-doctype-system-identifier parse error. Set the current // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit // the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.finish_doctype_token_system_id(); self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::AbruptDoctypeSystemIdentifier); @@ -4209,12 +4212,12 @@ where // identifier. Some(c) => { self.validate_input_stream_character(c); - self.consume_and_append_to_doctype_token_system_id(c, |c| { - if !is_allowed_character(c) { + self.consume_and_append_to_doctype_token_system_id(c, |ch| { + if !is_allowed_character(ch) { return false; } - !matches!(c, '\'' | '\x00' | '>' | '\r') + !matches!(ch, '\'' | '\x00' | '>' | '\r') }); } } @@ -4233,7 +4236,7 @@ where } // U+003E GREATER-THAN SIGN (>) // Switch to the data state. Emit the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.append_raw_to_doctype_token(c); self.state = State::Data; self.emit_doctype_token(); @@ -4266,14 +4269,14 @@ where match self.consume_next_char() { // U+003E GREATER-THAN SIGN (>) // Switch to the data state. Emit the DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.append_raw_to_doctype_token(c); self.state = State::Data; self.emit_doctype_token(); } // U+0000 NULL // This is an unexpected-null-character parse error. Ignore the character. - Some(c @ '\x00') => { + Some(c @ b'\x00') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::UnexpectedNullCharacter); } @@ -4299,7 +4302,7 @@ where match self.consume_next_char() { // U+005D RIGHT SQUARE BRACKET (]) // Switch to the CDATA section bracket state. - Some(']') => { + Some(b']') => { self.state = State::CdataSectionBracket; } // EOF @@ -4324,14 +4327,14 @@ where match self.consume_next_char() { // U+005D RIGHT SQUARE BRACKET (]) // Switch to the CDATA section end state. - Some(']') => { + Some(b']') => { self.state = State::CdataSectionEnd; } // Anything else // Emit a U+005D RIGHT SQUARE BRACKET character token. Reconsume in the // CDATA section state. _ => { - self.emit_character_token(']'); + self.emit_character_token(b']'); self.reconsume_in_state(State::CdataSection); } } @@ -4342,20 +4345,20 @@ where match self.consume_next_char() { // U+005D RIGHT SQUARE BRACKET (]) // Emit a U+005D RIGHT SQUARE BRACKET character token. - Some(c @ ']') => { + Some(c @ b']') => { self.emit_character_token_with_raw(']', c); } // U+003E GREATER-THAN SIGN character // Switch to the data state. - Some('>') => { + Some(b'>') => { self.state = State::Data; } // Anything else // Emit two U+005D RIGHT SQUARE BRACKET character tokens. Reconsume in the // CDATA section state. _ => { - self.emit_character_token(']'); - self.emit_character_token(']'); + self.emit_character_token(b']'); + self.emit_character_token(b']'); self.reconsume_in_state(State::CdataSection); } } @@ -4377,8 +4380,8 @@ where // U+0023 NUMBER SIGN (#) // Append the current input character to the temporary buffer. Switch to the // numeric character reference state. - Some(c @ '#') => { - self.temporary_buffer.push(c); + Some(c @ b'#') => { + self.temporary_buffer.push(c as char); self.state = State::NumericCharacterReference; } // Anything else @@ -4409,7 +4412,7 @@ where // No need to validate input, because we reset position if nothing was found while let Some(c) = &self.consume_next_char() { - entity_temporary_buffer.push(*c); + entity_temporary_buffer.push(*c as char); if let Some(found_entity) = HTML_ENTITIES.get(&entity_temporary_buffer) { entity = Some(found_entity); @@ -4448,7 +4451,7 @@ where match entity { Some(entity) => { let is_next_equals_sign_or_ascii_alphanumeric = match self.next() { - Some('=') => true, + Some(b'=') => true, Some(c) if c.is_ascii_alphanumeric() => true, _ => false, }; @@ -4514,7 +4517,7 @@ where // Otherwise, emit the current input character as a character token. Some(c) if c.is_ascii_alphanumeric() => { if self.is_consumed_as_part_of_an_attribute() { - self.append_to_attribute_token_value(Some(c), Some(c)); + self.append_to_attribute_token_value(Some(c as char), Some(c as char)); } else { self.emit_character_token(c); } @@ -4522,7 +4525,7 @@ where // U+003B SEMICOLON (;) // This is an unknown-named-character-reference parse error. Reconsume in // the return state. - Some(';') => { + Some(b';') => { self.emit_error(ErrorKind::UnknownNamedCharacterReference); self.reconsume_in_state(self.return_state.clone()); } @@ -4543,8 +4546,8 @@ where // U+0058 LATIN CAPITAL LETTER X // Append the current input character to the temporary buffer. Switch to the // hexadecimal character reference start state. - Some(c @ 'x' | c @ 'X') => { - self.temporary_buffer.push(c); + Some(c @ b'x' | c @ b'X') => { + self.temporary_buffer.push(c as char); self.state = State::HexademicalCharacterReferenceStart; } // Anything else @@ -4604,7 +4607,7 @@ where // to the character reference code. Some(c) if c.is_ascii_digit() => match &mut self.character_reference_code { Some(character_reference_code) => { - character_reference_code.push((16, c as u32 - 0x30, Some(c))); + character_reference_code.push((16, c as u32 - 0x30, Some(c as char))); } _ => { unreachable!(); @@ -4616,7 +4619,7 @@ where // character's code point) to the character reference code. Some(c) if is_upper_hex_digit(c) => match &mut self.character_reference_code { Some(character_reference_code) => { - character_reference_code.push((16, c as u32 - 0x37, Some(c))); + character_reference_code.push((16, c as u32 - 0x37, Some(c as char))); } _ => { unreachable!(); @@ -4628,7 +4631,7 @@ where // character's code point) to the character reference code. Some(c) if is_lower_hex_digit(c) => match &mut self.character_reference_code { Some(character_reference_code) => { - character_reference_code.push((16, c as u32 - 0x57, Some(c))); + character_reference_code.push((16, c as u32 - 0x57, Some(c as char))); } _ => { unreachable!(); @@ -4636,7 +4639,7 @@ where }, // U+003B SEMICOLON // Switch to the numeric character reference end state. - Some(';') => { + Some(b';') => { self.state = State::NumericCharacterReferenceEnd; } // Anything else @@ -4658,7 +4661,7 @@ where // to the character reference code. Some(c) if c.is_ascii_digit() => match &mut self.character_reference_code { Some(character_reference_code) => { - character_reference_code.push((10, c as u32 - 0x30, Some(c))); + character_reference_code.push((10, c as u32 - 0x30, Some(c as char))); } _ => { unreachable!(); @@ -4666,7 +4669,7 @@ where }, // U+003B SEMICOLON // Switch to the numeric character reference end state. - Some(';') => self.state = State::NumericCharacterReferenceEnd, + Some(b';') => self.state = State::NumericCharacterReferenceEnd, // Anything else // This is a missing-semicolon-after-character-reference parse error. // Reconsume in the numeric character reference end state. @@ -4832,7 +4835,7 @@ where raw.push_str(&old_temporary_buffer); raw.push_str(&raw_char_ref); - if self.cur == Some(';') { + if self.cur == Some(b';') { raw.push(';'); } @@ -4855,8 +4858,8 @@ where } #[inline(always)] - fn skip_whitespaces(&mut self, c: char) { - if c == '\r' && self.input.cur() == Some('\n') { + fn skip_whitespaces(&mut self, c: u8) { + if c == b'\r' && self.input.cur() == Some(b'\n') { unsafe { // Safety: cur() is Some self.input.bump(); @@ -4868,8 +4871,13 @@ where // By spec '\r` removed before tokenizer, but we keep them to have better AST // and don't break logic to ignore characters #[inline(always)] -fn is_spacy(c: char) -> bool { - matches!(c, '\x09' | '\x0a' | '\x0d' | '\x0c' | '\x20') +fn is_spacy(c: u8) -> bool { + matches!(c, b'\x09' | b'\x0a' | b'\x0d' | b'\x0c' | b'\x20') +} + +#[inline(always)] +fn is_spacy_char(c: char) -> bool { + is_spacy(c as u8) } #[inline(always)] @@ -4932,35 +4940,65 @@ fn is_noncharacter(c: u32) -> bool { } #[inline(always)] -fn is_upper_hex_digit(c: char) -> bool { - matches!(c, '0'..='9' | 'A'..='F') +fn is_upper_hex_digit(c: u8) -> bool { + matches!(c, b'0'..=b'9' | b'A'..=b'F') } #[inline(always)] -fn is_lower_hex_digit(c: char) -> bool { - matches!(c, '0'..='9' | 'a'..='f') +fn is_lower_hex_digit(c: u8) -> bool { + matches!(c, b'0'..=b'9' | b'a'..=b'f') } #[inline(always)] -fn is_ascii_hex_digit(c: char) -> bool { +fn is_ascii_hex_digit(c: u8) -> bool { is_upper_hex_digit(c) || is_lower_hex_digit(c) } #[inline(always)] -fn is_ascii_upper_alpha(c: char) -> bool { +fn is_upper_hex_digit_char(c: char) -> bool { + is_upper_hex_digit(c as u8) +} + +#[inline(always)] +fn is_lower_hex_digit_char(c: char) -> bool { + is_lower_hex_digit(c as u8) +} + +#[inline(always)] +fn is_ascii_hex_digit_char(c: char) -> bool { + is_ascii_hex_digit(c as u8) +} + +#[inline(always)] +fn is_ascii_upper_alpha(c: u8) -> bool { c.is_ascii_uppercase() } #[inline(always)] -fn is_ascii_lower_alpha(c: char) -> bool { +fn is_ascii_lower_alpha(c: u8) -> bool { c.is_ascii_lowercase() } #[inline(always)] -fn is_ascii_alpha(c: char) -> bool { +fn is_ascii_alpha(c: u8) -> bool { is_ascii_upper_alpha(c) || is_ascii_lower_alpha(c) } +#[inline(always)] +fn is_ascii_upper_alpha_char(c: char) -> bool { + c.is_ascii_uppercase() +} + +#[inline(always)] +fn is_ascii_lower_alpha_char(c: char) -> bool { + c.is_ascii_lowercase() +} + +#[inline(always)] +fn is_ascii_alpha_char(c: char) -> bool { + is_ascii_upper_alpha_char(c) || is_ascii_lower_alpha_char(c) +} + #[inline(always)] fn is_allowed_control_character(c: u32) -> bool { c != 0x00 && is_control(c) From 98dc922a8fdb50f3f36cb65a3234032e2d84c9f3 Mon Sep 17 00:00:00 2001 From: DongYun Kang Date: Wed, 19 Nov 2025 17:18:18 -0500 Subject: [PATCH 05/20] fix ci? --- crates/swc_css_parser/src/lexer/mod.rs | 46 ++++++-- crates/swc_html_parser/src/lexer/mod.rs | 147 ++++++++++++++++++++---- 2 files changed, 162 insertions(+), 31 deletions(-) diff --git a/crates/swc_css_parser/src/lexer/mod.rs b/crates/swc_css_parser/src/lexer/mod.rs index 048f13b12706..1c29b02e8b68 100644 --- a/crates/swc_css_parser/src/lexer/mod.rs +++ b/crates/swc_css_parser/src/lexer/mod.rs @@ -963,6 +963,18 @@ where // will return a code point. fn read_escape(&mut self) -> LexResult<(char, String)> { self.with_sub_buf(|l, buf| { + // Get the full character before consuming (for non-ASCII) + let cur_byte = l.input.cur(); + let cur_char = if let Some(b) = cur_byte { + if is_non_ascii(b) { + l.input.cur_as_char() + } else { + Some(b as char) + } + } else { + None + }; + // Consume the next input code point. match l.consume() { // hex digit @@ -1027,9 +1039,10 @@ where // anything else // Return the current input code point. Some(c) => { - buf.push(c as char); + let ch = cur_char.unwrap_or(c as char); + buf.push(ch); - Ok((c as char, (&**buf).into())) + Ok((ch, (&**buf).into())) } } }) @@ -1178,20 +1191,37 @@ where // Repeatedly consume the next input code point from the stream: loop { - match l.consume() { + // For non-ASCII bytes, we need to get the full UTF-8 character before consuming + let cur_byte = l.input.cur(); + let cur_char = if let Some(b) = cur_byte { + if is_non_ascii(b) { + l.input.cur_as_char() + } else { + Some(b as char) + } + } else { + None + }; + + let c = l.consume(); + + match c { // name code point // Append the code point to result. - Some(c) if is_name(c) => { - buf.push(c as char); - raw.push(c as char); + Some(byte) if is_name(byte) => { + // Use the full character we got earlier + if let Some(ch) = cur_char { + buf.push(ch); + raw.push(ch); + } } // the stream starts with a valid escape // Consume an escaped code point. Append the returned code point to result. - Some(c) if l.is_valid_escape(None, None) => { + Some(byte) if l.is_valid_escape(None, None) => { let escaped = l.read_escape()?; buf.push(escaped.0); - raw.push(c as char); + raw.push(byte as char); raw.push_str(&escaped.1); } // anything else diff --git a/crates/swc_html_parser/src/lexer/mod.rs b/crates/swc_html_parser/src/lexer/mod.rs index 8d98f63b9e59..4dc796078e30 100644 --- a/crates/swc_html_parser/src/lexer/mod.rs +++ b/crates/swc_html_parser/src/lexer/mod.rs @@ -119,6 +119,9 @@ where character_reference_code: Option)>>, temporary_buffer: String, is_adjusted_current_node_is_element_in_html_namespace: Option, + /// The full UTF-8 character corresponding to the current byte (for + /// non-ASCII) + current_char: Option, phantom: std::marker::PhantomData<&'a ()>, } @@ -149,6 +152,7 @@ where // Do this without a new allocation. temporary_buffer: String::with_capacity(33), is_adjusted_current_node_is_element_in_html_namespace: None, + current_char: None, phantom: std::marker::PhantomData, }; @@ -276,6 +280,17 @@ where // consumed. let c = self.next(); + // Store the full UTF-8 character before consuming (for helper functions) + if let Some(byte) = c { + if is_non_ascii(byte) { + self.current_char = self.input.cur_as_char(); + } else { + self.current_char = Some(byte as char); + } + } else { + self.current_char = None; + } + self.consume(); c @@ -421,7 +436,13 @@ where sub_buf.push('\n'); } } else { - sub_buf.push(c as char); + let ch = if is_non_ascii(c) { + self.current_char.unwrap_or(c as char) + } else { + c as char + }; + + sub_buf.push(ch); } } @@ -435,15 +456,30 @@ where let mut buf = b.borrow_mut(); if let Some(name) = name { - buf.push(name as char); + let ch = if is_non_ascii(name) { + self.input.cur_as_char().unwrap_or(name as char) + } else { + name as char + }; + buf.push(ch); } if let Some(public_id) = public_id { - buf.push(public_id as char); + let ch = if is_non_ascii(public_id) { + self.input.cur_as_char().unwrap_or(public_id as char) + } else { + public_id as char + }; + buf.push(ch); } if let Some(system_id) = system_id { - buf.push(system_id as char); + let ch = if is_non_ascii(system_id) { + self.input.cur_as_char().unwrap_or(system_id as char) + } else { + system_id as char + }; + buf.push(ch); } } @@ -456,8 +492,14 @@ where let b = self.sub_buf.clone(); let mut sub_buf = b.borrow_mut(); - buf.push((c as char).to_ascii_lowercase()); - sub_buf.push(c as char); + let ch = if is_non_ascii(c) { + self.current_char.unwrap_or(c as char) + } else { + c as char + }; + + buf.push(ch.to_ascii_lowercase()); + sub_buf.push(ch); let value = self.input.uncons_while(f); @@ -489,8 +531,14 @@ where sub_buf.push('\n'); } } else { - buf.push(c as char); - sub_buf.push(c as char); + let ch = if is_non_ascii(c) { + self.current_char.unwrap_or(c as char) + } else { + c as char + }; + + buf.push(ch); + sub_buf.push(ch); } let value = self.input.uncons_while(f); @@ -523,8 +571,14 @@ where sub_buf.push('\n'); } } else { - buf.push(c as char); - sub_buf.push(c as char); + let ch = if is_non_ascii(c) { + self.current_char.unwrap_or(c as char) + } else { + c as char + }; + + buf.push(ch); + sub_buf.push(ch); } let value = self.input.uncons_while(f); @@ -660,8 +714,14 @@ where let b = self.sub_buf.clone(); let mut sub_buf = b.borrow_mut(); - buf.push((c as char).to_ascii_lowercase()); - sub_buf.push(c as char); + let ch = if is_non_ascii(c) { + self.current_char.unwrap_or(c as char) + } else { + c as char + }; + + buf.push(ch.to_ascii_lowercase()); + sub_buf.push(ch); let value = self.input.uncons_while(f); @@ -718,8 +778,20 @@ where let b = self.sub_buf.clone(); let mut sub_buf = b.borrow_mut(); - buf.push(c as char); - sub_buf.push(raw_c as char); + let ch = if is_non_ascii(c) { + self.current_char.unwrap_or(c as char) + } else { + c as char + }; + + let raw_ch = if is_non_ascii(raw_c) { + self.input.cur_as_char().unwrap_or(raw_c as char) + } else { + raw_c as char + }; + + buf.push(ch); + sub_buf.push(raw_ch); } fn consume_and_append_to_attribute_token_name(&mut self, c: u8, f: F) @@ -731,8 +803,14 @@ where let b = self.sub_buf.clone(); let mut sub_buf = b.borrow_mut(); - buf.push((c as char).to_ascii_lowercase()); - sub_buf.push(c as char); + let ch = if is_non_ascii(c) { + self.current_char.unwrap_or(c as char) + } else { + c as char + }; + + buf.push(ch.to_ascii_lowercase()); + sub_buf.push(ch); let value = self.input.uncons_while(f); @@ -749,10 +827,16 @@ where let b = self.sub_buf.clone(); let mut sub_buf = b.borrow_mut(); - buf.push((c as char).to_ascii_lowercase()); - sub_buf.push(c as char); + let ch = if is_non_ascii(c) { + self.current_char.unwrap_or(c as char) + } else { + c as char + }; + + buf.push(ch.to_ascii_lowercase()); + sub_buf.push(ch); - self.temporary_buffer.push(c as char); + self.temporary_buffer.push(ch); let value = self.input.uncons_while(f); @@ -857,8 +941,14 @@ where sub_buf.push('\n'); } } else { - buf.push(c as char); - sub_buf.push(c as char); + let ch = if is_non_ascii(c) { + self.current_char.unwrap_or(c as char) + } else { + c as char + }; + + buf.push(ch); + sub_buf.push(ch); } let value = self.input.uncons_while(f); @@ -988,8 +1078,14 @@ where sub_buf.push('\n'); } } else { - buf.push(c as char); - sub_buf.push(c as char); + let ch = if is_non_ascii(c) { + self.current_char.unwrap_or(c as char) + } else { + c as char + }; + + buf.push(ch); + sub_buf.push(ch); } let value = self.input.uncons_while(f); @@ -5014,3 +5110,8 @@ fn is_allowed_character(c: char) -> bool { return true; } + +#[inline(always)] +fn is_non_ascii(c: u8) -> bool { + c >= 0x80 +} From bb477af05cb2b03471f7c15ce67f03069f9a0539 Mon Sep 17 00:00:00 2001 From: DongYun Kang Date: Thu, 20 Nov 2025 04:37:06 -0500 Subject: [PATCH 06/20] fix(parser): Fix CI errors after byte conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove unnecessary `as u8` cast in ECMAScript lexer (clippy error) - Fix XML parser to use `cur_as_char()` and byte literals for char comparisons - Fix HTML parser current_char usage (already in staged changes) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- crates/swc_ecma_parser/src/lexer/mod.rs | 2 +- crates/swc_html_parser/src/lexer/mod.rs | 54 +++++++++++++++++++------ crates/swc_xml_parser/src/lexer/mod.rs | 26 ++++++------ 3 files changed, 55 insertions(+), 27 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index a802304c98eb..6aff7668c428 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -2198,7 +2198,7 @@ impl<'a> Lexer<'a> { fn read_str_lit(&mut self) -> LexResult { debug_assert!(self.cur() == Some(b'\'') || self.cur() == Some(b'"')); let start = self.cur_pos(); - let quote = self.cur().unwrap() as u8; + let quote = self.cur().unwrap(); self.bump(); // '"' or '\'' diff --git a/crates/swc_html_parser/src/lexer/mod.rs b/crates/swc_html_parser/src/lexer/mod.rs index 4dc796078e30..bc9888eeb177 100644 --- a/crates/swc_html_parser/src/lexer/mod.rs +++ b/crates/swc_html_parser/src/lexer/mod.rs @@ -457,7 +457,7 @@ where if let Some(name) = name { let ch = if is_non_ascii(name) { - self.input.cur_as_char().unwrap_or(name as char) + self.current_char.unwrap_or(name as char) } else { name as char }; @@ -466,7 +466,7 @@ where if let Some(public_id) = public_id { let ch = if is_non_ascii(public_id) { - self.input.cur_as_char().unwrap_or(public_id as char) + self.current_char.unwrap_or(public_id as char) } else { public_id as char }; @@ -475,7 +475,7 @@ where if let Some(system_id) = system_id { let ch = if is_non_ascii(system_id) { - self.input.cur_as_char().unwrap_or(system_id as char) + self.current_char.unwrap_or(system_id as char) } else { system_id as char }; @@ -785,7 +785,7 @@ where }; let raw_ch = if is_non_ascii(raw_c) { - self.input.cur_as_char().unwrap_or(raw_c as char) + self.current_char.unwrap_or(raw_c as char) } else { raw_c as char }; @@ -1115,8 +1115,13 @@ where #[inline(always)] fn emit_character_token(&mut self, value: u8) { + let ch = if is_non_ascii(value) { + self.current_char.unwrap_or(value as char) + } else { + value as char + }; self.emit_token(Token::Character { - value: value as char, + value: ch, raw: Some(Raw::Same), }); } @@ -1126,7 +1131,12 @@ where let b = self.buf.clone(); let mut buf = b.borrow_mut(); - buf.push(raw_c as char); + let raw_ch = if is_non_ascii(raw_c) { + self.current_char.unwrap_or(raw_c as char) + } else { + raw_c as char + }; + buf.push(raw_ch); self.emit_token(Token::Character { value: c, @@ -1160,8 +1170,13 @@ where buf.clear(); } else { + let ch = if is_non_ascii(c) { + self.current_char.unwrap_or(c as char) + } else { + c as char + }; self.emit_token(Token::Character { - value: c as char, + value: ch, raw: Some(Raw::Same), }); } @@ -2539,7 +2554,9 @@ where // REPLACEMENT CHARACTER character to the current attribute's name. Some(c @ b'\x00') => { self.emit_error(ErrorKind::UnexpectedNullCharacter); - self.append_to_attribute_token_name(REPLACEMENT_CHARACTER as u8, c); + let b = self.buf.clone(); + let mut buf = b.borrow_mut(); + buf.push(REPLACEMENT_CHARACTER); } // U+0022 QUOTATION MARK (") // U+0027 APOSTROPHE (') @@ -3552,7 +3569,10 @@ where Some(c @ b'\x00') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::UnexpectedNullCharacter); - self.append_to_doctype_token(Some(REPLACEMENT_CHARACTER as u8), None, None); + + let b = self.buf.clone(); + let mut buf = b.borrow_mut(); + buf.push(REPLACEMENT_CHARACTER); } // EOF // This is an eof-in-doctype parse error. Set the current DOCTYPE token's @@ -3834,7 +3854,9 @@ where Some(c @ b'\x00') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::UnexpectedNullCharacter); - self.append_to_doctype_token(None, Some(REPLACEMENT_CHARACTER as u8), None); + let b = self.buf.clone(); + let mut buf = b.borrow_mut(); + buf.push(REPLACEMENT_CHARACTER); } // U+003E GREATER-THAN SIGN (>) // This is an abrupt-doctype-public-identifier parse error. Set the current @@ -3894,7 +3916,9 @@ where Some(c @ b'\x00') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::UnexpectedNullCharacter); - self.append_to_doctype_token(None, Some(REPLACEMENT_CHARACTER as u8), None); + let b = self.buf.clone(); + let mut buf = b.borrow_mut(); + buf.push(REPLACEMENT_CHARACTER); } // U+003E GREATER-THAN SIGN (>) // This is an abrupt-doctype-public-identifier parse error. Set the current @@ -4216,7 +4240,9 @@ where Some(c @ b'\x00') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::UnexpectedNullCharacter); - self.append_to_doctype_token(None, None, Some(REPLACEMENT_CHARACTER as u8)); + let b = self.buf.clone(); + let mut buf = b.borrow_mut(); + buf.push(REPLACEMENT_CHARACTER); } // U+003E GREATER-THAN SIGN (>) // This is an abrupt-doctype-system-identifier parse error. Set the current @@ -4276,7 +4302,9 @@ where Some(c @ b'\x00') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::UnexpectedNullCharacter); - self.append_to_doctype_token(None, None, Some(REPLACEMENT_CHARACTER as u8)); + let b = self.buf.clone(); + let mut buf = b.borrow_mut(); + buf.push(REPLACEMENT_CHARACTER); } // U+003E GREATER-THAN SIGN (>) // This is an abrupt-doctype-system-identifier parse error. Set the current diff --git a/crates/swc_xml_parser/src/lexer/mod.rs b/crates/swc_xml_parser/src/lexer/mod.rs index 95b4c5057614..ebac6ad14726 100644 --- a/crates/swc_xml_parser/src/lexer/mod.rs +++ b/crates/swc_xml_parser/src/lexer/mod.rs @@ -173,9 +173,9 @@ where // A leading Byte Order Mark (BOM) causes the character encoding argument to be // ignored and will itself be skipped. - if lexer.input.is_at_start() && lexer.input.cur() == Some('\u{feff}') { + if lexer.input.is_at_start() && lexer.input.cur_as_char() == Some('\u{feff}') { unsafe { - // Safety: cur() is Some('\u{feff}') + // Safety: cur_as_char() is Some('\u{feff}') lexer.input.bump(); } } @@ -224,7 +224,7 @@ where { #[inline(always)] fn next(&mut self) -> Option { - self.input.cur() + self.input.cur_as_char() } // Any occurrences of surrogates are surrogate-in-input-stream parse errors. Any @@ -249,12 +249,12 @@ where #[inline(always)] fn consume(&mut self) { - self.cur = self.input.cur(); + self.cur = self.input.cur_as_char(); self.cur_pos = self.input.cur_pos(); if self.cur.is_some() { unsafe { - // Safety: cur() is Some(c) + // Safety: cur_as_char() is Some(c) self.input.bump(); } } @@ -573,9 +573,9 @@ where raw.push(c); - if self.input.cur() == Some('\n') { + if self.input.cur() == Some(b'\n') { unsafe { - // Safety: cur() is Some('\n') + // Safety: cur() is Some(b'\n') self.input.bump(); } @@ -895,9 +895,9 @@ where raw_c.push(c); - if self.input.cur() == Some('\n') { + if self.input.cur() == Some(b'\n') { unsafe { - // Safety: cur() is Some('\n') + // Safety: cur() is Some(b'\n') self.input.bump(); } @@ -962,9 +962,9 @@ where raw.push(c); - if self.input.cur() == Some('\n') { + if self.input.cur() == Some(b'\n') { unsafe { - // Safety: cur() is Some('\n') + // Safety: cur() is Some(b'\n') self.input.bump(); } @@ -3104,9 +3104,9 @@ where #[inline(always)] fn skip_next_lf(&mut self, c: char) { - if c == '\r' && self.input.cur() == Some('\n') { + if c == '\r' && self.input.cur() == Some(b'\n') { unsafe { - // Safety: cur() is Some('\n') + // Safety: cur() is Some(b'\n') self.input.bump(); } } From 6331ce0e7b538343ac065a7e0c022d9443a7e04b Mon Sep 17 00:00:00 2001 From: DongYun Kang Date: Thu, 20 Nov 2025 06:37:21 -0500 Subject: [PATCH 07/20] fix(parser): Fix UTF-8 multibyte character handling after byte conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes issues where multibyte UTF-8 characters (like '⬇' and '😀') were being corrupted during parsing after the char-to-byte conversion. The problem was that individual UTF-8 bytes were being cast directly to char using `as char`, which treats each byte as a separate Unicode code point. **Changes made:** 1. **CSS Parser** (`swc_css_parser`): - Updated string tokenization to use `Input::cur_as_char()` for non-ASCII bytes - Fixed URL tokenization to properly decode multibyte UTF-8 sequences - Fixed bad URL remnant parsing 2. **HTML Parser** (`swc_html_parser`): - Removed truly unused functions (`is_upper_hex_digit_char`, `is_lower_hex_digit_char`, `is_ascii_hex_digit_char`, `is_ascii_lower_alpha_char`, `is_ascii_alpha_char`, `append_to_doctype_token`) - Kept `is_ascii_upper_alpha_char` which is still in use - HTML parser already had correct UTF-8 handling via `current_char` field **Implementation:** Before consuming bytes, we now check if it's non-ASCII (`>= 0x80`) and call `cur_as_char()` to get the full UTF-8 character before advancing the input stream. For ASCII bytes, we continue using the fast path of `as char`. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- crates/swc_css_parser/src/lexer/mod.rs | 78 +++++++++++++++++++++---- crates/swc_html_parser/src/lexer/mod.rs | 62 -------------------- 2 files changed, 67 insertions(+), 73 deletions(-) diff --git a/crates/swc_css_parser/src/lexer/mod.rs b/crates/swc_css_parser/src/lexer/mod.rs index 1c29b02e8b68..4820b9670d81 100644 --- a/crates/swc_css_parser/src/lexer/mod.rs +++ b/crates/swc_css_parser/src/lexer/mod.rs @@ -723,6 +723,18 @@ where // Repeatedly consume the next input code point from the stream: loop { + // Get the full character before consuming (for non-ASCII) + let cur_byte = l.input.cur(); + let cur_char = if let Some(b) = cur_byte { + if is_non_ascii(b) { + l.input.cur_as_char() + } else { + Some(b as char) + } + } else { + None + }; + match l.consume() { // ending code point // Return the . @@ -784,9 +796,11 @@ where // Anything else // Append the current input code point to the 's value. - Some(c) => { - buf.push(c as char); - raw.push(c as char); + Some(_) => { + if let Some(ch) = cur_char { + buf.push(ch); + raw.push(ch); + } } } } @@ -808,9 +822,15 @@ where // Consume as much whitespace as possible. while let Some(c) = l.next() { if is_whitespace(c) { + // Get char before consuming + let ch = if is_non_ascii(c) { + l.input.cur_as_char().unwrap_or(c as char) + } else { + c as char + }; l.consume(); - raw.push(c as char); + raw.push(ch); } else { break; } @@ -818,6 +838,18 @@ where // Repeatedly consume the next input code point from the stream: loop { + // Get the full character before consuming (for non-ASCII) + let cur_byte = l.input.cur(); + let cur_char = if let Some(b) = cur_byte { + if is_non_ascii(b) { + l.input.cur_as_char() + } else { + Some(b as char) + } + } else { + None + }; + match l.consume() { // U+0029 RIGHT PARENTHESIS ()) // Return the . @@ -843,13 +875,21 @@ where Some(c) if is_whitespace(c) => { // Consume as much whitespace as possible. let whitespaces: String = l.with_sub_buf(|l, buf| { - buf.push(c as char); + if let Some(ch) = cur_char { + buf.push(ch); + } while let Some(c) = l.next() { if is_whitespace(c) { + // Get char before consuming + let ch = if is_non_ascii(c) { + l.input.cur_as_char().unwrap_or(c as char) + } else { + c as char + }; l.consume(); - buf.push(c as char); + buf.push(ch); } else { break; } @@ -947,9 +987,11 @@ where // anything else // Append the current input code point to the 's value. - Some(c) => { - out.push(c as char); - raw.push(c as char); + Some(_) => { + if let Some(ch) = cur_char { + out.push(ch); + raw.push(ch); + } } } } @@ -1362,6 +1404,18 @@ where self.with_sub_buf(|l, raw| { // Repeatedly consume the next input code point from the stream: loop { + // Get the full character before consuming (for non-ASCII) + let cur_byte = l.input.cur(); + let cur_char = if let Some(b) = cur_byte { + if is_non_ascii(b) { + l.input.cur_as_char() + } else { + Some(b as char) + } + } else { + None + }; + match l.consume() { // U+0029 RIGHT PARENTHESIS ()) // EOF @@ -1385,8 +1439,10 @@ where } // anything else // Do nothing. - Some(c) => { - raw.push(c as char); + Some(_) => { + if let Some(ch) = cur_char { + raw.push(ch); + } } } } diff --git a/crates/swc_html_parser/src/lexer/mod.rs b/crates/swc_html_parser/src/lexer/mod.rs index bc9888eeb177..d7e30e0dc4cc 100644 --- a/crates/swc_html_parser/src/lexer/mod.rs +++ b/crates/swc_html_parser/src/lexer/mod.rs @@ -446,43 +446,6 @@ where } } - fn append_to_doctype_token( - &mut self, - name: Option, - public_id: Option, - system_id: Option, - ) { - let b = self.buf.clone(); - let mut buf = b.borrow_mut(); - - if let Some(name) = name { - let ch = if is_non_ascii(name) { - self.current_char.unwrap_or(name as char) - } else { - name as char - }; - buf.push(ch); - } - - if let Some(public_id) = public_id { - let ch = if is_non_ascii(public_id) { - self.current_char.unwrap_or(public_id as char) - } else { - public_id as char - }; - buf.push(ch); - } - - if let Some(system_id) = system_id { - let ch = if is_non_ascii(system_id) { - self.current_char.unwrap_or(system_id as char) - } else { - system_id as char - }; - buf.push(ch); - } - } - fn consume_and_append_to_doctype_token_name(&mut self, c: u8, f: F) where F: Fn(char) -> bool, @@ -5078,21 +5041,6 @@ fn is_ascii_hex_digit(c: u8) -> bool { is_upper_hex_digit(c) || is_lower_hex_digit(c) } -#[inline(always)] -fn is_upper_hex_digit_char(c: char) -> bool { - is_upper_hex_digit(c as u8) -} - -#[inline(always)] -fn is_lower_hex_digit_char(c: char) -> bool { - is_lower_hex_digit(c as u8) -} - -#[inline(always)] -fn is_ascii_hex_digit_char(c: char) -> bool { - is_ascii_hex_digit(c as u8) -} - #[inline(always)] fn is_ascii_upper_alpha(c: u8) -> bool { c.is_ascii_uppercase() @@ -5113,16 +5061,6 @@ fn is_ascii_upper_alpha_char(c: char) -> bool { c.is_ascii_uppercase() } -#[inline(always)] -fn is_ascii_lower_alpha_char(c: char) -> bool { - c.is_ascii_lowercase() -} - -#[inline(always)] -fn is_ascii_alpha_char(c: char) -> bool { - is_ascii_upper_alpha_char(c) || is_ascii_lower_alpha_char(c) -} - #[inline(always)] fn is_allowed_control_character(c: u32) -> bool { c != 0x00 && is_control(c) From 803ec47debf6f54eb1a5bce36bf7920c395dd3d6 Mon Sep 17 00:00:00 2001 From: DongYun Kang Date: Thu, 20 Nov 2025 07:08:58 -0500 Subject: [PATCH 08/20] fi xci --- crates/swc_ecma_lexer/src/common/lexer/mod.rs | 2 +- crates/swc_html_parser/src/lexer/mod.rs | 16 +++++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/crates/swc_ecma_lexer/src/common/lexer/mod.rs b/crates/swc_ecma_lexer/src/common/lexer/mod.rs index 0c4da3a6b6c7..3bc438454779 100644 --- a/crates/swc_ecma_lexer/src/common/lexer/mod.rs +++ b/crates/swc_ecma_lexer/src/common/lexer/mod.rs @@ -2055,7 +2055,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { fn read_str_lit(&mut self) -> LexResult { debug_assert!(self.cur() == Some(b'\'') || self.cur() == Some(b'"')); let start = self.cur_pos(); - let quote = self.cur().unwrap() as u8; + let quote = self.cur().unwrap(); self.bump(); // '"' or '\'' diff --git a/crates/swc_html_parser/src/lexer/mod.rs b/crates/swc_html_parser/src/lexer/mod.rs index d7e30e0dc4cc..9b2812edd93d 100644 --- a/crates/swc_html_parser/src/lexer/mod.rs +++ b/crates/swc_html_parser/src/lexer/mod.rs @@ -233,7 +233,12 @@ where // `anything else` #[inline(always)] fn validate_input_stream_character(&mut self, c: u8) { - let code = (c as char) as u32; + let ch = if is_non_ascii(c) { + self.current_char.unwrap_or(c as char) + } else { + c as char + }; + let code = ch as u32; if is_surrogate(code) { self.emit_error(ErrorKind::SurrogateInInputStream); @@ -2515,7 +2520,7 @@ where // U+0000 NULL // This is an unexpected-null-character parse error. Append a U+FFFD // REPLACEMENT CHARACTER character to the current attribute's name. - Some(c @ b'\x00') => { + Some(_c @ b'\x00') => { self.emit_error(ErrorKind::UnexpectedNullCharacter); let b = self.buf.clone(); let mut buf = b.borrow_mut(); @@ -3493,7 +3498,12 @@ where self.validate_input_stream_character(c); self.append_raw_to_doctype_token(c); self.create_doctype_token(); - self.set_doctype_token_name(c as char); + let ch = if is_non_ascii(c) { + self.current_char.unwrap_or(c as char) + } else { + c as char + }; + self.set_doctype_token_name(ch); self.state = State::DoctypeName; } } From 09283cefd1e3c0c6c7887b05564cf50efeab8ce1 Mon Sep 17 00:00:00 2001 From: DongYun Kang Date: Tue, 25 Nov 2025 21:16:44 +0900 Subject: [PATCH 09/20] bump_bytes --- crates/swc_common/src/input.rs | 11 ++++ crates/swc_css_parser/src/lexer/mod.rs | 17 ++++-- crates/swc_ecma_parser/src/lexer/mod.rs | 70 ++++++++-------------- crates/swc_ecma_parser/src/lexer/table.rs | 10 +--- crates/swc_html_parser/src/lexer/mod.rs | 71 ++++++++++------------- 5 files changed, 78 insertions(+), 101 deletions(-) diff --git a/crates/swc_common/src/input.rs b/crates/swc_common/src/input.rs index 24457fa94fba..65a8986a5112 100644 --- a/crates/swc_common/src/input.rs +++ b/crates/swc_common/src/input.rs @@ -155,6 +155,13 @@ impl<'a> Input<'a> for StringInput<'a> { self.last_pos = self.last_pos + BytePos(len as u32); } + #[inline] + fn bump_bytes(&mut self, n: usize) { + debug_assert!(n <= self.remaining.len()); + self.remaining = unsafe { self.remaining.get_unchecked(n..) }; + self.last_pos.0 += n as u32; + } + #[inline] fn cur_as_ascii(&self) -> Option { let first_byte = *self.remaining.as_bytes().first()?; @@ -287,6 +294,10 @@ pub trait Input<'a>: Clone { /// when the Input is not empty. unsafe fn bump(&mut self); + /// Advances the input by exactly `n` bytes. + /// Unlike `bump()`, this does not calculate UTF-8 character boundaries. + fn bump_bytes(&mut self, n: usize); + /// Returns the current byte as ASCII if it's valid ASCII (0x00-0x7F). /// Returns [None] if it's end of input or if the byte is not ASCII. #[inline] diff --git a/crates/swc_css_parser/src/lexer/mod.rs b/crates/swc_css_parser/src/lexer/mod.rs index 4820b9670d81..53376b2590f2 100644 --- a/crates/swc_css_parser/src/lexer/mod.rs +++ b/crates/swc_css_parser/src/lexer/mod.rs @@ -220,11 +220,18 @@ where self.cur = cur; self.cur_pos = self.input.last_pos(); - if cur.is_some() { - unsafe { - // Safety: cur is Some - self.input.bump(); - } + if let Some(byte) = cur { + // Calculate the number of bytes in this UTF-8 character + let len = if byte < 0x80 { + 1 // ASCII + } else if byte < 0xe0 { + 2 // 2-byte UTF-8 + } else if byte < 0xf0 { + 3 // 3-byte UTF-8 + } else { + 4 // 4-byte UTF-8 + }; + self.input.bump_bytes(len); } cur diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 6aff7668c428..484ac21d321b 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -244,17 +244,11 @@ impl<'a> Lexer<'a> { fn read_token_plus_minus(&mut self) -> LexResult { let start = self.cur_pos(); - unsafe { - // Safety: cur() is Some(c), if this method is called. - self.input.bump(); - } + self.bump(); // '++', '--' Ok(if self.input.cur() == Some(C) { - unsafe { - // Safety: cur() is Some(c) - self.input.bump(); - } + self.bump(); // Handle --> if self.state.had_line_break && C == b'-' && self.eat(b'>') { @@ -286,10 +280,7 @@ impl<'a> Lexer<'a> { let start = self.cur_pos(); let had_line_break_before_last = self.had_line_break_before_last(); - unsafe { - // Safety: cur() is Some(c) if this method is called. - self.input.bump(); - } + self.bump(); Ok(if self.input.eat_byte(b'=') { // "==" @@ -543,9 +534,18 @@ impl<'a> Lexer<'a> { #[inline(always)] fn bump(&mut self) { - unsafe { - // Safety: Actually this is not safe but this is an internal method. - self.input_mut().bump() + if let Some(byte) = self.input().cur() { + // Calculate the number of bytes in this UTF-8 character + let len = if byte < 0x80 { + 1 // ASCII + } else if byte < 0xe0 { + 2 // 2-byte UTF-8 + } else if byte < 0xf0 { + 3 // 3-byte UTF-8 + } else { + 4 // 4-byte UTF-8 + }; + self.input_mut().bump_bytes(len); } } @@ -929,10 +929,7 @@ impl<'a> Lexer<'a> { } // Ignore this _ character - unsafe { - // Safety: cur() returns Some(c) where c is a valid char - self.input_mut().bump(); - } + self.bump(); continue; } @@ -1349,10 +1346,7 @@ impl<'a> Lexer<'a> { fn read_jsx_str(&mut self, quote: char) -> LexResult { debug_assert!(self.syntax().jsx()); let start = self.input().cur_pos(); - unsafe { - // Safety: cur() was Some(quote) - self.input_mut().bump(); // `quote` - } + self.bump(); // `quote` let mut out = String::new(); let mut chunk_start = self.input().cur_pos(); loop { @@ -1416,10 +1410,7 @@ impl<'a> Lexer<'a> { chunk_start = cur_pos + BytePos(ch.len_utf8() as _); } else { - unsafe { - // Safety: cur() was Some(ch) - self.input_mut().bump(); - } + self.bump(); } } let s = unsafe { @@ -1719,10 +1710,7 @@ impl<'a> Lexer<'a> { _ => c, }; - unsafe { - // Safety: cur() is Some(c) if this method is called. - self.input_mut().bump(); - } + self.bump(); Ok(CodePoint::from_u32(c as u32)) } @@ -2074,10 +2062,7 @@ impl<'a> Lexer<'a> { let had_line_break_before_last = self.had_line_break_before_last(); let start = self.cur_pos(); - unsafe { - // Safety: cur() is Some(c as char) - self.input_mut().bump(); - } + self.bump(); let token = if is_bit_and { Token::Ampersand } else { @@ -2096,16 +2081,10 @@ impl<'a> Lexer<'a> { // '||', '&&' if self.input().cur() == Some(C) { - unsafe { - // Safety: cur() is Some(c) - self.input_mut().bump(); - } + self.bump(); if self.input().cur() == Some(b'=') { - unsafe { - // Safety: cur() is Some('=') - self.input_mut().bump(); - } + self.bump(); return Ok(if is_bit_and { Token::LogicalAndEq @@ -2252,10 +2231,7 @@ impl<'a> Lexer<'a> { self.wtf8_atom(Wtf8::from_str(s)) }; - unsafe { - // Safety: cur is quote - self.input_mut().bump(); - } + self.bump(); // cur is quote let end = self.cur_pos(); let raw = unsafe { diff --git a/crates/swc_ecma_parser/src/lexer/table.rs b/crates/swc_ecma_parser/src/lexer/table.rs index 37e33537afbb..066d580fbe83 100644 --- a/crates/swc_ecma_parser/src/lexer/table.rs +++ b/crates/swc_ecma_parser/src/lexer/table.rs @@ -48,10 +48,7 @@ const ERR: ByteHandler = |lexer| { }; let start = lexer.cur_pos(); - unsafe { - // Safety: Byte handler is only called for non-last characters - lexer.input.bump(); - } + lexer.bump(); lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })? }; @@ -315,10 +312,7 @@ const UNI: ByteHandler = |lexer| { } let start = lexer.cur_pos(); - unsafe { - // Safety: Byte handler is only called for non-last characters - lexer.input.bump(); - } + lexer.bump(); lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })? }; diff --git a/crates/swc_html_parser/src/lexer/mod.rs b/crates/swc_html_parser/src/lexer/mod.rs index 9b2812edd93d..552d2c6934ab 100644 --- a/crates/swc_html_parser/src/lexer/mod.rs +++ b/crates/swc_html_parser/src/lexer/mod.rs @@ -159,10 +159,8 @@ where // A leading Byte Order Mark (BOM) causes the character encoding argument to be // ignored and will itself be skipped. if lexer.input.is_at_start() && lexer.input.cur_as_char() == Some('\u{feff}') { - unsafe { - // Safety: We know that the current character is '\u{feff}'. - lexer.input.bump(); - } + // Safety: We know that the current character is '\u{feff}' (3 bytes: EF BB BF). + lexer.input.bump_bytes(3); } lexer @@ -254,11 +252,18 @@ where self.cur = self.input.cur(); self.cur_pos = self.input.cur_pos(); - if self.cur.is_some() { - unsafe { - // Safety: self.cur is Some() - self.input.bump(); - } + if let Some(byte) = self.cur { + // Calculate the number of bytes in this UTF-8 character + let len = if byte < 0x80 { + 1 // ASCII + } else if byte < 0xe0 { + 2 // 2-byte UTF-8 + } else if byte < 0xf0 { + 3 // 3-byte UTF-8 + } else { + 4 // 4-byte UTF-8 + }; + self.input.bump_bytes(len); } } @@ -433,10 +438,8 @@ where sub_buf.push(c as char); if self.input.cur() == Some(b'\n') { - unsafe { - // Safety: cur() is Some(b'\n') - self.input.bump(); - } + // Safety: cur() is Some(b'\n'), which is 1 byte + self.input.bump_bytes(1); sub_buf.push('\n'); } @@ -491,10 +494,8 @@ where sub_buf.push(c as char); if self.input.cur() == Some(b'\n') { - unsafe { - // Safety: cur() is Some(b'\n') - self.input.bump(); - } + // Safety: cur() is Some(b'\n'), which is 1 byte + self.input.bump_bytes(1); sub_buf.push('\n'); } @@ -531,10 +532,8 @@ where sub_buf.push(c as char); if self.input.cur() == Some(b'\n') { - unsafe { - // Safety: cur() is Some(b'\n') - self.input.bump(); - } + // Safety: cur() is Some(b'\n'), which is 1 byte + self.input.bump_bytes(1); sub_buf.push('\n'); } @@ -867,10 +866,8 @@ where sub_buf.push('\r'); if self.input.cur() == Some(b'\n') { - unsafe { - // Safety: cur() is Some(b'\n') - self.input.bump(); - } + // Safety: cur() is Some(b'\n'), which is 1 byte + self.input.bump_bytes(1); sub_buf.push('\n'); } @@ -901,10 +898,8 @@ where sub_buf.push(c as char); if self.input.cur() == Some(b'\n') { - unsafe { - // Safety: cur() is Some(b'\n') - self.input.bump(); - } + // Safety: cur() is Some(b'\n'), which is 1 byte + self.input.bump_bytes(1); sub_buf.push('\n'); } @@ -1038,10 +1033,8 @@ where sub_buf.push(c as char); if self.input.cur() == Some(b'\n') { - unsafe { - // Safety: cur() is Some(b'\n') - self.input.bump(); - } + // Safety: cur() is Some(b'\n'), which is 1 byte + self.input.bump_bytes(1); sub_buf.push('\n'); } @@ -1124,10 +1117,8 @@ where buf.push(c as char); if self.input.cur() == Some(b'\n') { - unsafe { - // Safety: cur() is Some(b'\n') - self.input.bump(); - } + // Safety: cur() is Some(b'\n'), which is 1 byte + self.input.bump_bytes(1); buf.push('\n'); } @@ -4957,10 +4948,8 @@ where #[inline(always)] fn skip_whitespaces(&mut self, c: u8) { if c == b'\r' && self.input.cur() == Some(b'\n') { - unsafe { - // Safety: cur() is Some - self.input.bump(); - } + // Safety: cur() is Some(b'\n'), which is 1 byte + self.input.bump_bytes(1); } } } From 51a14859e119d4da5e4d619d9e1f1414378b813a Mon Sep 17 00:00:00 2001 From: DongYun Kang Date: Tue, 25 Nov 2025 21:38:43 +0900 Subject: [PATCH 10/20] bump_bytes with proper len --- crates/swc_ecma_parser/src/lexer/mod.rs | 142 +++++++++--------- crates/swc_ecma_parser/src/lexer/state.rs | 16 +- crates/swc_ecma_parser/src/lexer/table.rs | 4 +- .../swc_ecma_parser/src/lexer/whitespace.rs | 4 +- 4 files changed, 82 insertions(+), 84 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 484ac21d321b..079ed06814f5 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -244,11 +244,11 @@ impl<'a> Lexer<'a> { fn read_token_plus_minus(&mut self) -> LexResult { let start = self.cur_pos(); - self.bump(); + self.bump(1); // '++', '--' Ok(if self.input.cur() == Some(C) { - self.bump(); + self.bump(1); // Handle --> if self.state.had_line_break && C == b'-' && self.eat(b'>') { @@ -280,7 +280,7 @@ impl<'a> Lexer<'a> { let start = self.cur_pos(); let had_line_break_before_last = self.had_line_break_before_last(); - self.bump(); + self.bump(1); Ok(if self.input.eat_byte(b'=') { // "==" @@ -321,7 +321,7 @@ impl Lexer<'_> { fn read_token_lt_gt(&mut self) -> LexResult { let had_line_break_before_last = self.had_line_break_before_last(); let start = self.cur_pos(); - self.bump(); + self.bump(1); if self.syntax.typescript() && self.ctx.contains(Context::InType) @@ -351,7 +351,7 @@ impl Lexer<'_> { // '<<', '>>' if self.cur() == Some(C) { - self.bump(); + self.bump(1); op = if C == b'<' { Token::LShift } else { @@ -360,7 +360,7 @@ impl Lexer<'_> { //'>>>' if C == b'>' && self.cur() == Some(C) { - self.bump(); + self.bump(1); op = Token::ZeroFillRShift; } } @@ -412,7 +412,7 @@ impl Lexer<'_> { ) -> LexResult { debug_assert!(self.cur() == Some(if started_with_backtick { b'`' } else { b'}' })); let mut cooked = Ok(Wtf8Buf::with_capacity(8)); - self.bump(); // `}` or `\`` + self.bump(1); // `}` or `\`` let mut cooked_slice_start = self.cur_pos(); let raw_slice_start = cooked_slice_start; let raw_atom = |this: &mut Self| { @@ -438,7 +438,7 @@ impl Lexer<'_> { consume_cooked!(); let cooked = cooked.map(|cooked| self.atoms.wtf8_atom(&*cooked)); let raw = raw_atom(self); - self.bump(); + self.bump(1); return Ok(if started_with_backtick { self.set_token_value(Some(TokenValue::Template { raw, cooked })); Token::NoSubstitutionTemplateLiteral @@ -485,7 +485,7 @@ impl Lexer<'_> { }; let c = if c == b'\r' && self.peek() == Some(b'\n') { - self.bump(); // '\r' + self.bump(1); // '\r' '\n' } else { match c_char { @@ -497,14 +497,14 @@ impl Lexer<'_> { } }; - self.bump(); + self.bump(c_char.len_utf8()); if let Ok(ref mut cooked) = cooked { cooked.push_char(c); } cooked_slice_start = self.cur_pos(); } else { - self.bump(); + self.bump(1); } } @@ -532,21 +532,13 @@ impl<'a> Lexer<'a> { Span { lo: start, hi: end } } + /// Advances the input by `len` bytes. + /// + /// For ASCII characters, use `bump(1)`. + /// For unknown character length, use `c.len_utf8()` where c is a char. #[inline(always)] - fn bump(&mut self) { - if let Some(byte) = self.input().cur() { - // Calculate the number of bytes in this UTF-8 character - let len = if byte < 0x80 { - 1 // ASCII - } else if byte < 0xe0 { - 2 // 2-byte UTF-8 - } else if byte < 0xf0 { - 3 // 3-byte UTF-8 - } else { - 4 // 4-byte UTF-8 - }; - self.input_mut().bump_bytes(len); - } + fn bump(&mut self, len: usize) { + self.input_mut().bump_bytes(len); } #[inline(always)] @@ -929,7 +921,7 @@ impl<'a> Lexer<'a> { } // Ignore this _ character - self.bump(); + self.bump(1); continue; } @@ -942,7 +934,7 @@ impl<'a> Lexer<'a> { return Ok(total); }; - self.bump(); + self.bump(1); let (t, cont) = op(total, RADIX, val)?; @@ -1093,7 +1085,7 @@ impl<'a> Lexer<'a> { // // `.1.a`, `.1e-4.a` are valid, if has_dot { - self.bump(); + self.bump(1); // equal: if START_WITH_DOT { debug_assert!(xxxx) } debug_assert!(!START_WITH_DOT || self.cur().is_some_and(|cur| cur.is_ascii_digit())); @@ -1110,7 +1102,7 @@ impl<'a> Lexer<'a> { // 1e+2 = 100 // 1e-2 = 0.01 if has_e { - self.bump(); // `e`/`E` + self.bump(1); // `e`/`E` let next = match self.cur() { Some(next) => next, @@ -1121,7 +1113,7 @@ impl<'a> Lexer<'a> { }; if next == b'+' || next == b'-' { - self.bump(); // remove '+', '-' + self.bump(1); // remove '+', '-' } let lazy_integer = self.read_number_no_dot_as_str::<10>()?; @@ -1191,12 +1183,12 @@ impl<'a> Lexer<'a> { let start = self.cur_pos(); debug_assert_eq!(self.cur(), Some(b'0')); - self.bump(); + self.bump(1); debug_assert!(self .cur() .is_some_and(|c| matches!(c, b'b' | b'B' | b'o' | b'O' | b'x' | b'X'))); - self.bump(); + self.bump(1); let lazy_integer = self.read_number_no_dot_as_str::()?; let has_underscore = lazy_integer.has_underscore; @@ -1287,7 +1279,7 @@ impl<'a> Lexer<'a> { let mut s = SmartString::::default(); debug_assert!(self.input().cur().is_some_and(|c| c == b'&')); - self.bump(); + self.bump(1); let start_pos = self.input().cur_pos(); @@ -1296,7 +1288,7 @@ impl<'a> Lexer<'a> { Some(c) => c, None => break, }; - self.bump(); + self.bump(1); if c == b';' { if let Some(stripped) = s.strip_prefix('#') { @@ -1332,10 +1324,10 @@ impl<'a> Lexer<'a> { fn read_jsx_new_line(&mut self, normalize_crlf: bool) -> LexResult> { debug_assert!(self.syntax().jsx()); let ch = self.input().cur_as_char().unwrap(); - self.bump(); + self.bump(ch.len_utf8()); let out = if ch == '\r' && self.input().cur() == Some(b'\n') { - self.bump(); // `\n` + self.bump(1); // `\n` Either::Left(if normalize_crlf { "\n" } else { "\r\n" }) } else { Either::Right(ch) @@ -1346,7 +1338,7 @@ impl<'a> Lexer<'a> { fn read_jsx_str(&mut self, quote: char) -> LexResult { debug_assert!(self.syntax().jsx()); let start = self.input().cur_pos(); - self.bump(); // `quote` + self.bump(1); // `quote` let mut out = String::new(); let mut chunk_start = self.input().cur_pos(); loop { @@ -1367,7 +1359,7 @@ impl<'a> Lexer<'a> { out.push_str(value); out.push('\\'); - self.bump(); + self.bump(1); chunk_start = self.input().cur_pos(); @@ -1410,7 +1402,7 @@ impl<'a> Lexer<'a> { chunk_start = cur_pos + BytePos(ch.len_utf8() as _); } else { - self.bump(); + self.bump(ch.len_utf8()); } } let s = unsafe { @@ -1428,7 +1420,7 @@ impl<'a> Lexer<'a> { // it might be at the end of the file when // the string literal is unterminated if self.input().peek_ahead().is_some() { - self.bump(); + self.bump(1); } let raw = unsafe { @@ -1506,7 +1498,7 @@ impl<'a> Lexer<'a> { let mut is_curly = false; - self.bump(); // 'u' + self.bump(1); // 'u' if self.eat(b'{') { is_curly = true; @@ -1589,8 +1581,8 @@ impl<'a> Lexer<'a> { if self.input().cur() != Some(b'#') || self.input().peek() != Some(b'!') { return Ok(None); } - self.bump(); // `#` - self.bump(); // `!` + self.bump(1); // `#` + self.bump(1); // `!` let s = self.input_uncons_while(|c| !c.is_line_terminator()); Ok(Some(self.atom(s))) } @@ -1603,7 +1595,7 @@ impl<'a> Lexer<'a> { let start = self.cur_pos(); - self.bump(); // '\' + self.bump(1); // '\' let c = match self.cur_as_char() { Some(c) => c, @@ -1619,21 +1611,21 @@ impl<'a> Lexer<'a> { 'v' => '\u{000b}', 'f' => '\u{000c}', '\r' => { - self.bump(); // remove '\r' + self.bump(1); // remove '\r' self.eat(b'\n'); return Ok(None); } '\n' | '\u{2028}' | '\u{2029}' => { - self.bump(); + self.bump(c.len_utf8()); return Ok(None); } // read hexadecimal escape sequences 'x' => { - self.bump(); // 'x' + self.bump(1); // 'x' match self.read_int_u32::<16>(2)? { Some(val) => return Ok(CodePoint::from_u32(val)), @@ -1656,7 +1648,7 @@ impl<'a> Lexer<'a> { // octal escape sequences '0'..='7' => { - self.bump(); + self.bump(1); let first_c = if c == '0' { match self.cur() { @@ -1695,7 +1687,7 @@ impl<'a> Lexer<'a> { value * 8 + v as u8 }; - self.bump(); + self.bump(1); } _ => return Ok(CodePoint::from_u32(value as u32)), } @@ -1710,7 +1702,7 @@ impl<'a> Lexer<'a> { _ => c, }; - self.bump(); + self.bump(1); Ok(CodePoint::from_u32(c as u32)) } @@ -1726,7 +1718,7 @@ impl<'a> Lexer<'a> { let start = self.cur_pos(); - self.bump(); // bump '/' + self.bump(1); // bump '/' let slice_start = self.cur_pos(); @@ -1758,7 +1750,7 @@ impl<'a> Lexer<'a> { escaped = c == b'\\'; } - self.bump(); + self.bump(1); } let content = { @@ -1776,7 +1768,7 @@ impl<'a> Lexer<'a> { )); } - self.bump(); // '/' + self.bump(1); // '/' // Spec says "It is a Syntax Error if IdentifierPart contains a Unicode escape // sequence." TODO: check for escape @@ -1806,7 +1798,7 @@ impl<'a> Lexer<'a> { if let Some(c) = self.input().cur_as_ascii() { if Ident::is_valid_ascii_start(c) { // Advance past first byte - self.bump(); + self.bump(1); // Use byte_search to quickly scan to end of ASCII identifier let next_byte = byte_search! { @@ -1861,10 +1853,10 @@ impl<'a> Lexer<'a> { loop { if let Some(c) = self.input().cur_as_ascii() { if Ident::is_valid_ascii_continue(c) { - self.bump(); + self.bump(1); continue; } else if first && Ident::is_valid_ascii_start(c) { - self.bump(); + self.bump(1); first = false; continue; } @@ -1874,7 +1866,7 @@ impl<'a> Lexer<'a> { first = false; has_escape = true; let start = self.cur_pos(); - self.bump(); + self.bump(1); if !self.is(b'u') { self.error_span(pos_span(start), SyntaxError::ExpectedUnicodeEscape)? @@ -1926,10 +1918,10 @@ impl<'a> Lexer<'a> { break; } else if let Some(c) = self.input().cur_as_char() { if Ident::is_valid_non_ascii_continue(c) { - self.bump(); + self.bump(c.len_utf8()); continue; } else if first && Ident::is_valid_non_ascii_start(c) { - self.bump(); + self.bump(c.len_utf8()); first = false; continue; } @@ -1959,7 +1951,7 @@ impl<'a> Lexer<'a> { fn read_token_number_sign(&mut self) -> LexResult { debug_assert!(self.cur().is_some_and(|c| c == b'#')); - self.bump(); // '#' + self.bump(1); // '#' // `#` can also be a part of shebangs, however they should have been // handled by `read_shebang()` @@ -1979,7 +1971,7 @@ impl<'a> Lexer<'a> { let next = match self.input().peek() { Some(next) => next, None => { - self.bump(); // '.' + self.bump(1); // '.' return Ok(Token::Dot); } }; @@ -1990,11 +1982,11 @@ impl<'a> Lexer<'a> { }); } - self.bump(); // 1st `.` + self.bump(1); // 1st `.` if next == b'.' && self.input().peek() == Some(b'.') { - self.bump(); // 2nd `.` - self.bump(); // 3rd `.` + self.bump(1); // 2nd `.` + self.bump(1); // 3rd `.` return Ok(Token::DotDotDot); } @@ -2007,7 +1999,7 @@ impl<'a> Lexer<'a> { /// This is extracted as a method to reduce size of `read_token`. fn read_token_question_mark(&mut self) -> LexResult { debug_assert!(self.cur().is_some_and(|c| c == b'?')); - self.bump(); + self.bump(1); if self.input_mut().eat_byte(b'?') { if self.input_mut().eat_byte(b'=') { Ok(Token::NullishEq) @@ -2024,7 +2016,7 @@ impl<'a> Lexer<'a> { /// This is extracted as a method to reduce size of `read_token`. fn read_token_colon(&mut self) -> LexResult { debug_assert!(self.cur().is_some_and(|c| c == b':')); - self.bump(); // ':' + self.bump(1); // ':' Ok(Token::Colon) } @@ -2062,7 +2054,7 @@ impl<'a> Lexer<'a> { let had_line_break_before_last = self.had_line_break_before_last(); let start = self.cur_pos(); - self.bump(); + self.bump(1); let token = if is_bit_and { Token::Ampersand } else { @@ -2081,10 +2073,10 @@ impl<'a> Lexer<'a> { // '||', '&&' if self.input().cur() == Some(C) { - self.bump(); + self.bump(1); if self.input().cur() == Some(b'=') { - self.bump(); + self.bump(1); return Ok(if is_bit_and { Token::LogicalAndEq @@ -2120,7 +2112,7 @@ impl<'a> Lexer<'a> { /// This is extracted as a method to reduce size of `read_token`. fn read_token_mul_mod(&mut self) -> LexResult { debug_assert!(self.cur().is_some_and(|c| c == b'*' || c == b'%')); - self.bump(); + self.bump(1); let token = if IS_MUL { if self.input_mut().eat_byte(b'*') { // `**` @@ -2148,7 +2140,7 @@ impl<'a> Lexer<'a> { fn read_slash(&mut self) -> LexResult { debug_assert_eq!(self.cur(), Some(b'/')); - self.bump(); // '/' + self.bump(1); // '/' Ok(if self.eat(b'=') { Token::DivEq } else { @@ -2179,7 +2171,7 @@ impl<'a> Lexer<'a> { let start = self.cur_pos(); let quote = self.cur().unwrap(); - self.bump(); // '"' or '\'' + self.bump(1); // '"' or '\'' let mut slice_start = self.input().cur_pos(); @@ -2231,7 +2223,7 @@ impl<'a> Lexer<'a> { self.wtf8_atom(Wtf8::from_str(s)) }; - self.bump(); // cur is quote + self.bump(1); // cur is quote let end = self.cur_pos(); let raw = unsafe { @@ -2286,7 +2278,7 @@ impl<'a> Lexer<'a> { self, )); } - _ => self.bump(), + _ => self.bump(1), } } } @@ -2324,7 +2316,7 @@ impl<'a> Lexer<'a> { // Fast path: try to scan ASCII identifier using byte_search // Performance optimization: check if first char disqualifies as keyword // Advance past first byte - self.bump(); + self.bump(1); // Use byte_search to quickly scan to end of ASCII identifier let next_byte = byte_search! { diff --git a/crates/swc_ecma_parser/src/lexer/state.rs b/crates/swc_ecma_parser/src/lexer/state.rs index e52ebdd07227..dd02ff45f464 100644 --- a/crates/swc_ecma_parser/src/lexer/state.rs +++ b/crates/swc_ecma_parser/src/lexer/state.rs @@ -248,7 +248,7 @@ impl crate::input::Tokens for Lexer<'_> { while let Some(ch) = self.input().cur() { if ch == b'-' { v.push(ch as char); - self.bump(); + self.bump(1); } else { let old_pos = self.cur_pos(); v.push_str(&self.scan_identifier_parts()); @@ -464,7 +464,13 @@ impl Lexer<'_> { chunk_start = self.input.cur_pos(); } } else { - self.bump(); + let len = if ch < 0x80 { + 1 // ASCII + } else { + // For multi-byte UTF-8, get the full character + self.input().cur_as_char().unwrap().len_utf8() + }; + self.bump(len); } } @@ -514,12 +520,12 @@ impl Lexer<'_> { v.push(ch as char); self.input_mut().bump_bytes(1); } else if ch == b'\\' { - self.bump(); // bump '\' + self.bump(1); // bump '\' if !self.is(b'u') { self.emit_error(self.cur_pos(), SyntaxError::InvalidUnicodeEscape); continue; } - self.bump(); // bump 'u' + self.bump(1); // bump 'u' let Ok(value) = self.read_unicode_escape() else { self.emit_error(self.cur_pos(), SyntaxError::InvalidUnicodeEscape); break; @@ -538,7 +544,7 @@ impl Lexer<'_> { if let Some(c) = self.input().cur_as_char() { if c.is_ident_part() { v.push(c); - self.bump(); + self.bump(c.len_utf8()); } else { break; } diff --git a/crates/swc_ecma_parser/src/lexer/table.rs b/crates/swc_ecma_parser/src/lexer/table.rs index 066d580fbe83..76e8e95f2dfb 100644 --- a/crates/swc_ecma_parser/src/lexer/table.rs +++ b/crates/swc_ecma_parser/src/lexer/table.rs @@ -48,7 +48,7 @@ const ERR: ByteHandler = |lexer| { }; let start = lexer.cur_pos(); - lexer.bump(); + lexer.bump(1); lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })? }; @@ -312,7 +312,7 @@ const UNI: ByteHandler = |lexer| { } let start = lexer.cur_pos(); - lexer.bump(); + lexer.bump(1); lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })? }; diff --git a/crates/swc_ecma_parser/src/lexer/whitespace.rs b/crates/swc_ecma_parser/src/lexer/whitespace.rs index e423fc7457ed..46cd7319ec73 100644 --- a/crates/swc_ecma_parser/src/lexer/whitespace.rs +++ b/crates/swc_ecma_parser/src/lexer/whitespace.rs @@ -133,10 +133,10 @@ const UNI: ByteHandler = |lexer| { }; if is_irregular_whitespace(c) { - lexer.bump(); + lexer.bump(c.len_utf8()); true } else if is_irregular_line_terminator(c) { - lexer.bump(); + lexer.bump(c.len_utf8()); lexer.state.mark_had_line_break(); true } else { From 49dce8a6cb88ed68830231850627a6e2e8f3da2b Mon Sep 17 00:00:00 2001 From: DongYun Kang Date: Wed, 26 Nov 2025 17:46:43 +0900 Subject: [PATCH 11/20] swc_html_parser --- crates/swc_html_parser/src/lexer/mod.rs | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/crates/swc_html_parser/src/lexer/mod.rs b/crates/swc_html_parser/src/lexer/mod.rs index 552d2c6934ab..f4b9facfa1ba 100644 --- a/crates/swc_html_parser/src/lexer/mod.rs +++ b/crates/swc_html_parser/src/lexer/mod.rs @@ -248,21 +248,11 @@ where } #[inline(always)] - fn consume(&mut self) { + fn consume(&mut self, len: usize) { self.cur = self.input.cur(); self.cur_pos = self.input.cur_pos(); - if let Some(byte) = self.cur { - // Calculate the number of bytes in this UTF-8 character - let len = if byte < 0x80 { - 1 // ASCII - } else if byte < 0xe0 { - 2 // 2-byte UTF-8 - } else if byte < 0xf0 { - 3 // 3-byte UTF-8 - } else { - 4 // 4-byte UTF-8 - }; + if self.cur.is_some() { self.input.bump_bytes(len); } } @@ -291,17 +281,20 @@ where let c = self.next(); // Store the full UTF-8 character before consuming (for helper functions) - if let Some(byte) = c { + let len = if let Some(byte) = c { if is_non_ascii(byte) { self.current_char = self.input.cur_as_char(); + self.current_char.map(|c| c.len_utf8()).unwrap_or(1) } else { self.current_char = Some(byte as char); + 1 } } else { self.current_char = None; - } + 1 + }; - self.consume(); + self.consume(len); c } From 68bf6d906b9b4d7e0642ea279930fd732fe34630 Mon Sep 17 00:00:00 2001 From: DongYun Kang Date: Wed, 26 Nov 2025 17:59:08 +0900 Subject: [PATCH 12/20] consume(len) --- crates/swc_css_parser/src/lexer/mod.rs | 160 +++++++++++++++++-------- 1 file changed, 111 insertions(+), 49 deletions(-) diff --git a/crates/swc_css_parser/src/lexer/mod.rs b/crates/swc_css_parser/src/lexer/mod.rs index 53376b2590f2..33960362f90f 100644 --- a/crates/swc_css_parser/src/lexer/mod.rs +++ b/crates/swc_css_parser/src/lexer/mod.rs @@ -214,23 +214,13 @@ where } #[inline(always)] - fn consume(&mut self) -> Option { + fn consume(&mut self, len: usize) -> Option { let cur = self.input.cur(); self.cur = cur; self.cur_pos = self.input.last_pos(); - if let Some(byte) = cur { - // Calculate the number of bytes in this UTF-8 character - let len = if byte < 0x80 { - 1 // ASCII - } else if byte < 0xe0 { - 2 // 2-byte UTF-8 - } else if byte < 0xf0 { - 3 // 3-byte UTF-8 - } else { - 4 // 4-byte UTF-8 - }; + if cur.is_some() { self.input.bump_bytes(len); } @@ -265,7 +255,16 @@ where } // Consume the next input code point. - match self.consume() { + let byte_len = if let Some(b) = self.input.cur() { + if b < 0x80 { + 1 // ASCII + } else { + self.input.cur_as_char().map(|c| c.len_utf8()).unwrap_or(1) + } + } else { + 1 + }; + match self.consume(byte_len) { // whitespace // Consume as much whitespace as possible. Return a . Some(c) if is_whitespace(c) => self.with_buf(|l, buf| { @@ -276,7 +275,7 @@ where match c { Some(c) if is_whitespace(c) => { - l.consume(); + l.consume(1); buf.push(c as char); } @@ -364,8 +363,8 @@ where // Otherwise, if the next 2 input code points are U+002D HYPHEN-MINUS U+003E // GREATER-THAN SIGN (->), consume them and return a . else if self.next() == Some(b'-') && self.next_next() == Some(b'>') { - self.consume(); - self.consume(); + self.consume(1); // - + self.consume(1); // > return Ok(Token::CDC); } @@ -410,9 +409,9 @@ where && self.next_next() == Some(b'-') && self.next_next_next() == Some(b'-') { - self.consume(); // ! - self.consume(); // - - self.consume(); // - + self.consume(1); // ! + self.consume(1); // - + self.consume(1); // - return Ok(tok!(" @@ -216,7 +218,7 @@ impl<'a> Lexer<'a> { unsafe { // Safety: cur() is Some(c) if this method is called. - self.input.bump(); + self.input.bump_bytes(1); } Ok(if self.input.eat_byte(b'=') { diff --git a/crates/swc_ecma_lexer/src/lexer/state.rs b/crates/swc_ecma_lexer/src/lexer/state.rs index 7731423f510b..f478e55f5cc8 100644 --- a/crates/swc_ecma_lexer/src/lexer/state.rs +++ b/crates/swc_ecma_lexer/src/lexer/state.rs @@ -812,7 +812,7 @@ impl Lexer<'_> { if c == '>' { unsafe { // Safety: cur() is Some('>') - self.input.bump(); + self.input.bump_bytes(1); } return Ok(Token::JSXTagEnd); } @@ -830,7 +830,7 @@ impl Lexer<'_> { unsafe { // Safety: cur() is Some('<') - self.input.bump(); + self.input.bump_bytes(1); } if had_line_break_before_last && self.is_str("<<<<<< ") { diff --git a/crates/swc_ecma_lexer/src/lexer/table.rs b/crates/swc_ecma_lexer/src/lexer/table.rs index bd8b16aa6c26..812f126ebdae 100644 --- a/crates/swc_ecma_lexer/src/lexer/table.rs +++ b/crates/swc_ecma_lexer/src/lexer/table.rs @@ -41,7 +41,9 @@ pub(super) static BYTE_HANDLERS: [ByteHandler; 256] = [ const ___: ByteHandler = None; const EOF: ByteHandler = Some(|lexer| { - lexer.input.bump_bytes(1); + unsafe { + lexer.input.bump_bytes(1); + } Ok(Token::Eof) }); @@ -56,7 +58,7 @@ const ERR: ByteHandler = Some(|lexer| { let start = lexer.cur_pos(); unsafe { // Safety: Byte handler is only called for non-last characters - lexer.input.bump(); + lexer.input.bump_bytes(c.len_utf8()); } lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })? }); @@ -379,7 +381,7 @@ const UNI: ByteHandler = Some(|lexer| { let start = lexer.cur_pos(); unsafe { // Safety: Byte handler is only called for non-last characters - lexer.input.bump(); + lexer.input.bump_bytes(c.len_utf8()); } lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })? }); @@ -405,7 +407,9 @@ const PIP: ByteHandler = Some(|lexer| lexer.read_token_logical::()); macro_rules! single_char { ($name:ident, $c:literal, $token:ident) => { const $name: ByteHandler = Some(|lexer| { - lexer.input.bump_bytes(1); + unsafe { + lexer.input.bump_bytes(1); + } Ok(Token::$token) }); }; @@ -429,9 +433,13 @@ single_char!(BEC, b'}', RBrace); /// `^` const CRT: ByteHandler = Some(|lexer| { // Bitwise xor - lexer.input.bump_bytes(1); - Ok(if lexer.input.cur_as_ascii() == Some(b'=') { + unsafe { lexer.input.bump_bytes(1); + } + Ok(if lexer.input.cur_as_ascii() == Some(b'=') { + unsafe { + lexer.input.bump_bytes(1); + } Token::AssignOp(AssignOp::BitXorAssign) } else { Token::BinOp(BinOpToken::BitXor) From 91130b49078f3be6ec27c7ba52774c27cfd36fda Mon Sep 17 00:00:00 2001 From: DongYun Kang Date: Thu, 27 Nov 2025 08:12:44 +0900 Subject: [PATCH 18/20] fix(lexer): Fix bump() to handle multibyte UTF-8 characters correctly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bump() helper was only advancing by 1 byte, which broke multibyte Unicode characters. Now it properly gets the current character and bumps by its full UTF-8 byte length. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- crates/swc_ecma_lexer/src/common/lexer/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/swc_ecma_lexer/src/common/lexer/mod.rs b/crates/swc_ecma_lexer/src/common/lexer/mod.rs index 9861e1f3662b..3a1b0c5da554 100644 --- a/crates/swc_ecma_lexer/src/common/lexer/mod.rs +++ b/crates/swc_ecma_lexer/src/common/lexer/mod.rs @@ -188,8 +188,9 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { #[inline(always)] fn bump(&mut self) { + let c = self.cur_as_char().unwrap(); unsafe { - self.input_mut().bump_bytes(1); + self.input_mut().bump_bytes(c.len_utf8()); } } From 73519b8ec28fa9b0d2302d975c36d44c0b184c1f Mon Sep 17 00:00:00 2001 From: DongYun Kang Date: Thu, 27 Nov 2025 08:33:15 +0900 Subject: [PATCH 19/20] fmt --- crates/swc_ecma_lexer/src/common/lexer/mod.rs | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/crates/swc_ecma_lexer/src/common/lexer/mod.rs b/crates/swc_ecma_lexer/src/common/lexer/mod.rs index 3a1b0c5da554..af63c0cbec37 100644 --- a/crates/swc_ecma_lexer/src/common/lexer/mod.rs +++ b/crates/swc_ecma_lexer/src/common/lexer/mod.rs @@ -619,8 +619,8 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { } // Ignore this _ character - // Safety: cur() returns Some(c) where c is a valid char - self.bump(); + // Safety: cur() returns Some(c) where c is a valid char + self.bump(); continue; } @@ -1067,8 +1067,8 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { fn read_jsx_str(&mut self, quote: char) -> LexResult { debug_assert!(self.syntax().jsx()); let start = self.input().cur_pos(); - // Safety: cur() was Some(quote) - self.bump(); // `quote` + // Safety: cur() was Some(quote) + self.bump(); // `quote` let mut out = String::new(); let mut chunk_start = self.input().cur_pos(); loop { @@ -1132,8 +1132,8 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { chunk_start = cur_pos + BytePos(ch.len_utf8() as _); } else { - // Safety: cur() was Some(ch) - self.bump(); + // Safety: cur() was Some(ch) + self.bump(); } } let cur_pos = self.input().cur_pos(); @@ -1569,8 +1569,8 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { _ => c, }; - // Safety: cur() is Some(c) if this method is called. - self.bump(); + // Safety: cur() is Some(c) if this method is called. + self.bump(); Ok(CodePoint::from_u32(c as u32)) } @@ -1931,8 +1931,8 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { let had_line_break_before_last = self.had_line_break_before_last(); let start = self.cur_pos(); - // Safety: cur() is Some(c as char) - self.bump(); + // Safety: cur() is Some(c as char) + self.bump(); let token = if is_bit_and { Self::Token::BIT_AND } else { @@ -1951,12 +1951,12 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { // '||', '&&' if self.input().cur() == Some(C) { - // Safety: cur() is Some(c) - self.bump(); + // Safety: cur() is Some(c) + self.bump(); if self.input().cur() == Some(b'=') { - // Safety: cur() is Some('=') - self.bump(); + // Safety: cur() is Some('=') + self.bump(); return Ok(if is_bit_and { Self::Token::LOGICAL_AND_EQ @@ -2105,8 +2105,8 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { self.wtf8_atom(Wtf8::from_str(s)) }; - // Safety: cur is quote - self.bump(); + // Safety: cur is quote + self.bump(); let end = self.cur_pos(); let raw = unsafe { From 3b40a249c246e28967e8858ecb8d8219349ebf25 Mon Sep 17 00:00:00 2001 From: DongYun Kang Date: Thu, 27 Nov 2025 08:33:35 +0900 Subject: [PATCH 20/20] drop --- Cargo.lock | 1 - crates/swc_common/Cargo.toml | 41 ++++++++++++++++++------------------ 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8705c92cdf85..a43b0927cc69 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5476,7 +5476,6 @@ dependencies = [ "cbor4ii", "either", "from_variant", - "new_debug_unreachable", "num-bigint", "once_cell", "par-iter", diff --git a/crates/swc_common/Cargo.toml b/crates/swc_common/Cargo.toml index da0b75aaa3ea..4ce639a24690 100644 --- a/crates/swc_common/Cargo.toml +++ b/crates/swc_common/Cargo.toml @@ -51,27 +51,26 @@ shrink-to-fit = ["dep:shrink-to-fit", "swc_atoms/shrink-to-fit"] [dependencies] -anyhow = { workspace = true } -arbitrary = { workspace = true, features = ["derive"], optional = true } -bytecheck = { workspace = true, optional = true } -bytes-str = { workspace = true, features = ["serde"] } -cbor4ii = { workspace = true, features = ["use_std"], optional = true } -either = { workspace = true } -new_debug_unreachable = { workspace = true } -num-bigint = { workspace = true } -once_cell = { workspace = true } -parking_lot = { workspace = true, optional = true } -rancor = { workspace = true, optional = true } -rkyv = { workspace = true, optional = true } -rustc-hash = { workspace = true } -serde = { workspace = true, features = ["derive"] } -shrink-to-fit = { workspace = true, optional = true } -siphasher = { workspace = true } -swc_sourcemap = { workspace = true, optional = true } -termcolor = { workspace = true, optional = true } -tracing = { workspace = true } -unicode-width = { workspace = true } -url = { workspace = true } +anyhow = { workspace = true } +arbitrary = { workspace = true, features = ["derive"], optional = true } +bytecheck = { workspace = true, optional = true } +bytes-str = { workspace = true, features = ["serde"] } +cbor4ii = { workspace = true, features = ["use_std"], optional = true } +either = { workspace = true } +num-bigint = { workspace = true } +once_cell = { workspace = true } +parking_lot = { workspace = true, optional = true } +rancor = { workspace = true, optional = true } +rkyv = { workspace = true, optional = true } +rustc-hash = { workspace = true } +serde = { workspace = true, features = ["derive"] } +shrink-to-fit = { workspace = true, optional = true } +siphasher = { workspace = true } +swc_sourcemap = { workspace = true, optional = true } +termcolor = { workspace = true, optional = true } +tracing = { workspace = true } +unicode-width = { workspace = true } +url = { workspace = true } ast_node = { version = "5.0.0", path = "../ast_node" } better_scoped_tls = { version = "1.0.1", path = "../better_scoped_tls" }