From 657abf14c68efe4fc49476dc78caa9c860b1baa9 Mon Sep 17 00:00:00 2001
From: DongYun Kang <kdy.1997.dev@gmail.com>
Date: Tue, 18 Nov 2025 11:05:07 -0500
Subject: [PATCH 01/20] check: Work on byte

---
 crates/swc_common/src/input.rs                | 124 ++++++++-------
 .../swc_ecma_lexer/src/common/lexer/char.rs   |  20 ++-
 crates/swc_ecma_lexer/src/common/lexer/mod.rs |  11 +-
 crates/swc_ecma_lexer/src/lexer/table.rs      |  16 +-
 crates/swc_ecma_parser/src/lexer/char_ext.rs  |  20 ++-
 crates/swc_ecma_parser/src/lexer/mod.rs       | 143 ++++++++++--------
 crates/swc_ecma_parser/src/lexer/state.rs     |  73 +++++----
 crates/swc_ecma_parser/src/lexer/table.rs     |  18 ++-
 .../swc_ecma_parser/src/lexer/whitespace.rs   |  32 ++--
 9 files changed, 270 insertions(+), 187 deletions(-)
diff --git a/crates/swc_common/src/input.rs b/crates/swc_common/src/input.rs
index 6cd4d8e35e83..86465b956932 100644
--- a/crates/swc_common/src/input.rs
+++ b/crates/swc_common/src/input.rs
@@ -10,8 +10,8 @@ pub type SourceFileInput<'a> = StringInput<'a>;
 #[derive(Clone)]
 pub struct StringInput<'a> {
     last_pos: BytePos,
-    /// Current cursor
-    iter: str::Chars<'a>,
+    /// Remaining input as str - we slice this as we consume bytes
+    remaining: &'a str,
     orig: &'a str,
     /// Original start position.
     orig_start: BytePos,
@@ -33,7 +33,7 @@ impl<'a> StringInput<'a> {
         StringInput {
             last_pos: start,
             orig: src,
-            iter: src.chars(),
+            remaining: src,
             orig_start: start,
             orig_end: end,
         }
@@ -41,7 +41,7 @@ impl<'a> StringInput<'a> {
 
     #[inline(always)]
     pub fn as_str(&self) -> &str {
-        self.iter.as_str()
+        self.remaining
     }
 
     #[inline(always)]
@@ -68,21 +68,22 @@ impl<'a> StringInput<'a> {
 
         let ret = unsafe { s.get_unchecked(start_idx..end_idx) };
 
-        self.iter = unsafe { s.get_unchecked(end_idx..) }.chars();
+        self.remaining = unsafe { s.get_unchecked(end_idx..) };
 
         ret
     }
 
     #[inline]
     pub fn bump_bytes(&mut self, n: usize) {
-        let s = self.iter.as_str();
-        self.iter = unsafe { s.get_unchecked(n..) }.chars();
+        debug_assert!(n <= self.remaining.len());
+        self.remaining = unsafe { self.remaining.get_unchecked(n..) };
         self.last_pos.0 += n as u32;
     }
 
     #[inline]
     pub fn bump_one(&mut self) {
-        if self.iter.next().is_some() {
+        if !self.remaining.is_empty() {
+            self.remaining = unsafe { self.remaining.get_unchecked(1..) };
             self.last_pos.0 += 1;
         } else {
             unsafe {
@@ -114,41 +115,49 @@ impl<'a> From<&'a SourceFile> for StringInput<'a> {
 
 impl<'a> Input<'a> for StringInput<'a> {
     #[inline]
-    fn cur(&self) -> Option<char> {
-        self.iter.clone().next()
+    fn cur(&self) -> Option<u8> {
+        self.remaining.as_bytes().first().copied()
     }
 
     #[inline]
-    fn peek(&self) -> Option<char> {
-        let mut iter = self.iter.clone();
-        // https://github.com/rust-lang/rust/blob/1.86.0/compiler/rustc_lexer/src/cursor.rs#L56 say `next` is faster.
-        iter.next();
-        iter.next()
+    fn peek(&self) -> Option<u8> {
+        self.remaining.as_bytes().get(1).copied()
     }
 
     #[inline]
-    fn peek_ahead(&self) -> Option<char> {
-        let mut iter = self.iter.clone();
-        // https://github.com/rust-lang/rust/blob/1.86.0/compiler/rustc_lexer/src/cursor.rs#L56 say `next` is faster
-        iter.next();
-        iter.next();
-        iter.next()
+    fn peek_ahead(&self) -> Option<u8> {
+        self.remaining.as_bytes().get(2).copied()
     }
 
     #[inline]
     unsafe fn bump(&mut self) {
-        if let Some(c) = self.iter.next() {
-            self.last_pos = self.last_pos + BytePos((c.len_utf8()) as u32);
-        } else {
+        let bytes = self.remaining.as_bytes();
+        if bytes.is_empty() {
             unsafe {
                 debug_unreachable!("bump should not be called when cur() == None");
             }
         }
+
+        let first_byte = unsafe { *bytes.get_unchecked(0) };
+
+        // Calculate the number of bytes in this UTF-8 character
+        let len = if first_byte < 0x80 {
+            1 // ASCII
+        } else if first_byte < 0xe0 {
+            2 // 2-byte UTF-8
+        } else if first_byte < 0xf0 {
+            3 // 3-byte UTF-8
+        } else {
+            4 // 4-byte UTF-8
+        };
+
+        self.remaining = unsafe { self.remaining.get_unchecked(len..) };
+        self.last_pos = self.last_pos + BytePos(len as u32);
     }
 
     #[inline]
     fn cur_as_ascii(&self) -> Option<u8> {
-        let first_byte = *self.as_str().as_bytes().first()?;
+        let first_byte = *self.remaining.as_bytes().first()?;
         if first_byte <= 0x7f {
             Some(first_byte)
         } else {
@@ -156,6 +165,11 @@ impl<'a> Input<'a> for StringInput<'a> {
         }
     }
 
+    #[inline]
+    fn cur_as_char(&self) -> Option<char> {
+        self.remaining.chars().next()
+    }
+
     #[inline]
     fn is_at_start(&self) -> bool {
         self.orig_start == self.last_pos
@@ -184,7 +198,7 @@ impl<'a> Input<'a> for StringInput<'a> {
 
         let ret = unsafe { s.get_unchecked(start_idx..end_idx) };
 
-        self.iter = unsafe { s.get_unchecked(end_idx..) }.chars();
+        self.remaining = unsafe { s.get_unchecked(end_idx..) };
         self.last_pos = end;
 
         ret
@@ -197,7 +211,7 @@ impl<'a> Input<'a> for StringInput<'a> {
     {
         let last = {
             let mut last = 0;
-            for c in self.iter.clone() {
+            for c in self.remaining.chars() {
                 if pred(c) {
                     last += c.len_utf8();
                 } else {
@@ -207,12 +221,11 @@ impl<'a> Input<'a> for StringInput<'a> {
             last
         };
 
-        let s = self.iter.as_str();
-        debug_assert!(last <= s.len());
-        let ret = unsafe { s.get_unchecked(..last) };
+        debug_assert!(last <= self.remaining.len());
+        let ret = unsafe { self.remaining.get_unchecked(..last) };
 
         self.last_pos = self.last_pos + BytePos(last as _);
-        self.iter = unsafe { s.get_unchecked(last..) }.chars();
+        self.remaining = unsafe { self.remaining.get_unchecked(last..) };
 
         ret
     }
@@ -228,15 +241,13 @@ impl<'a> Input<'a> for StringInput<'a> {
         let idx = (to - self.orig_start).0 as usize;
 
         debug_assert!(idx <= orig.len());
-        let s = unsafe { orig.get_unchecked(idx..) };
-        self.iter = s.chars();
+        self.remaining = unsafe { orig.get_unchecked(idx..) };
         self.last_pos = to;
     }
 
     #[inline]
     fn is_byte(&self, c: u8) -> bool {
-        self.iter
-            .as_str()
+        self.remaining
             .as_bytes()
             .first()
             .map(|b| *b == c)
@@ -245,13 +256,13 @@ impl<'a> Input<'a> for StringInput<'a> {
 
     #[inline]
     fn is_str(&self, s: &str) -> bool {
-        self.as_str().starts_with(s)
+        self.remaining.starts_with(s)
     }
 
     #[inline]
     fn eat_byte(&mut self, c: u8) -> bool {
         if self.is_byte(c) {
-            self.iter.next();
+            self.remaining = unsafe { self.remaining.get_unchecked(1..) };
             self.last_pos = self.last_pos + BytePos(1_u32);
             true
         } else {
@@ -261,9 +272,14 @@ impl<'a> Input<'a> for StringInput<'a> {
 }
 
 pub trait Input<'a>: Clone {
-    fn cur(&self) -> Option<char>;
-    fn peek(&self) -> Option<char>;
-    fn peek_ahead(&self) -> Option<char>;
+    /// Returns the current byte. Returns [None] if at end of input.
+    fn cur(&self) -> Option<u8>;
+
+    /// Returns the next byte without consuming the current byte.
+    fn peek(&self) -> Option<u8>;
+
+    /// Returns the byte after the next byte without consuming anything.
+    fn peek_ahead(&self) -> Option<u8>;
 
     /// # Safety
     ///
@@ -271,18 +287,20 @@ pub trait Input<'a>: Clone {
     /// when the Input is not empty.
     unsafe fn bump(&mut self);
 
-    /// Returns [None] if it's end of input **or** current character is not an
-    /// ascii character.
+    /// Returns the current byte as ASCII if it's valid ASCII (0x00-0x7F).
+    /// Returns [None] if it's end of input or if the byte is not ASCII.
     #[inline]
     fn cur_as_ascii(&self) -> Option<u8> {
-        self.cur().and_then(|i| {
-            if i.is_ascii() {
-                return Some(i as u8);
-            }
-            None
-        })
+        self.cur()
+            .and_then(|b| if b <= 0x7f { Some(b) } else { None })
     }
 
+    /// Returns the current position as a UTF-8 char for cases where we need
+    /// full character processing (identifiers, strings, etc).
+    /// Returns [None] if at end of input or if the bytes don't form valid
+    /// UTF-8.
+    fn cur_as_char(&self) -> Option<char>;
+
     fn is_at_start(&self) -> bool;
 
     fn cur_pos(&self) -> BytePos;
@@ -306,16 +324,12 @@ pub trait Input<'a>: Clone {
     /// - `to` be in the valid range of input.
     unsafe fn reset_to(&mut self, to: BytePos);
 
-    /// Implementors can override the method to make it faster.
-    ///
-    /// `c` must be ASCII.
+    /// Check if the current byte equals the given byte.
+    /// `c` should typically be an ASCII byte for performance.
     #[inline]
     #[allow(clippy::wrong_self_convention)]
     fn is_byte(&self, c: u8) -> bool {
-        match self.cur() {
-            Some(ch) => ch == c as char,
-            _ => false,
-        }
+        self.cur() == Some(c)
     }
 
     /// Implementors can override the method to make it faster.
diff --git a/crates/swc_ecma_lexer/src/common/lexer/char.rs b/crates/swc_ecma_lexer/src/common/lexer/char.rs
index 705a3fd05f70..62f4e4d08a49 100644
--- a/crates/swc_ecma_lexer/src/common/lexer/char.rs
+++ b/crates/swc_ecma_lexer/src/common/lexer/char.rs
@@ -1,8 +1,8 @@
-/// Implemented for `char`.
+/// Implemented for `u8` - operates on bytes for performance.
 pub trait CharExt: Copy {
     fn to_char(self) -> Option<char>;
 
-    /// Test whether a given character code starts an identifier.
+    /// Test whether a given byte/character starts an identifier.
     ///
     /// https://tc39.github.io/ecma262/#prod-IdentifierStart
     #[inline]
@@ -14,7 +14,7 @@ pub trait CharExt: Copy {
         swc_ecma_ast::Ident::is_valid_start(c)
     }
 
-    /// Test whether a given character is part of an identifier.
+    /// Test whether a given byte/character is part of an identifier.
     #[inline]
     fn is_ident_part(self) -> bool {
         let c = match self.to_char() {
@@ -65,6 +65,20 @@ pub trait CharExt: Copy {
     }
 }
 
+impl CharExt for u8 {
+    #[inline(always)]
+    fn to_char(self) -> Option<char> {
+        // For ASCII bytes, this is a fast path
+        if self <= 0x7f {
+            Some(self as char)
+        } else {
+            // For non-ASCII bytes, we can't convert a single byte to a char
+            // The caller should use cur_as_char() on the Input trait instead
+            None
+        }
+    }
+}
+
 impl CharExt for char {
     #[inline(always)]
     fn to_char(self) -> Option<char> {
diff --git a/crates/swc_ecma_lexer/src/common/lexer/mod.rs b/crates/swc_ecma_lexer/src/common/lexer/mod.rs
index 852877860f3f..a81ee56cd384 100644
--- a/crates/swc_ecma_lexer/src/common/lexer/mod.rs
+++ b/crates/swc_ecma_lexer/src/common/lexer/mod.rs
@@ -175,20 +175,25 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
     }
 
     #[inline(always)]
-    fn cur(&self) -> Option<char> {
+    fn cur(&self) -> Option<u8> {
         self.input().cur()
     }
 
     #[inline(always)]
-    fn peek(&self) -> Option<char> {
+    fn peek(&self) -> Option<u8> {
         self.input().peek()
     }
 
     #[inline(always)]
-    fn peek_ahead(&self) -> Option<char> {
+    fn peek_ahead(&self) -> Option<u8> {
         self.input().peek_ahead()
     }
 
+    #[inline(always)]
+    fn cur_as_char(&self) -> Option<char> {
+        self.input().cur_as_char()
+    }
+
     #[inline(always)]
     fn cur_pos(&self) -> BytePos {
         self.input().cur_pos()
diff --git a/crates/swc_ecma_lexer/src/lexer/table.rs b/crates/swc_ecma_lexer/src/lexer/table.rs
index 798c5194a371..f6c347bba074 100644
--- a/crates/swc_ecma_lexer/src/lexer/table.rs
+++ b/crates/swc_ecma_lexer/src/lexer/table.rs
@@ -48,13 +48,14 @@ const EOF: ByteHandler = Some(|lexer| {
 
 const ERR: ByteHandler = Some(|lexer| {
     let c = unsafe {
-        // Safety: Byte handler is only called for non-last chracters
-        lexer.input.cur().unwrap_unchecked()
+        // Safety: Byte handler is only called for non-last characters
+        // Get the char representation for error messages
+        lexer.cur_as_char().unwrap_unchecked()
     };
 
     let start = lexer.cur_pos();
     unsafe {
-        // Safety: Byte handler is only called for non-last chracters
+        // Safety: Byte handler is only called for non-last characters
         lexer.input.bump();
     }
     lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })?
@@ -361,11 +362,12 @@ const DIG: ByteHandler = Some(|lexer| {
 /// String literals with `'` or `"`
 const QOT: ByteHandler = Some(|lexer| lexer.read_str_lit());
 
-/// Unicode
+/// Unicode - handles multi-byte UTF-8 sequences
 const UNI: ByteHandler = Some(|lexer| {
     let c = unsafe {
-        // Safety: Byte handler is only called for non-last chracters
-        lexer.input.cur().unwrap_unchecked()
+        // Safety: Byte handler is only called for non-last characters
+        // For non-ASCII bytes, we need the full char
+        lexer.cur_as_char().unwrap_unchecked()
     };
 
     // Identifier or keyword. '\uXXXX' sequences are allowed in
@@ -376,7 +378,7 @@ const UNI: ByteHandler = Some(|lexer| {
 
     let start = lexer.cur_pos();
     unsafe {
-        // Safety: Byte handler is only called for non-last chracters
+        // Safety: Byte handler is only called for non-last characters
         lexer.input.bump();
     }
     lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })?
diff --git a/crates/swc_ecma_parser/src/lexer/char_ext.rs b/crates/swc_ecma_parser/src/lexer/char_ext.rs
index 8f7dc37c2573..873dc36f94d0 100644
--- a/crates/swc_ecma_parser/src/lexer/char_ext.rs
+++ b/crates/swc_ecma_parser/src/lexer/char_ext.rs
@@ -1,8 +1,8 @@
-/// Implemented for `char`.
+/// Implemented for `u8` and `char` - operates on bytes for performance.
 pub trait CharExt: Copy {
     fn to_char(self) -> Option<char>;
 
-    /// Test whether a given character code starts an identifier.
+    /// Test whether a given byte/character starts an identifier.
     ///
     /// https://tc39.github.io/ecma262/#prod-IdentifierStart
     #[inline]
@@ -14,7 +14,7 @@ pub trait CharExt: Copy {
         swc_ecma_ast::Ident::is_valid_start(c)
     }
 
-    /// Test whether a given character is part of an identifier.
+    /// Test whether a given byte/character is part of an identifier.
     #[inline]
     fn is_ident_part(self) -> bool {
         let c = match self.to_char() {
@@ -35,6 +35,20 @@ pub trait CharExt: Copy {
     }
 }
 
+impl CharExt for u8 {
+    #[inline(always)]
+    fn to_char(self) -> Option<char> {
+        // For ASCII bytes, this is a fast path
+        if self <= 0x7f {
+            Some(self as char)
+        } else {
+            // For non-ASCII bytes, we can't convert a single byte to a char
+            // The caller should use cur_as_char() on the Input trait instead
+            None
+        }
+    }
+}
+
 impl CharExt for char {
     #[inline(always)]
     fn to_char(self) -> Option<char> {
diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs
index ec2ca65313cb..a802304c98eb 100644
--- a/crates/swc_ecma_parser/src/lexer/mod.rs
+++ b/crates/swc_ecma_parser/src/lexer/mod.rs
@@ -250,7 +250,7 @@ impl<'a> Lexer<'a> {
         }
 
         // '++', '--'
-        Ok(if self.input.cur() == Some(C as char) {
+        Ok(if self.input.cur() == Some(C) {
             unsafe {
                 // Safety: cur() is Some(c)
                 self.input.bump();
@@ -344,7 +344,10 @@ impl Lexer<'_> {
         }
 
         // XML style comment. `<!--`
-        if C == b'<' && self.is(b'!') && self.peek() == Some('-') && self.peek_ahead() == Some('-')
+        if C == b'<'
+            && self.is(b'!')
+            && self.peek() == Some(b'-')
+            && self.peek_ahead() == Some(b'-')
         {
             self.skip_line_comment(3);
             self.skip_space();
@@ -356,7 +359,7 @@ impl Lexer<'_> {
         let mut op = if C == b'<' { Token::Lt } else { Token::Gt };
 
         // '<<', '>>'
-        if self.cur() == Some(C as char) {
+        if self.cur() == Some(C) {
             self.bump();
             op = if C == b'<' {
                 Token::LShift
@@ -365,7 +368,7 @@ impl Lexer<'_> {
             };
 
             //'>>>'
-            if C == b'>' && self.cur() == Some(C as char) {
+            if C == b'>' && self.cur() == Some(C) {
                 self.bump();
                 op = Token::ZeroFillRShift;
             }
@@ -416,7 +419,7 @@ impl Lexer<'_> {
         start: BytePos,
         started_with_backtick: bool,
     ) -> LexResult<Token> {
-        debug_assert!(self.cur() == Some(if started_with_backtick { '`' } else { '}' }));
+        debug_assert!(self.cur() == Some(if started_with_backtick { b'`' } else { b'}' }));
         let mut cooked = Ok(Wtf8Buf::with_capacity(8));
         self.bump(); // `}` or `\``
         let mut cooked_slice_start = self.cur_pos();
@@ -440,7 +443,7 @@ impl Lexer<'_> {
         }
 
         while let Some(c) = self.cur() {
-            if c == '`' {
+            if c == b'`' {
                 consume_cooked!();
                 let cooked = cooked.map(|cooked| self.atoms.wtf8_atom(&*cooked));
                 let raw = raw_atom(self);
@@ -452,7 +455,7 @@ impl Lexer<'_> {
                     self.set_token_value(Some(TokenValue::Template { raw, cooked }));
                     Token::TemplateTail
                 });
-            } else if c == '$' && self.input.peek() == Some('{') {
+            } else if c == b'$' && self.input.peek() == Some(b'{') {
                 consume_cooked!();
                 let cooked = cooked.map(|cooked| self.atoms.wtf8_atom(&*cooked));
                 let raw = raw_atom(self);
@@ -464,7 +467,7 @@ impl Lexer<'_> {
                     self.set_token_value(Some(TokenValue::Template { raw, cooked }));
                     Token::TemplateMiddle
                 });
-            } else if c == '\\' {
+            } else if c == b'\\' {
                 consume_cooked!();
 
                 match self.read_escaped_char(true) {
@@ -483,11 +486,18 @@ impl Lexer<'_> {
             } else if c.is_line_terminator() {
                 consume_cooked!();
 
-                let c = if c == '\r' && self.peek() == Some('\n') {
+                // For line terminators, we need the full char (can be multi-byte UTF-8)
+                let c_char = if c <= 0x7f {
+                    c as char
+                } else {
+                    self.cur_as_char().unwrap()
+                };
+
+                let c = if c == b'\r' && self.peek() == Some(b'\n') {
                     self.bump(); // '\r'
                     '\n'
                 } else {
-                    match c {
+                    match c_char {
                         '\n' => '\n',
                         '\r' => '\n',
                         '\u{2028}' => '\u{2028}',
@@ -555,20 +565,25 @@ impl<'a> Lexer<'a> {
     }
 
     #[inline(always)]
-    fn cur(&self) -> Option<char> {
+    fn cur(&self) -> Option<u8> {
         self.input().cur()
     }
 
     #[inline(always)]
-    fn peek(&self) -> Option<char> {
+    fn peek(&self) -> Option<u8> {
         self.input().peek()
     }
 
     #[inline(always)]
-    fn peek_ahead(&self) -> Option<char> {
+    fn peek_ahead(&self) -> Option<u8> {
         self.input().peek_ahead()
     }
 
+    #[inline(always)]
+    fn cur_as_char(&self) -> Option<char> {
+        self.input().cur_as_char()
+    }
+
     #[inline(always)]
     fn cur_pos(&self) -> BytePos {
         self.input().cur_pos()
@@ -744,8 +759,8 @@ impl<'a> Lexer<'a> {
     fn skip_block_comment(&mut self) {
         let start = self.cur_pos();
 
-        debug_assert_eq!(self.cur(), Some('/'));
-        debug_assert_eq!(self.peek(), Some('*'));
+        debug_assert_eq!(self.cur(), Some(b'/'));
+        debug_assert_eq!(self.peek(), Some(b'*'));
 
         // Consume initial "/*"
         self.input_mut().bump_bytes(2);
@@ -883,24 +898,24 @@ impl<'a> Lexer<'a> {
         let mut prev = None;
 
         while let Some(c) = self.cur() {
-            if c == '_' {
+            if c == b'_' {
                 *has_underscore = true;
                 if allow_num_separator {
-                    let is_allowed = |c: Option<char>| {
+                    let is_allowed = |c: Option<u8>| {
                         let Some(c) = c else {
                             return false;
                         };
-                        c.is_digit(RADIX as _)
+                        (c as char).is_digit(RADIX as _)
                     };
-                    let is_forbidden = |c: Option<char>| {
+                    let is_forbidden = |c: Option<u8>| {
                         let Some(c) = c else {
                             return false;
                         };
 
                         if RADIX == 16 {
-                            matches!(c, '.' | 'X' | '_' | 'x')
+                            matches!(c, b'.' | b'X' | b'_' | b'x')
                         } else {
-                            matches!(c, '.' | 'B' | 'E' | 'O' | '_' | 'b' | 'e' | 'o')
+                            matches!(c, b'.' | b'B' | b'E' | b'O' | b'_' | b'b' | b'e' | b'o')
                         }
                     };
 
@@ -924,7 +939,7 @@ impl<'a> Lexer<'a> {
             }
 
             // e.g. (val for a) = 10  where radix = 16
-            let val = if let Some(val) = c.to_digit(RADIX as _) {
+            let val = if let Some(val) = (c as char).to_digit(RADIX as _) {
                 val
             } else {
                 return Ok(total);
@@ -1000,7 +1015,7 @@ impl<'a> Lexer<'a> {
         let lazy_integer = if START_WITH_DOT {
             // first char is '.'
             debug_assert!(
-                self.cur().is_some_and(|c| c == '.'),
+                self.cur().is_some_and(|c| c == b'.'),
                 "read_number<START_WITH_DOT = true> expects current char to be '.'"
             );
             LazyInteger {
@@ -1011,7 +1026,7 @@ impl<'a> Lexer<'a> {
             }
         } else {
             debug_assert!(!START_WITH_DOT);
-            debug_assert!(!START_WITH_ZERO || self.cur().unwrap() == '0');
+            debug_assert!(!START_WITH_ZERO || self.cur().unwrap() == b'0');
 
             // Use read_number_no_dot to support long numbers.
             let lazy_integer = self.read_number_no_dot_as_str::<10>()?;
@@ -1076,7 +1091,7 @@ impl<'a> Lexer<'a> {
         has_underscore |= lazy_integer.has_underscore;
         // At this point, number cannot be an octal literal.
 
-        let has_dot = self.cur() == Some('.');
+        let has_dot = self.cur() == Some(b'.');
         //  `0.a`, `08.a`, `102.a` are invalid.
         //
         // `.1.a`, `.1e-4.a` are valid,
@@ -1090,7 +1105,7 @@ impl<'a> Lexer<'a> {
             self.read_digits::<_, (), 10>(|_, _, _| Ok(((), true)), true, &mut has_underscore)?;
         }
 
-        let has_e = self.cur().is_some_and(|c| c == 'e' || c == 'E');
+        let has_e = self.cur().is_some_and(|c| c == b'e' || c == b'E');
         // Handle 'e' and 'E'
         //
         // .5e1 = 5
@@ -1108,7 +1123,7 @@ impl<'a> Lexer<'a> {
                 }
             };
 
-            if next == '+' || next == '-' {
+            if next == b'+' || next == b'-' {
                 self.bump(); // remove '+', '-'
             }
 
@@ -1178,12 +1193,12 @@ impl<'a> Lexer<'a> {
         );
         let start = self.cur_pos();
 
-        debug_assert_eq!(self.cur(), Some('0'));
+        debug_assert_eq!(self.cur(), Some(b'0'));
         self.bump();
 
         debug_assert!(self
             .cur()
-            .is_some_and(|c| matches!(c, 'b' | 'B' | 'o' | 'O' | 'x' | 'X')));
+            .is_some_and(|c| matches!(c, b'b' | b'B' | b'o' | b'O' | b'x' | b'X')));
         self.bump();
 
         let lazy_integer = self.read_number_no_dot_as_str::<RADIX>()?;
@@ -1274,7 +1289,7 @@ impl<'a> Lexer<'a> {
 
         let mut s = SmartString::<LazyCompact>::default();
 
-        debug_assert!(self.input().cur().is_some_and(|c| c == '&'));
+        debug_assert!(self.input().cur().is_some_and(|c| c == b'&'));
         self.bump();
 
         let start_pos = self.input().cur_pos();
@@ -1286,7 +1301,7 @@ impl<'a> Lexer<'a> {
             };
             self.bump();
 
-            if c == ';' {
+            if c == b';' {
                 if let Some(stripped) = s.strip_prefix('#') {
                     if stripped.starts_with('x') {
                         if is_hex(&s[2..]) {
@@ -1306,7 +1321,7 @@ impl<'a> Lexer<'a> {
                 break;
             }
 
-            s.push(c)
+            s.push(c as char)
         }
 
         unsafe {
@@ -1319,10 +1334,10 @@ impl<'a> Lexer<'a> {
 
     fn read_jsx_new_line(&mut self, normalize_crlf: bool) -> LexResult<Either<&'static str, char>> {
         debug_assert!(self.syntax().jsx());
-        let ch = self.input().cur().unwrap();
+        let ch = self.input().cur_as_char().unwrap();
         self.bump();
 
-        let out = if ch == '\r' && self.input().cur() == Some('\n') {
+        let out = if ch == '\r' && self.input().cur() == Some(b'\n') {
             self.bump(); // `\n`
             Either::Left(if normalize_crlf { "\n" } else { "\r\n" })
         } else {
@@ -1341,7 +1356,7 @@ impl<'a> Lexer<'a> {
         let mut out = String::new();
         let mut chunk_start = self.input().cur_pos();
         loop {
-            let ch = match self.input().cur() {
+            let ch = match self.input().cur_as_char() {
                 Some(c) => c,
                 None => {
                     self.emit_error(start, SyntaxError::UnterminatedStrLit);
@@ -1461,8 +1476,8 @@ impl<'a> Lexer<'a> {
         // returned `Some`, and already exited.
         debug_assert!(high >= MIN_HIGH);
         let is_pair = high <= MAX_HIGH
-            && self.input().cur() == Some('\\')
-            && self.input().peek() == Some('u');
+            && self.input().cur() == Some(b'\\')
+            && self.input().peek() == Some(b'u');
         if !is_pair {
             return Ok(Some(UnicodeEscape::LoneSurrogate(high)));
         }
@@ -1496,7 +1511,7 @@ impl<'a> Lexer<'a> {
     }
 
     fn read_unicode_escape(&mut self) -> LexResult<UnicodeEscape> {
-        debug_assert_eq!(self.cur(), Some('u'));
+        debug_assert_eq!(self.cur(), Some(b'u'));
 
         let mut is_curly = false;
 
@@ -1580,7 +1595,7 @@ impl<'a> Lexer<'a> {
 
     #[cold]
     fn read_shebang(&mut self) -> LexResult<Option<Atom>> {
-        if self.input().cur() != Some('#') || self.input().peek() != Some('!') {
+        if self.input().cur() != Some(b'#') || self.input().peek() != Some(b'!') {
             return Ok(None);
         }
         self.bump(); // `#`
@@ -1593,13 +1608,13 @@ impl<'a> Lexer<'a> {
     ///
     /// In template literal, we should preserve raw string.
     fn read_escaped_char(&mut self, in_template: bool) -> LexResult<Option<CodePoint>> {
-        debug_assert_eq!(self.cur(), Some('\\'));
+        debug_assert_eq!(self.cur(), Some(b'\\'));
 
         let start = self.cur_pos();
 
         self.bump(); // '\'
 
-        let c = match self.cur() {
+        let c = match self.cur_as_char() {
             Some(c) => c,
             None => self.error_span(pos_span(start), SyntaxError::InvalidStrEscape)?,
         };
@@ -1654,7 +1669,7 @@ impl<'a> Lexer<'a> {
 
                 let first_c = if c == '0' {
                     match self.cur() {
-                        Some(next) if next.is_digit(8) => c,
+                        Some(next) if (next as char).is_digit(8) => c,
                         // \0 is not an octal literal nor decimal literal.
                         _ => return Ok(Some(CodePoint::from_char('\u{0000}'))),
                     }
@@ -1675,7 +1690,7 @@ impl<'a> Lexer<'a> {
                     ($check:expr) => {{
                         let cur = self.cur();
 
-                        match cur.and_then(|c| c.to_digit(8)) {
+                        match cur.and_then(|c| (c as char).to_digit(8)) {
                             Some(v) => {
                                 value = if $check {
                                     let new_val = value
@@ -1719,7 +1734,7 @@ impl<'a> Lexer<'a> {
             self.input_mut().reset_to(start);
         }
 
-        debug_assert_eq!(self.cur(), Some('/'));
+        debug_assert_eq!(self.cur(), Some(b'/'));
 
         let start = self.cur_pos();
 
@@ -1745,14 +1760,14 @@ impl<'a> Lexer<'a> {
                 escaped = false;
             } else {
                 match c {
-                    '[' => in_class = true,
-                    ']' if in_class => in_class = false,
+                    b'[' => in_class = true,
+                    b']' if in_class => in_class = false,
                     // Terminates content part of regex literal
-                    '/' if !in_class => break,
+                    b'/' if !in_class => break,
                     _ => {}
                 }
 
-                escaped = c == '\\';
+                escaped = c == b'\\';
             }
 
             self.bump();
@@ -1921,7 +1936,7 @@ impl<'a> Lexer<'a> {
 
                 // ASCII but not a valid identifier
                 break;
-            } else if let Some(c) = self.input().cur() {
+            } else if let Some(c) = self.input().cur_as_char() {
                 if Ident::is_valid_non_ascii_continue(c) {
                     self.bump();
                     continue;
@@ -1954,14 +1969,14 @@ impl<'a> Lexer<'a> {
 
     /// `#`
     fn read_token_number_sign(&mut self) -> LexResult<Token> {
-        debug_assert!(self.cur().is_some_and(|c| c == '#'));
+        debug_assert!(self.cur().is_some_and(|c| c == b'#'));
 
         self.bump(); // '#'
 
         // `#` can also be a part of shebangs, however they should have been
         // handled by `read_shebang()`
         debug_assert!(
-            !self.input().is_at_start() || self.cur() != Some('!'),
+            !self.input().is_at_start() || self.cur() != Some(b'!'),
             "#! should have already been handled by read_shebang()"
         );
         Ok(Token::Hash)
@@ -1971,7 +1986,7 @@ impl<'a> Lexer<'a> {
     ///
     /// This is extracted as a method to reduce size of `read_token`.
     fn read_token_dot(&mut self) -> LexResult<Token> {
-        debug_assert!(self.cur().is_some_and(|c| c == '.'));
+        debug_assert!(self.cur().is_some_and(|c| c == b'.'));
         // Check for eof
         let next = match self.input().peek() {
             Some(next) => next,
@@ -1989,7 +2004,7 @@ impl<'a> Lexer<'a> {
 
         self.bump(); // 1st `.`
 
-        if next == '.' && self.input().peek() == Some('.') {
+        if next == b'.' && self.input().peek() == Some(b'.') {
             self.bump(); // 2nd `.`
             self.bump(); // 3rd `.`
 
@@ -2003,7 +2018,7 @@ impl<'a> Lexer<'a> {
     ///
     /// This is extracted as a method to reduce size of `read_token`.
     fn read_token_question_mark(&mut self) -> LexResult<Token> {
-        debug_assert!(self.cur().is_some_and(|c| c == '?'));
+        debug_assert!(self.cur().is_some_and(|c| c == b'?'));
         self.bump();
         if self.input_mut().eat_byte(b'?') {
             if self.input_mut().eat_byte(b'=') {
@@ -2020,7 +2035,7 @@ impl<'a> Lexer<'a> {
     ///
     /// This is extracted as a method to reduce size of `read_token`.
     fn read_token_colon(&mut self) -> LexResult<Token> {
-        debug_assert!(self.cur().is_some_and(|c| c == ':'));
+        debug_assert!(self.cur().is_some_and(|c| c == b':'));
         self.bump(); // ':'
         Ok(Token::Colon)
     }
@@ -2029,13 +2044,13 @@ impl<'a> Lexer<'a> {
     ///
     /// This is extracted as a method to reduce size of `read_token`.
     fn read_token_zero(&mut self) -> LexResult<Token> {
-        debug_assert_eq!(self.cur(), Some('0'));
+        debug_assert_eq!(self.cur(), Some(b'0'));
         let next = self.input().peek();
 
         let bigint = match next {
-            Some('x') | Some('X') => self.read_radix_number::<16>(),
-            Some('o') | Some('O') => self.read_radix_number::<8>(),
-            Some('b') | Some('B') => self.read_radix_number::<2>(),
+            Some(b'x') | Some(b'X') => self.read_radix_number::<16>(),
+            Some(b'o') | Some(b'O') => self.read_radix_number::<8>(),
+            Some(b'b') | Some(b'B') => self.read_radix_number::<2>(),
             _ => {
                 return self.read_number::<false, true>().map(|v| match v {
                     Left((value, raw)) => Token::num(value, raw, self),
@@ -2080,13 +2095,13 @@ impl<'a> Lexer<'a> {
         }
 
         // '||', '&&'
-        if self.input().cur() == Some(C as char) {
+        if self.input().cur() == Some(C) {
             unsafe {
                 // Safety: cur() is Some(c)
                 self.input_mut().bump();
             }
 
-            if self.input().cur() == Some('=') {
+            if self.input().cur() == Some(b'=') {
                 unsafe {
                     // Safety: cur() is Some('=')
                     self.input_mut().bump();
@@ -2125,7 +2140,7 @@ impl<'a> Lexer<'a> {
     ///
     /// This is extracted as a method to reduce size of `read_token`.
     fn read_token_mul_mod<const IS_MUL: bool>(&mut self) -> LexResult<Token> {
-        debug_assert!(self.cur().is_some_and(|c| c == '*' || c == '%'));
+        debug_assert!(self.cur().is_some_and(|c| c == b'*' || c == b'%'));
         self.bump();
         let token = if IS_MUL {
             if self.input_mut().eat_byte(b'*') {
@@ -2153,7 +2168,7 @@ impl<'a> Lexer<'a> {
     }
 
     fn read_slash(&mut self) -> LexResult<Token> {
-        debug_assert_eq!(self.cur(), Some('/'));
+        debug_assert_eq!(self.cur(), Some(b'/'));
         self.bump(); // '/'
         Ok(if self.eat(b'=') {
             Token::DivEq
@@ -2181,7 +2196,7 @@ impl<'a> Lexer<'a> {
     /// See https://tc39.github.io/ecma262/#sec-literals-string-literals
     // TODO: merge `read_str_lit` and `read_jsx_str`
     fn read_str_lit(&mut self) -> LexResult<Token> {
-        debug_assert!(self.cur() == Some('\'') || self.cur() == Some('"'));
+        debug_assert!(self.cur() == Some(b'\'') || self.cur() == Some(b'"'));
         let start = self.cur_pos();
         let quote = self.cur().unwrap() as u8;
 
diff --git a/crates/swc_ecma_parser/src/lexer/state.rs b/crates/swc_ecma_parser/src/lexer/state.rs
index 9844a5f1ffb4..e52ebdd07227 100644
--- a/crates/swc_ecma_parser/src/lexer/state.rs
+++ b/crates/swc_ecma_parser/src/lexer/state.rs
@@ -246,8 +246,8 @@ impl crate::input::Tokens for Lexer<'_> {
         debug_assert!(token.is_word());
         let mut v = String::with_capacity(16);
         while let Some(ch) = self.input().cur() {
-            if ch == '-' {
-                v.push(ch);
+            if ch == b'-' {
+                v.push(ch as char);
                 self.bump();
             } else {
                 let old_pos = self.cur_pos();
@@ -296,8 +296,8 @@ impl crate::input::Tokens for Lexer<'_> {
         let start = self.cur_pos();
 
         match cur {
-            '\'' | '"' => {
-                let token = self.read_jsx_str(cur);
+            b'\'' | b'"' => {
+                let token = self.read_jsx_str(cur as char);
                 let token = match token {
                     Ok(token) => token,
                     Err(e) => {
@@ -417,21 +417,21 @@ impl Lexer<'_> {
         let mut value = String::new();
 
         while let Some(ch) = self.input_mut().cur() {
-            if ch == '{' {
+            if ch == b'{' {
                 break;
-            } else if ch == '<' {
+            } else if ch == b'<' {
                 // TODO: check git conflict mark
                 break;
             }
 
-            if ch == '>' {
+            if ch == b'>' {
                 self.emit_error(
                     self.input().cur_pos(),
                     SyntaxError::UnexpectedTokenWithSuggestions {
                         candidate_list: vec!["`{'>'}`", "`&gt;`"],
                     },
                 );
-            } else if ch == '}' {
+            } else if ch == b'}' {
                 self.emit_error(
                     self.input().cur_pos(),
                     SyntaxError::UnexpectedTokenWithSuggestions {
@@ -447,11 +447,11 @@ impl Lexer<'_> {
                 && first_non_whitespace > 0
             {
                 break;
-            } else if ch.is_whitespace() {
+            } else if ch <= 0x7f && (ch as char).is_whitespace() {
                 first_non_whitespace = self.cur_pos().0 as i32;
             }
 
-            if ch == '&' {
+            if ch == b'&' {
                 let s = unsafe {
                     // Safety: We already checked for the range
                     self.input_slice_to_cur(chunk_start)
@@ -508,28 +508,43 @@ impl Lexer<'_> {
     fn scan_identifier_parts(&mut self) -> String {
         let mut v = String::with_capacity(16);
         while let Some(ch) = self.input().cur() {
-            if ch.is_ident_part() {
-                v.push(ch);
-                self.input_mut().bump_bytes(ch.len_utf8());
-            } else if ch == '\\' {
-                self.bump(); // bump '\'
-                if !self.is(b'u') {
-                    self.emit_error(self.cur_pos(), SyntaxError::InvalidUnicodeEscape);
-                    continue;
-                }
-                self.bump(); // bump 'u'
-                let Ok(value) = self.read_unicode_escape() else {
-                    self.emit_error(self.cur_pos(), SyntaxError::InvalidUnicodeEscape);
-                    break;
-                };
-                if let Some(c) = CodePoint::from(value).to_char() {
-                    v.push(c);
+            // For ASCII, check if it's an identifier part quickly
+            if ch <= 0x7f {
+                if ch.is_ident_part() {
+                    v.push(ch as char);
+                    self.input_mut().bump_bytes(1);
+                } else if ch == b'\\' {
+                    self.bump(); // bump '\'
+                    if !self.is(b'u') {
+                        self.emit_error(self.cur_pos(), SyntaxError::InvalidUnicodeEscape);
+                        continue;
+                    }
+                    self.bump(); // bump 'u'
+                    let Ok(value) = self.read_unicode_escape() else {
+                        self.emit_error(self.cur_pos(), SyntaxError::InvalidUnicodeEscape);
+                        break;
+                    };
+                    if let Some(c) = CodePoint::from(value).to_char() {
+                        v.push(c);
+                    } else {
+                        self.emit_error(self.cur_pos(), SyntaxError::InvalidUnicodeEscape);
+                    }
+                    self.token_flags |= TokenFlags::UNICODE;
                 } else {
-                    self.emit_error(self.cur_pos(), SyntaxError::InvalidUnicodeEscape);
+                    break;
                 }
-                self.token_flags |= TokenFlags::UNICODE;
             } else {
-                break;
+                // Non-ASCII byte - need to get full UTF-8 character
+                if let Some(c) = self.input().cur_as_char() {
+                    if c.is_ident_part() {
+                        v.push(c);
+                        self.bump();
+                    } else {
+                        break;
+                    }
+                } else {
+                    break;
+                }
             }
         }
         v
diff --git a/crates/swc_ecma_parser/src/lexer/table.rs b/crates/swc_ecma_parser/src/lexer/table.rs
index 12ad8b25a2a6..37e33537afbb 100644
--- a/crates/swc_ecma_parser/src/lexer/table.rs
+++ b/crates/swc_ecma_parser/src/lexer/table.rs
@@ -42,13 +42,14 @@ pub(super) static BYTE_HANDLERS: [ByteHandler; 256] = [
 
 const ERR: ByteHandler = |lexer| {
     let c = unsafe {
-        // Safety: Byte handler is only called for non-last chracters
-        lexer.input.cur().unwrap_unchecked()
+        // Safety: Byte handler is only called for non-last characters
+        // Get the char representation for error messages
+        lexer.input.cur_as_char().unwrap_unchecked()
     };
 
     let start = lexer.cur_pos();
     unsafe {
-        // Safety: Byte handler is only called for non-last chracters
+        // Safety: Byte handler is only called for non-last characters
         lexer.input.bump();
     }
     lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })?
@@ -281,7 +282,7 @@ const ZER: ByteHandler = |lexer| lexer.read_token_zero();
 
 /// Numbers
 const DIG: ByteHandler = |lexer| {
-    debug_assert!(lexer.cur().is_some_and(|cur| cur != '0'));
+    debug_assert!(lexer.cur().is_some_and(|cur| cur != b'0'));
     lexer.read_number::<false, false>().map(|v| match v {
         Either::Left((value, raw)) => {
             lexer.state.set_token_value(TokenValue::Num { value, raw });
@@ -299,11 +300,12 @@ const DIG: ByteHandler = |lexer| {
 /// String literals with `'` or `"`
 const QOT: ByteHandler = |lexer| lexer.read_str_lit();
 
-/// Unicode
+/// Unicode - handles multi-byte UTF-8 sequences
 const UNI: ByteHandler = |lexer| {
     let c = unsafe {
-        // Safety: Byte handler is only called for non-last chracters
-        lexer.input.cur().unwrap_unchecked()
+        // Safety: Byte handler is only called for non-last characters
+        // For non-ASCII bytes, we need the full char
+        lexer.input.cur_as_char().unwrap_unchecked()
     };
 
     // Identifier or keyword. '\uXXXX' sequences are allowed in
@@ -314,7 +316,7 @@ const UNI: ByteHandler = |lexer| {
 
     let start = lexer.cur_pos();
     unsafe {
-        // Safety: Byte handler is only called for non-last chracters
+        // Safety: Byte handler is only called for non-last characters
         lexer.input.bump();
     }
     lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })?
diff --git a/crates/swc_ecma_parser/src/lexer/whitespace.rs b/crates/swc_ecma_parser/src/lexer/whitespace.rs
index c4b66e15d49f..e423fc7457ed 100644
--- a/crates/swc_ecma_parser/src/lexer/whitespace.rs
+++ b/crates/swc_ecma_parser/src/lexer/whitespace.rs
@@ -114,31 +114,33 @@ const SPC: ByteHandler = |lexer| {
 };
 
 const SLH: ByteHandler = |lexer| match lexer.peek() {
-    Some('/') => {
+    Some(b'/') => {
         lexer.skip_line_comment(2);
         true
     }
-    Some('*') => {
+    Some(b'*') => {
         lexer.skip_block_comment();
         true
     }
     _ => false,
 };
 
-/// Unicode
+/// Unicode - handles multi-byte UTF-8 whitespace characters
 const UNI: ByteHandler = |lexer| {
-    let c = lexer.cur().unwrap();
-    match c {
-        c if is_irregular_whitespace(c) => {
-            lexer.bump();
-            true
-        }
-        c if is_irregular_line_terminator(c) => {
-            lexer.bump();
-            lexer.state.mark_had_line_break();
-            true
-        }
-        _ => false,
+    // For non-ASCII bytes, we need the full UTF-8 character
+    let Some(c) = lexer.cur_as_char() else {
+        return false;
+    };
+
+    if is_irregular_whitespace(c) {
+        lexer.bump();
+        true
+    } else if is_irregular_line_terminator(c) {
+        lexer.bump();
+        lexer.state.mark_had_line_break();
+        true
+    } else {
+        false
     }
 };
 

From c23d2e7a4e099388762ae688ee7ca4d3896ffc02 Mon Sep 17 00:00:00 2001
From: DongYun Kang <kdy.1997.dev@gmail.com>
Date: Tue, 18 Nov 2025 11:39:37 -0500
Subject: [PATCH 02/20] fix(swc_common): Fix Input tests to use byte literals
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Updated test assertions in `swc_common/src/input.rs` to use byte literals (b'x') instead of char literals ('x') to match the new byte-based Input trait signature.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 crates/swc_common/src/input.rs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/crates/swc_common/src/input.rs b/crates/swc_common/src/input.rs
index 86465b956932..24457fa94fba 100644
--- a/crates/swc_common/src/input.rs
+++ b/crates/swc_common/src/input.rs
@@ -374,12 +374,12 @@ mod tests {
         with_test_sess("foo/d", |mut i| {
             assert_eq!(unsafe { i.slice(BytePos(1), BytePos(2)) }, "f");
             assert_eq!(i.last_pos, BytePos(2));
-            assert_eq!(i.cur(), Some('o'));
+            assert_eq!(i.cur(), Some(b'o'));
 
             assert_eq!(unsafe { i.slice(BytePos(2), BytePos(4)) }, "oo");
             assert_eq!(unsafe { i.slice(BytePos(1), BytePos(4)) }, "foo");
             assert_eq!(i.last_pos, BytePos(4));
-            assert_eq!(i.cur(), Some('/'));
+            assert_eq!(i.cur(), Some(b'/'));
         });
     }
 
@@ -388,10 +388,10 @@ mod tests {
         with_test_sess("load", |mut i| {
             assert_eq!(unsafe { i.slice(BytePos(1), BytePos(3)) }, "lo");
             assert_eq!(i.last_pos, BytePos(3));
-            assert_eq!(i.cur(), Some('a'));
+            assert_eq!(i.cur(), Some(b'a'));
             unsafe { i.reset_to(BytePos(1)) };
 
-            assert_eq!(i.cur(), Some('l'));
+            assert_eq!(i.cur(), Some(b'l'));
             assert_eq!(i.last_pos, BytePos(1));
         });
     }
@@ -405,13 +405,13 @@ mod tests {
 
             // assert_eq!(i.cur_pos(), BytePos(4));
             assert_eq!(i.last_pos, BytePos(4));
-            assert_eq!(i.cur(), Some('/'));
+            assert_eq!(i.cur(), Some(b'/'));
 
             unsafe {
                 i.bump();
             }
             assert_eq!(i.last_pos, BytePos(5));
-            assert_eq!(i.cur(), Some('d'));
+            assert_eq!(i.cur(), Some(b'd'));
 
             unsafe {
                 i.bump();

From 0ba3cfa90d62831e6205ffb0935f4f9266e7c82c Mon Sep 17 00:00:00 2001
From: DongYun Kang <kdy.1997.dev@gmail.com>
Date: Tue, 18 Nov 2025 12:32:08 -0500
Subject: [PATCH 03/20] fix(swc_ecma_lexer): Convert char literals to byte
 literals
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaced all char literals with byte literals in the lexer to match the new byte-based Input trait.

Key changes:
- Changed char literals to byte literals: 'x' → b'x'
- Fixed debug assertions in lexer methods
- Updated comparisons in peek() and cur() checks
- For non-ASCII Unicode characters, used cur_as_char() to get full UTF-8 character
- Converted bytes to chars where needed for digit/identifier checks

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 crates/swc_ecma_lexer/src/common/lexer/mod.rs | 111 +++++++++---------
 crates/swc_ecma_lexer/src/lexer/jsx.rs        |   2 +-
 crates/swc_ecma_lexer/src/lexer/mod.rs        |  11 +-
 crates/swc_ecma_lexer/src/lexer/state.rs      |   3 +-
 crates/swc_ecma_lexer/src/lexer/table.rs      |   2 +-
 5 files changed, 67 insertions(+), 62 deletions(-)

diff --git a/crates/swc_ecma_lexer/src/common/lexer/mod.rs b/crates/swc_ecma_lexer/src/common/lexer/mod.rs
index a81ee56cd384..0c4da3a6b6c7 100644
--- a/crates/swc_ecma_lexer/src/common/lexer/mod.rs
+++ b/crates/swc_ecma_lexer/src/common/lexer/mod.rs
@@ -369,8 +369,8 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
     fn skip_block_comment(&mut self) {
         let start = self.cur_pos();
 
-        debug_assert_eq!(self.cur(), Some('/'));
-        debug_assert_eq!(self.peek(), Some('*'));
+        debug_assert_eq!(self.cur(), Some(b'/'));
+        debug_assert_eq!(self.peek(), Some(b'*'));
 
         // Consume initial "/*"
         self.input_mut().bump_bytes(2);
@@ -422,7 +422,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
 
             match matched_byte {
                 b'*' => {
-                    if self.peek() == Some('/') {
+                    if self.peek() == Some(b'/') {
                         // Consume "*/"
                         self.input_mut().bump_bytes(2);
 
@@ -481,13 +481,13 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
                 b'\r' => {
                     should_mark_had_line_break = true;
                     self.bump();
-                    if self.peek() == Some('\n') {
+                    if self.peek() == Some(b'\n') {
                         self.bump();
                     }
                 }
                 _ => {
                     // Unicode line terminator (LS/PS) or other character
-                    if let Some('\u{2028}' | '\u{2029}') = self.cur() {
+                    if let Some('\u{2028}' | '\u{2029}') = self.cur_as_char() {
                         should_mark_had_line_break = true;
                     }
                     self.bump();
@@ -521,10 +521,10 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
 
             if LEX_COMMENTS && self.input().is_byte(b'/') {
                 if let Some(c) = self.peek() {
-                    if c == '/' {
+                    if c == b'/' {
                         self.skip_line_comment(2);
                         continue;
-                    } else if c == '*' {
+                    } else if c == b'*' {
                         self.skip_block_comment();
                         continue;
                     }
@@ -580,24 +580,24 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
         let mut prev = None;
 
         while let Some(c) = self.cur() {
-            if c == '_' {
+            if c == b'_' {
                 *has_underscore = true;
                 if allow_num_separator {
-                    let is_allowed = |c: Option<char>| {
+                    let is_allowed = |c: Option<u8>| {
                         let Some(c) = c else {
                             return false;
                         };
-                        c.is_digit(RADIX as _)
+                        (c as char).is_digit(RADIX as _)
                     };
-                    let is_forbidden = |c: Option<char>| {
+                    let is_forbidden = |c: Option<u8>| {
                         let Some(c) = c else {
                             return false;
                         };
 
                         if RADIX == 16 {
-                            matches!(c, '.' | 'X' | '_' | 'x')
+                            matches!(c, b'.' | b'X' | b'_' | b'x')
                         } else {
-                            matches!(c, '.' | 'B' | 'E' | 'O' | '_' | 'b' | 'e' | 'o')
+                            matches!(c, b'.' | b'B' | b'E' | b'O' | b'_' | b'b' | b'e' | b'o')
                         }
                     };
 
@@ -621,7 +621,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
             }
 
             // e.g. (val for a) = 10  where radix = 16
-            let val = if let Some(val) = c.to_digit(RADIX as _) {
+            let val = if let Some(val) = (c as char).to_digit(RADIX as _) {
                 val
             } else {
                 return Ok(total);
@@ -697,7 +697,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
         let lazy_integer = if START_WITH_DOT {
             // first char is '.'
             debug_assert!(
-                self.cur().is_some_and(|c| c == '.'),
+                self.cur().is_some_and(|c| c == b'.'),
                 "read_number<START_WITH_DOT = true> expects current char to be '.'"
             );
             LazyInteger {
@@ -708,7 +708,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
             }
         } else {
             debug_assert!(!START_WITH_DOT);
-            debug_assert!(!START_WITH_ZERO || self.cur().unwrap() == '0');
+            debug_assert!(!START_WITH_ZERO || self.cur().unwrap() == b'0');
 
             // Use read_number_no_dot to support long numbers.
             let lazy_integer = self.read_number_no_dot_as_str::<10>()?;
@@ -776,7 +776,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
         has_underscore |= lazy_integer.has_underscore;
         // At this point, number cannot be an octal literal.
 
-        let has_dot = self.cur() == Some('.');
+        let has_dot = self.cur() == Some(b'.');
         //  `0.a`, `08.a`, `102.a` are invalid.
         //
         // `.1.a`, `.1e-4.a` are valid,
@@ -790,7 +790,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
             self.read_digits::<_, (), 10>(|_, _, _| Ok(((), true)), true, &mut has_underscore)?;
         }
 
-        let has_e = self.cur().is_some_and(|c| c == 'e' || c == 'E');
+        let has_e = self.cur().is_some_and(|c| c == b'e' || c == b'E');
         // Handle 'e' and 'E'
         //
         // .5e1 = 5
@@ -808,7 +808,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
                 }
             };
 
-            if next == '+' || next == '-' {
+            if next == b'+' || next == b'-' {
                 self.bump(); // remove '+', '-'
             }
 
@@ -880,12 +880,12 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
         );
         let start = self.cur_pos();
 
-        debug_assert_eq!(self.cur(), Some('0'));
+        debug_assert_eq!(self.cur(), Some(b'0'));
         self.bump();
 
         debug_assert!(self
             .cur()
-            .is_some_and(|c| matches!(c, 'b' | 'B' | 'o' | 'O' | 'x' | 'X')));
+            .is_some_and(|c| matches!(c, b'b' | b'B' | b'o' | b'O' | b'x' | b'X')));
         self.bump();
 
         let lazy_integer = self.read_number_no_dot_as_str::<RADIX>()?;
@@ -1001,14 +1001,14 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
 
         let mut s = SmartString::<LazyCompact>::default();
 
-        debug_assert!(self.input().cur().is_some_and(|c| c == '&'));
+        debug_assert!(self.input().cur().is_some_and(|c| c == b'&'));
         self.bump();
 
         let start_pos = self.input().cur_pos();
 
         for _ in 0..10 {
             let c = match self.input().cur() {
-                Some(c) => c,
+                Some(c) => c as char,
                 None => break,
             };
             self.bump();
@@ -1046,10 +1046,10 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
 
     fn read_jsx_new_line(&mut self, normalize_crlf: bool) -> LexResult<Either<&'static str, char>> {
         debug_assert!(self.syntax().jsx());
-        let ch = self.input().cur().unwrap();
+        let ch = self.input().cur().unwrap() as char;
         self.bump();
 
-        let out = if ch == '\r' && self.input().cur() == Some('\n') {
+        let out = if ch == '\r' && self.input().cur() == Some(b'\n') {
             self.bump(); // `\n`
             Either::Left(if normalize_crlf { "\n" } else { "\r\n" })
         } else {
@@ -1069,7 +1069,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
         let mut chunk_start = self.input().cur_pos();
         loop {
             let ch = match self.input().cur() {
-                Some(c) => c,
+                Some(c) => c as char,
                 None => {
                     self.emit_error(start, SyntaxError::UnterminatedStrLit);
                     break;
@@ -1190,8 +1190,8 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
         // returned `Some`, and already exited.
         debug_assert!(high >= MIN_HIGH);
         let is_pair = high <= MAX_HIGH
-            && self.input().cur() == Some('\\')
-            && self.input().peek() == Some('u');
+            && self.input().cur() == Some(b'\\')
+            && self.input().peek() == Some(b'u');
         if !is_pair {
             return Ok(Some(UnicodeEscape::LoneSurrogate(high)));
         }
@@ -1225,7 +1225,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
     }
 
     fn read_unicode_escape(&mut self) -> LexResult<UnicodeEscape> {
-        debug_assert_eq!(self.cur(), Some('u'));
+        debug_assert_eq!(self.cur(), Some(b'u'));
 
         let mut is_curly = false;
 
@@ -1309,7 +1309,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
 
     #[cold]
     fn read_shebang(&mut self) -> LexResult<Option<Atom>> {
-        if self.input().cur() != Some('#') || self.input().peek() != Some('!') {
+        if self.input().cur() != Some(b'#') || self.input().peek() != Some(b'!') {
             return Ok(None);
         }
         self.bump(); // `#`
@@ -1341,11 +1341,11 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
         // Handle edge case for immediate template end
         if start == self.cur_pos() && self.state().last_was_tpl_element() {
             if let Some(c) = self.cur() {
-                if c == '$' && self.peek() == Some('{') {
+                if c == b'$' && self.peek() == Some(b'{') {
                     self.bump(); // '$'
                     self.bump(); // '{'
                     return Ok(Self::Token::DOLLAR_LBRACE);
-                } else if c == '`' {
+                } else if c == b'`' {
                     self.bump(); // '`'
                     return Ok(Self::Token::BACKQUOTE);
                 }
@@ -1366,7 +1366,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
             match matched_byte {
                 b'$' => {
                     // Check if this is ${
-                    if self.peek() == Some('{') {
+                    if self.peek() == Some(b'{') {
                         // Found template substitution
                         let cooked = if cooked_slice_start == raw_slice_start {
                             let last_pos = self.cur_pos();
@@ -1418,7 +1418,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
 
                     // Handle carriage return - consume \r and optionally \n, normalize to \n
                     self.bump(); // '\r'
-                    if self.peek() == Some('\n') {
+                    if self.peek() == Some(b'\n') {
                         self.bump(); // '\n'
                     }
 
@@ -1454,14 +1454,14 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
     ///
     /// In template literal, we should preserve raw string.
     fn read_escaped_char(&mut self, in_template: bool) -> LexResult<Option<CodePoint>> {
-        debug_assert_eq!(self.cur(), Some('\\'));
+        debug_assert_eq!(self.cur(), Some(b'\\'));
 
         let start = self.cur_pos();
 
         self.bump(); // '\'
 
         let c = match self.cur() {
-            Some(c) => c,
+            Some(c) => c as char,
             None => self.error_span(pos_span(start), SyntaxError::InvalidStrEscape)?,
         };
 
@@ -1515,7 +1515,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
 
                 let first_c = if c == '0' {
                     match self.cur() {
-                        Some(next) if next.is_digit(8) => c,
+                        Some(next) if (next as char).is_digit(8) => c,
                         // \0 is not an octal literal nor decimal literal.
                         _ => return Ok(Some(CodePoint::from_char('\u{0000}'))),
                     }
@@ -1536,7 +1536,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
                     ($check:expr) => {{
                         let cur = self.cur();
 
-                        match cur.and_then(|c| c.to_digit(8)) {
+                        match cur.and_then(|c| (c as char).to_digit(8)) {
                             Some(v) => {
                                 value = if $check {
                                     let new_val = value
@@ -1580,7 +1580,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
             self.input_mut().reset_to(start);
         }
 
-        debug_assert_eq!(self.cur(), Some('/'));
+        debug_assert_eq!(self.cur(), Some(b'/'));
 
         let start = self.cur_pos();
 
@@ -1591,6 +1591,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
         let (mut escaped, mut in_class) = (false, false);
 
         while let Some(c) = self.cur() {
+            let c = c as char;
             // This is ported from babel.
             // Seems like regexp literal cannot contain linebreak.
             if c.is_line_terminator() {
@@ -1785,7 +1786,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
 
                 // ASCII but not a valid identifier
                 break;
-            } else if let Some(c) = self.input().cur() {
+            } else if let Some(c) = self.input().cur_as_char() {
                 if Ident::is_valid_non_ascii_continue(c) {
                     self.bump();
                     continue;
@@ -1818,14 +1819,14 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
 
     /// `#`
     fn read_token_number_sign(&mut self) -> LexResult<Self::Token> {
-        debug_assert!(self.cur().is_some_and(|c| c == '#'));
+        debug_assert!(self.cur().is_some_and(|c| c == b'#'));
 
         self.bump(); // '#'
 
         // `#` can also be a part of shebangs, however they should have been
         // handled by `read_shebang()`
         debug_assert!(
-            !self.input().is_at_start() || self.cur() != Some('!'),
+            !self.input().is_at_start() || self.cur() != Some(b'!'),
             "#! should have already been handled by read_shebang()"
         );
         Ok(Self::Token::HASH)
@@ -1836,7 +1837,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
     /// This is extracted as a method to reduce size of `read_token`.
     #[inline(never)]
     fn read_token_dot(&mut self) -> LexResult<Self::Token> {
-        debug_assert!(self.cur().is_some_and(|c| c == '.'));
+        debug_assert!(self.cur().is_some_and(|c| c == b'.'));
         // Check for eof
         let next = match self.input().peek() {
             Some(next) => next,
@@ -1854,7 +1855,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
 
         self.bump(); // 1st `.`
 
-        if next == '.' && self.input().peek() == Some('.') {
+        if next == b'.' && self.input().peek() == Some(b'.') {
             self.bump(); // 2nd `.`
             self.bump(); // 3rd `.`
 
@@ -1869,7 +1870,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
     /// This is extracted as a method to reduce size of `read_token`.
     #[inline(never)]
     fn read_token_question_mark(&mut self) -> LexResult<Self::Token> {
-        debug_assert!(self.cur().is_some_and(|c| c == '?'));
+        debug_assert!(self.cur().is_some_and(|c| c == b'?'));
         self.bump();
         if self.input_mut().eat_byte(b'?') {
             if self.input_mut().eat_byte(b'=') {
@@ -1887,7 +1888,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
     /// This is extracted as a method to reduce size of `read_token`.
     #[inline(never)]
     fn read_token_colon(&mut self) -> LexResult<Self::Token> {
-        debug_assert!(self.cur().is_some_and(|c| c == ':'));
+        debug_assert!(self.cur().is_some_and(|c| c == b':'));
         self.bump(); // ':'
         Ok(Self::Token::COLON)
     }
@@ -1897,13 +1898,13 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
     /// This is extracted as a method to reduce size of `read_token`.
     #[inline(never)]
     fn read_token_zero(&mut self) -> LexResult<Self::Token> {
-        debug_assert_eq!(self.cur(), Some('0'));
+        debug_assert_eq!(self.cur(), Some(b'0'));
         let next = self.input().peek();
 
         let bigint = match next {
-            Some('x') | Some('X') => self.read_radix_number::<16>(),
-            Some('o') | Some('O') => self.read_radix_number::<8>(),
-            Some('b') | Some('B') => self.read_radix_number::<2>(),
+            Some(b'x') | Some(b'X') => self.read_radix_number::<16>(),
+            Some(b'o') | Some(b'O') => self.read_radix_number::<8>(),
+            Some(b'b') | Some(b'B') => self.read_radix_number::<2>(),
             _ => {
                 return self.read_number::<false, true>().map(|v| match v {
                     Left((value, raw)) => Self::Token::num(value, raw, self),
@@ -1949,13 +1950,13 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
         }
 
         // '||', '&&'
-        if self.input().cur() == Some(C as char) {
+        if self.input().cur() == Some(C) {
             unsafe {
                 // Safety: cur() is Some(c)
                 self.input_mut().bump();
             }
 
-            if self.input().cur() == Some('=') {
+            if self.input().cur() == Some(b'=') {
                 unsafe {
                     // Safety: cur() is Some('=')
                     self.input_mut().bump();
@@ -1995,7 +1996,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
     /// This is extracted as a method to reduce size of `read_token`.
     #[inline(never)]
     fn read_token_mul_mod(&mut self, is_mul: bool) -> LexResult<Self::Token> {
-        debug_assert!(self.cur().is_some_and(|c| c == '*' || c == '%'));
+        debug_assert!(self.cur().is_some_and(|c| c == b'*' || c == b'%'));
         self.bump();
         let token = if is_mul {
             if self.input_mut().eat_byte(b'*') {
@@ -2024,7 +2025,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
 
     #[inline(never)]
     fn read_slash(&mut self) -> LexResult<Self::Token> {
-        debug_assert_eq!(self.cur(), Some('/'));
+        debug_assert_eq!(self.cur(), Some(b'/'));
         self.bump(); // '/'
         Ok(if self.eat(b'=') {
             Self::Token::DIV_EQ
@@ -2052,7 +2053,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
     /// See https://tc39.github.io/ecma262/#sec-literals-string-literals
     // TODO: merge `read_str_lit` and `read_jsx_str`
     fn read_str_lit(&mut self) -> LexResult<Self::Token> {
-        debug_assert!(self.cur() == Some('\'') || self.cur() == Some('"'));
+        debug_assert!(self.cur() == Some(b'\'') || self.cur() == Some(b'"'));
         let start = self.cur_pos();
         let quote = self.cur().unwrap() as u8;
 
diff --git a/crates/swc_ecma_lexer/src/lexer/jsx.rs b/crates/swc_ecma_lexer/src/lexer/jsx.rs
index 5fbc92266b80..82825c00459f 100644
--- a/crates/swc_ecma_lexer/src/lexer/jsx.rs
+++ b/crates/swc_ecma_lexer/src/lexer/jsx.rs
@@ -12,7 +12,7 @@ impl Lexer<'_> {
 
         loop {
             let cur = match self.input.cur() {
-                Some(c) => c,
+                Some(c) => c as char,
                 None => {
                     let start = self.state.start;
                     self.error(start, SyntaxError::UnterminatedJSXContents)?
diff --git a/crates/swc_ecma_lexer/src/lexer/mod.rs b/crates/swc_ecma_lexer/src/lexer/mod.rs
index 1f245fc57a0c..49e5be16bfc5 100644
--- a/crates/swc_ecma_lexer/src/lexer/mod.rs
+++ b/crates/swc_ecma_lexer/src/lexer/mod.rs
@@ -176,7 +176,7 @@ impl<'a> Lexer<'a> {
         }
 
         // '++', '--'
-        Ok(if self.input.cur() == Some(C as char) {
+        Ok(if self.input.cur() == Some(C) {
             unsafe {
                 // Safety: cur() is Some(c)
                 self.input.bump();
@@ -273,7 +273,10 @@ impl Lexer<'_> {
         }
 
         // XML style comment. `<!--`
-        if C == b'<' && self.is(b'!') && self.peek() == Some('-') && self.peek_ahead() == Some('-')
+        if C == b'<'
+            && self.is(b'!')
+            && self.peek() == Some(b'-')
+            && self.peek_ahead() == Some(b'-')
         {
             self.skip_line_comment(3);
             self.skip_space::<true>();
@@ -289,7 +292,7 @@ impl Lexer<'_> {
         };
 
         // '<<', '>>'
-        if self.cur() == Some(C as char) {
+        if self.cur() == Some(C) {
             self.bump();
             op = if C == b'<' {
                 BinOpToken::LShift
@@ -298,7 +301,7 @@ impl Lexer<'_> {
             };
 
             //'>>>'
-            if C == b'>' && self.cur() == Some(C as char) {
+            if C == b'>' && self.cur() == Some(C) {
                 self.bump();
                 op = BinOpToken::ZeroFillRShift;
             }
diff --git a/crates/swc_ecma_lexer/src/lexer/state.rs b/crates/swc_ecma_lexer/src/lexer/state.rs
index 8dc108697f8a..7731423f510b 100644
--- a/crates/swc_ecma_lexer/src/lexer/state.rs
+++ b/crates/swc_ecma_lexer/src/lexer/state.rs
@@ -801,6 +801,7 @@ impl Lexer<'_> {
 
             let c = self.cur();
             if let Some(c) = c {
+                let c = c as char;
                 if self.state.context.current() == Some(TokenContext::JSXOpeningTag)
                     || self.state.context.current() == Some(TokenContext::JSXClosingTag)
                 {
@@ -823,7 +824,7 @@ impl Lexer<'_> {
                     }
                 }
 
-                if c == '<' && self.state.is_expr_allowed && self.input.peek() != Some('!') {
+                if c == '<' && self.state.is_expr_allowed && self.input.peek() != Some(b'!') {
                     let had_line_break_before_last = self.had_line_break_before_last();
                     let cur_pos = self.input.cur_pos();
 
diff --git a/crates/swc_ecma_lexer/src/lexer/table.rs b/crates/swc_ecma_lexer/src/lexer/table.rs
index f6c347bba074..bd8b16aa6c26 100644
--- a/crates/swc_ecma_lexer/src/lexer/table.rs
+++ b/crates/swc_ecma_lexer/src/lexer/table.rs
@@ -352,7 +352,7 @@ const ZER: ByteHandler = Some(|lexer| lexer.read_token_zero());
 
 /// Numbers
 const DIG: ByteHandler = Some(|lexer| {
-    debug_assert!(lexer.cur().is_some_and(|cur| cur != '0'));
+    debug_assert!(lexer.cur().is_some_and(|cur| cur != b'0'));
     lexer.read_number::<false, false>().map(|v| match v {
         Either::Left((value, raw)) => Token::Num { value, raw },
         Either::Right((value, raw)) => Token::BigInt { value, raw },

From 9a64e0ea2b5688bea71676f7b8da3012c91407f1 Mon Sep 17 00:00:00 2001
From: DongYun Kang <kdy.1997.dev@gmail.com>
Date: Wed, 19 Nov 2025 11:37:09 -0500
Subject: [PATCH 04/20] fix

---
 crates/swc_css_parser/src/lexer/mod.rs  | 278 ++++----
 crates/swc_html_parser/src/lexer/mod.rs | 838 +++++++++++++-----------
 2 files changed, 582 insertions(+), 534 deletions(-)

diff --git a/crates/swc_css_parser/src/lexer/mod.rs b/crates/swc_css_parser/src/lexer/mod.rs
index 904633584aab..048f13b12706 100644
--- a/crates/swc_css_parser/src/lexer/mod.rs
+++ b/crates/swc_css_parser/src/lexer/mod.rs
@@ -26,7 +26,7 @@ where
     comments: Option<&'a dyn Comments>,
     pending_leading_comments: Vec<Comment>,
     input: I,
-    cur: Option<char>,
+    cur: Option<u8>,
     cur_pos: BytePos,
     start_pos: BytePos,
     /// Used to override last_pos
@@ -172,7 +172,11 @@ where
         loop {
             self.read_comments();
 
-            if self.input.uncons_while(is_whitespace).is_empty() {
+            if self
+                .input
+                .uncons_while(|c| is_whitespace(c as u8))
+                .is_empty()
+            {
                 break;
             }
         }
@@ -190,27 +194,27 @@ where
     I: Input<'a>,
 {
     #[inline(always)]
-    fn cur(&mut self) -> Option<char> {
+    fn cur(&mut self) -> Option<u8> {
         self.cur
     }
 
     #[inline(always)]
-    fn next(&mut self) -> Option<char> {
+    fn next(&mut self) -> Option<u8> {
         self.input.cur()
     }
 
     #[inline(always)]
-    fn next_next(&mut self) -> Option<char> {
+    fn next_next(&mut self) -> Option<u8> {
         self.input.peek()
     }
 
     #[inline(always)]
-    fn next_next_next(&mut self) -> Option<char> {
+    fn next_next_next(&mut self) -> Option<u8> {
         self.input.peek_ahead()
     }
 
     #[inline(always)]
-    fn consume(&mut self) -> Option<char> {
+    fn consume(&mut self) -> Option<u8> {
         let cur = self.input.cur();
 
         self.cur = cur;
@@ -258,7 +262,7 @@ where
             // whitespace
             // Consume as much whitespace as possible. Return a <whitespace-token>.
             Some(c) if is_whitespace(c) => self.with_buf(|l, buf| {
-                buf.push(c);
+                buf.push(c as char);
 
                 loop {
                     let c = l.next();
@@ -267,7 +271,7 @@ where
                         Some(c) if is_whitespace(c) => {
                             l.consume();
 
-                            buf.push(c);
+                            buf.push(c as char);
                         }
                         _ => {
                             break;
@@ -281,9 +285,9 @@ where
             }),
             // U+0022 QUOTATION MARK (")
             // Consume a string token and return it.
-            Some('"') => self.read_str(None),
+            Some(b'"') => self.read_str(None),
             // U+0023 NUMBER SIGN (#)
-            Some('#') => {
+            Some(b'#') => {
                 let first = self.next();
                 let second = self.next_next();
 
@@ -311,19 +315,21 @@ where
                     });
                 }
 
-                Ok(Token::Delim { value: '#' })
+                Ok(Token::Delim {
+                    value: b'#' as char,
+                })
             }
             // U+0027 APOSTROPHE (')
             // Consume a string token and return it.
-            Some('\'') => self.read_str(None),
+            Some(b'\'') => self.read_str(None),
             // U+0028 LEFT PARENTHESIS (()
             // Return a <(-token>.
-            Some('(') => Ok(tok!("(")),
+            Some(b'(') => Ok(tok!("(")),
             // U+0029 RIGHT PARENTHESIS ())
             // Return a <)-token>.
-            Some(')') => Ok(tok!(")")),
+            Some(b')') => Ok(tok!(")")),
             // U+002B PLUS SIGN (+)
-            Some('+') => {
+            Some(b'+') => {
                 // If the input stream starts with a number, reconsume the current input code
                 // point, consume a numeric token and return it.
                 if self.would_start_number(None, None, None) {
@@ -338,9 +344,9 @@ where
             }
             // U+002C COMMA (,)
             // Return a <comma-token>.
-            Some(',') => Ok(tok!(",")),
+            Some(b',') => Ok(tok!(",")),
             // U+002D HYPHEN-MINUS (-)
-            Some('-') => {
+            Some(b'-') => {
                 // If the input stream starts with a number, reconsume the current input code
                 // point, consume a numeric token, and return it.
                 if self.would_start_number(None, None, None) {
@@ -350,7 +356,7 @@ where
                 }
                 // Otherwise, if the next 2 input code points are U+002D HYPHEN-MINUS U+003E
                 // GREATER-THAN SIGN (->), consume them and return a <CDC-token>.
-                else if self.next() == Some('-') && self.next_next() == Some('>') {
+                else if self.next() == Some(b'-') && self.next_next() == Some(b'>') {
                     self.consume();
                     self.consume();
 
@@ -369,7 +375,7 @@ where
                 Ok(tok!("-"))
             }
             // U+002E FULL STOP (.)
-            Some('.') => {
+            Some(b'.') => {
                 // If the input stream starts with a number, reconsume the current input code
                 // point, consume a numeric token, and return it.
                 if self.would_start_number(None, None, None) {
@@ -384,18 +390,18 @@ where
             }
             // U+003A COLON (:)
             // Return a <colon-token>.
-            Some(':') => Ok(tok!(":")),
+            Some(b':') => Ok(tok!(":")),
             // U+003B SEMICOLON (;)
             // Return a <semicolon-token>.
-            Some(';') => Ok(tok!(";")),
+            Some(b';') => Ok(tok!(";")),
             // U+003C LESS-THAN SIGN (<)
-            Some('<') => {
+            Some(b'<') => {
                 // If the next 3 input code points are U+0021 EXCLAMATION MARK U+002D
                 // HYPHEN-MINUS U+002D HYPHEN-MINUS (!--), consume them and return a
                 // <CDO-token>.
-                if self.next() == Some('!')
-                    && self.next_next() == Some('-')
-                    && self.next_next_next() == Some('-')
+                if self.next() == Some(b'!')
+                    && self.next_next() == Some(b'-')
+                    && self.next_next_next() == Some(b'-')
                 {
                     self.consume(); // !
                     self.consume(); // -
@@ -409,7 +415,7 @@ where
                 Ok(tok!("<"))
             }
             // U+0040 COMMERCIAL AT (@)
-            Some('@') => {
+            Some(b'@') => {
                 let first = self.next();
                 let second = self.next_next();
                 let third = self.next_next_next();
@@ -428,13 +434,15 @@ where
 
                 // Otherwise, return a <delim-token> with its value set to the current input
                 // code point.
-                Ok(Token::Delim { value: '@' })
+                Ok(Token::Delim {
+                    value: b'@' as char,
+                })
             }
             // U+005B LEFT SQUARE BRACKET ([)
             // Return a <[-token>.
-            Some('[') => Ok(tok!("[")),
+            Some(b'[') => Ok(tok!("[")),
             // U+005C REVERSE SOLIDUS (\)
-            Some('\\') => {
+            Some(b'\\') => {
                 // If the input stream starts with a valid escape, reconsume the current input
                 // code point, consume an ident-like token, and return it.
                 if self.is_valid_escape(None, None) {
@@ -447,20 +455,22 @@ where
                 // to the current input code point.
                 self.emit_error(ErrorKind::InvalidEscape);
 
-                Ok(Token::Delim { value: '\\' })
+                Ok(Token::Delim {
+                    value: b'\\' as char,
+                })
             }
             // U+005D RIGHT SQUARE BRACKET (])
             // Return a <]-token>.
-            Some(']') => Ok(tok!("]")),
+            Some(b']') => Ok(tok!("]")),
             // U+007B LEFT CURLY BRACKET ({)
             // Return a <{-token>.
-            Some('{') => Ok(tok!("{")),
+            Some(b'{') => Ok(tok!("{")),
             // U+007D RIGHT CURLY BRACKET (})
             // Return a <}-token>.
-            Some('}') => Ok(tok!("}")),
+            Some(b'}') => Ok(tok!("}")),
             // digit
             // Reconsume the current input code point, consume a numeric token, and return it.
-            Some('0'..='9') => {
+            Some(b'0'..=b'9') => {
                 self.reconsume();
 
                 self.read_numeric()
@@ -477,7 +487,7 @@ where
             None => Err(ErrorKind::Eof),
             // anything else
             // Return a <delim-token> with its value set to the current input code point.
-            Some(c) => Ok(Token::Delim { value: c }),
+            Some(c) => Ok(Token::Delim { value: c as char }),
         }
     }
 
@@ -490,16 +500,16 @@ where
         // the first U+002A ASTERISK (*) followed by a U+002F SOLIDUS (/), or up to an
         // EOF code point. Return to the start of this step.
         // NOTE: We allow to parse line comments under the option.
-        if self.next() == Some('/') && self.next_next() == Some('*') {
+        if self.next() == Some(b'/') && self.next_next() == Some(b'*') {
             let cmt_start = self.input.last_pos();
 
-            while self.next() == Some('/') && self.next_next() == Some('*') {
+            while self.next() == Some(b'/') && self.next_next() == Some(b'*') {
                 self.consume(); // '*'
                 self.consume(); // '/'
 
                 loop {
                     match self.consume() {
-                        Some('*') if self.next() == Some('/') => {
+                        Some(b'*') if self.next() == Some(b'/') => {
                             self.consume(); // '/'
 
                             if self.comments.is_some() {
@@ -532,10 +542,10 @@ where
                 }
             }
         } else if self.config.allow_wrong_line_comments
-            && self.next() == Some('/')
-            && self.next_next() == Some('/')
+            && self.next() == Some(b'/')
+            && self.next_next() == Some(b'/')
         {
-            while self.next() == Some('/') && self.next_next() == Some('/') {
+            while self.next() == Some(b'/') && self.next_next() == Some(b'/') {
                 self.consume(); // '/'
                 self.consume(); // '/'
 
@@ -600,7 +610,7 @@ where
         }
         // Otherwise, if the next input code point is U+0025 PERCENTAGE SIGN (%), consume it. Create
         // a <percentage-token> with the same value as number, and return it.
-        else if next_first == Some('%') {
+        else if next_first == Some(b'%') {
             self.consume();
 
             return Ok(Token::Percentage {
@@ -625,9 +635,9 @@ where
         // Consume a name, and let string be the result.
         let ident_sequence = self.read_ident_sequence()?;
 
-        // If string’s value is an ASCII case-insensitive match for "url", and the next
+        // If string's value is an ASCII case-insensitive match for "url", and the next
         // input code point is U+0028 LEFT PARENTHESIS ((), consume it.
-        if matches_eq_ignore_ascii_case!(ident_sequence.0, "url") && self.next() == Some('(') {
+        if matches_eq_ignore_ascii_case!(ident_sequence.0, "url") && self.next() == Some(b'(') {
             self.consume();
 
             let start_whitespace = self.input.last_pos();
@@ -639,7 +649,7 @@ where
                     if is_whitespace(next) && is_whitespace(next_next) {
                         l.consume();
 
-                        buf.push(next);
+                        buf.push(next as char);
                     } else {
                         break;
                     }
@@ -655,7 +665,7 @@ where
                 // return it.
                 Some(c)
                     if is_whitespace(c)
-                        && (self.next_next() == Some('"') || self.next_next() == Some('\'')) =>
+                        && (self.next_next() == Some(b'"') || self.next_next() == Some(b'\'')) =>
                 {
                     // Override last position because we consumed whitespaces, but they
                     // should not be part of token
@@ -666,7 +676,7 @@ where
                         raw: ident_sequence.1,
                     });
                 }
-                Some('"' | '\'') => {
+                Some(b'"' | b'\'') => {
                     return Ok(Token::Function {
                         value: ident_sequence.0,
                         raw: ident_sequence.1,
@@ -680,7 +690,7 @@ where
         }
         // Otherwise, if the next input code point is U+0028 LEFT PARENTHESIS ((), consume it.
         // Create a <function-token> with its value set to string and return it.
-        else if self.next() == Some('(') {
+        else if self.next() == Some(b'(') {
             self.consume();
 
             return Ok(Token::Function {
@@ -699,7 +709,7 @@ where
 
     // This section describes how to consume a string token from a stream of code
     // points. It returns either a <string-token> or <bad-string-token>.
-    fn read_str(&mut self, maybe_ending_code_point: Option<char>) -> LexResult<Token> {
+    fn read_str(&mut self, maybe_ending_code_point: Option<u8>) -> LexResult<Token> {
         self.with_buf_and_raw_buf(|l, buf, raw| {
             // This algorithm may be called with an ending code point, which denotes the
             // code point that ends the string. If an ending code point is not specified,
@@ -709,7 +719,7 @@ where
             // Initially create a <string-token> with its value set to the empty string.
             // Done above
 
-            raw.push(ending_code_point.unwrap());
+            raw.push(ending_code_point.unwrap() as char);
 
             // Repeatedly consume the next input code point from the stream:
             loop {
@@ -717,7 +727,7 @@ where
                     // ending code point
                     // Return the <string-token>.
                     Some(c) if c == ending_code_point.unwrap() => {
-                        raw.push(c);
+                        raw.push(c as char);
 
                         break;
                     }
@@ -746,7 +756,7 @@ where
                     }
 
                     // U+005C REVERSE SOLIDUS (\)
-                    Some(c) if c == '\\' => {
+                    Some(c) if c == b'\\' => {
                         let next = l.next();
 
                         // If the next input code point is EOF, do nothing.
@@ -757,26 +767,26 @@ where
                         else if l.next().is_some() && is_newline(l.next().unwrap()) {
                             l.consume();
 
-                            raw.push(c);
-                            raw.push(next.unwrap());
+                            raw.push(c as char);
+                            raw.push(next.unwrap() as char);
                         }
                         // Otherwise, (the stream starts with a valid escape) consume an escaped
                         // code point and append the returned code point to
-                        // the <string-token>’s value.
+                        // the <string-token>'s value.
                         else if l.is_valid_escape(None, None) {
                             let escape = l.read_escape()?;
 
                             buf.push(escape.0);
-                            raw.push(c);
+                            raw.push(c as char);
                             raw.push_str(&escape.1);
                         }
                     }
 
                     // Anything else
-                    // Append the current input code point to the <string-token>’s value.
+                    // Append the current input code point to the <string-token>'s value.
                     Some(c) => {
-                        buf.push(c);
-                        raw.push(c);
+                        buf.push(c as char);
+                        raw.push(c as char);
                     }
                 }
             }
@@ -800,7 +810,7 @@ where
                 if is_whitespace(c) {
                     l.consume();
 
-                    raw.push(c);
+                    raw.push(c as char);
                 } else {
                     break;
                 }
@@ -811,7 +821,7 @@ where
                 match l.consume() {
                     // U+0029 RIGHT PARENTHESIS ())
                     // Return the <url-token>.
-                    Some(')') => {
+                    Some(b')') => {
                         return Ok(Token::Url {
                             value: l.atoms.atom(&**out),
                             raw: Box::new(UrlKeyValue(name.1, l.atoms.atom(&**raw))),
@@ -833,13 +843,13 @@ where
                     Some(c) if is_whitespace(c) => {
                         // Consume as much whitespace as possible.
                         let whitespaces: String = l.with_sub_buf(|l, buf| {
-                            buf.push(c);
+                            buf.push(c as char);
 
                             while let Some(c) = l.next() {
                                 if is_whitespace(c) {
                                     l.consume();
 
-                                    buf.push(c);
+                                    buf.push(c as char);
                                 } else {
                                     break;
                                 }
@@ -852,7 +862,7 @@ where
                         // consume it and return the <url-token> (if EOF was
                         // encountered, this is a parse error);
                         match l.next() {
-                            Some(')') => {
+                            Some(b')') => {
                                 l.consume();
 
                                 raw.push_str(&whitespaces);
@@ -894,12 +904,12 @@ where
                     // non-printable code point
                     // This is a parse error. Consume the remnants of a bad url, create a
                     // <bad-url-token>, and return it.
-                    Some(c) if c == '"' || c == '\'' || c == '(' || is_non_printable(c) => {
+                    Some(c) if c == b'"' || c == b'\'' || c == b'(' || is_non_printable(c) => {
                         l.emit_error(ErrorKind::UnexpectedCharInUrl);
 
                         let remnants = l.read_bad_url_remnants()?;
 
-                        raw.push(c);
+                        raw.push(c as char);
                         raw.push_str(&remnants);
 
                         return Ok(Token::BadUrl {
@@ -908,15 +918,15 @@ where
                     }
 
                     // U+005C REVERSE SOLIDUS (\)
-                    Some(c) if c == '\\' => {
+                    Some(c) if c == b'\\' => {
                         // If the stream starts with a valid escape, consume an escaped code point
                         // and append the returned code point to the
-                        // <url-token>’s value.
+                        // <url-token>'s value.
                         if l.is_valid_escape(None, None) {
                             let escaped = l.read_escape()?;
 
                             out.push(escaped.0);
-                            raw.push(c);
+                            raw.push(c as char);
                             raw.push_str(&escaped.1);
                         }
                         // Otherwise, this is a parse error. Consume the remnants of a bad url,
@@ -926,7 +936,7 @@ where
 
                             let remnants = l.read_bad_url_remnants()?;
 
-                            raw.push(c);
+                            raw.push(c as char);
                             raw.push_str(&remnants);
 
                             return Ok(Token::BadUrl {
@@ -936,10 +946,10 @@ where
                     }
 
                     // anything else
-                    // Append the current input code point to the <url-token>’s value.
+                    // Append the current input code point to the <url-token>'s value.
                     Some(c) => {
-                        out.push(c);
-                        raw.push(c);
+                        out.push(c as char);
+                        raw.push(c as char);
                     }
                 }
             }
@@ -957,22 +967,22 @@ where
             match l.consume() {
                 // hex digit
                 Some(c) if is_hex_digit(c) => {
-                    let mut hex = c.to_digit(16).unwrap();
+                    let mut hex = (c as char).to_digit(16).unwrap();
 
-                    buf.push(c);
+                    buf.push(c as char);
 
                     // Consume as many hex digits as possible, but no more than 5.
                     // Note that this means 1-6 hex digits have been consumed in total.
                     for _ in 0..5 {
                         let next = l.next();
-                        let digit = match next.and_then(|c| c.to_digit(16)) {
+                        let digit = match next.and_then(|c| (c as char).to_digit(16)) {
                             Some(v) => v,
                             None => break,
                         };
 
                         l.consume();
 
-                        buf.push(next.unwrap());
+                        buf.push(next.unwrap() as char);
                         hex = hex * 16 + digit;
                     }
 
@@ -983,7 +993,7 @@ where
                         if is_whitespace(next) {
                             l.consume();
 
-                            buf.push(next);
+                            buf.push(next as char);
                         }
                     }
 
@@ -1017,9 +1027,9 @@ where
                 // anything else
                 // Return the current input code point.
                 Some(c) => {
-                    buf.push(c);
+                    buf.push(c as char);
 
-                    Ok((c, (&**buf).into()))
+                    Ok((c as char, (&**buf).into()))
                 }
             }
         })
@@ -1031,9 +1041,9 @@ where
     // or can be called with the input stream itself. In the latter case, the two
     // code points in question are the current input code point and the next input
     // code point, in that order.
-    fn is_valid_escape(&mut self, maybe_first: Option<char>, maybe_second: Option<char>) -> bool {
+    fn is_valid_escape(&mut self, maybe_first: Option<u8>, maybe_second: Option<u8>) -> bool {
         // If the first code point is not U+005C REVERSE SOLIDUS (\), return false.
-        if maybe_first.or_else(|| self.cur()) != Some('\\') {
+        if maybe_first.or_else(|| self.cur()) != Some(b'\\') {
             return false;
         }
 
@@ -1053,16 +1063,16 @@ where
     // the next two input code points, in that order.
     fn would_start_ident(
         &mut self,
-        maybe_first: Option<char>,
-        maybe_second: Option<char>,
-        maybe_third: Option<char>,
+        maybe_first: Option<u8>,
+        maybe_second: Option<u8>,
+        maybe_third: Option<u8>,
     ) -> bool {
         // Look at the first code point:
         let first = maybe_first.or_else(|| self.cur());
 
         match first {
             // U+002D HYPHEN-MINUS
-            Some('-') => {
+            Some(b'-') => {
                 let second = maybe_second.or_else(|| self.next());
 
                 match second {
@@ -1071,7 +1081,7 @@ where
                     Some(c) if is_name_start(c) => true,
                     // or a U+002D HYPHEN-MINUS,
                     // return true.
-                    Some('-') => true,
+                    Some(b'-') => true,
                     // or the second and third code points are a valid escape
                     // return true.
                     Some(_) => {
@@ -1089,7 +1099,7 @@ where
             // U+005C REVERSE SOLIDUS (\)
             // If the first and second code points are a valid escape, return true. Otherwise,
             // return false.
-            Some('\\') => {
+            Some(b'\\') => {
                 let second = maybe_second.or_else(|| self.next());
 
                 self.is_valid_escape(first, second)
@@ -1107,9 +1117,9 @@ where
     #[allow(clippy::needless_return)]
     fn would_start_number(
         &mut self,
-        maybe_first: Option<char>,
-        maybe_second: Option<char>,
-        maybe_third: Option<char>,
+        maybe_first: Option<u8>,
+        maybe_second: Option<u8>,
+        maybe_third: Option<u8>,
     ) -> bool {
         // Look at the first code point:
         let first = maybe_first.or_else(|| self.cur());
@@ -1117,13 +1127,13 @@ where
         match first {
             // U+002B PLUS SIGN (+)
             // U+002D HYPHEN-MINUS (-)
-            Some('+') | Some('-') => {
+            Some(b'+') | Some(b'-') => {
                 match maybe_second.or_else(|| self.next()) {
                     // If the second code point is a digit, return true.
                     Some(second) if second.is_ascii_digit() => return true,
                     // Otherwise, if the second code point is a U+002E FULL STOP (.) and the
                     // third code point is a digit, return true.
-                    Some('.') => {
+                    Some(b'.') => {
                         if let Some(third) = maybe_third.or_else(|| self.next_next()) {
                             if third.is_ascii_digit() {
                                 return true;
@@ -1137,7 +1147,7 @@ where
                 };
             }
             // U+002E FULL STOP (.)
-            Some('.') => {
+            Some(b'.') => {
                 // If the second code point is a digit, return true.
                 if let Some(second) = self.next() {
                     if second.is_ascii_digit() {
@@ -1172,8 +1182,8 @@ where
                     // name code point
                     // Append the code point to result.
                     Some(c) if is_name(c) => {
-                        buf.push(c);
-                        raw.push(c);
+                        buf.push(c as char);
+                        raw.push(c as char);
                     }
                     // the stream starts with a valid escape
                     // Consume an escaped code point. Append the returned code point to result.
@@ -1181,7 +1191,7 @@ where
                         let escaped = l.read_escape()?;
 
                         buf.push(escaped.0);
-                        raw.push(c);
+                        raw.push(c as char);
                         raw.push_str(&escaped.1);
                     }
                     // anything else
@@ -1209,10 +1219,10 @@ where
             // (-), consume it and append it to repr.
             let next = l.next();
 
-            if next == Some('+') || next == Some('-') {
+            if next == Some(b'+') || next == Some(b'-') {
                 l.consume();
 
-                out.push(next.unwrap());
+                out.push(next.unwrap() as char);
             }
 
             // While the next input code point is a digit, consume it and append it to repr.
@@ -1220,7 +1230,7 @@ where
                 if c.is_ascii_digit() {
                     l.consume();
 
-                    out.push(c);
+                    out.push(c as char);
                 } else {
                     break;
                 }
@@ -1230,7 +1240,7 @@ where
             // then:
             let next = l.next();
 
-            if next == Some('.') {
+            if next == Some(b'.') {
                 if let Some(n) = l.next_next() {
                     if n.is_ascii_digit() {
                         // Consume them.
@@ -1238,8 +1248,8 @@ where
                         l.consume();
 
                         // Append them to repr.
-                        out.push(next.unwrap());
-                        out.push(n);
+                        out.push(next.unwrap() as char);
+                        out.push(n as char);
 
                         // Set type to "number".
                         type_flag = NumberType::Number;
@@ -1250,7 +1260,7 @@ where
                             if c.is_ascii_digit() {
                                 l.consume();
 
-                                out.push(c);
+                                out.push(c as char);
                             } else {
                                 break;
                             }
@@ -1264,12 +1274,12 @@ where
             // (-) or U+002B PLUS SIGN (+), followed by a digit, then:
             let next = l.next();
 
-            if next == Some('E') || next == Some('e') {
+            if next == Some(b'E') || next == Some(b'e') {
                 let next_next = l.next_next();
                 let next_next_next = l.next_next_next();
 
-                if (next_next == Some('-')
-                    || next_next == Some('+')
+                if (next_next == Some(b'-')
+                    || next_next == Some(b'+')
                         && next_next_next.is_some()
                         && next_next_next.unwrap().is_ascii_digit())
                     || next_next.is_some() && next_next.unwrap().is_ascii_digit()
@@ -1279,8 +1289,8 @@ where
                     l.consume();
 
                     // Append them to repr.
-                    out.push(next.unwrap());
-                    out.push(next_next.unwrap());
+                    out.push(next.unwrap() as char);
+                    out.push(next_next.unwrap() as char);
 
                     // Set type to "number".
                     type_flag = NumberType::Number;
@@ -1291,7 +1301,7 @@ where
                         if c.is_ascii_digit() {
                             l.consume();
 
-                            out.push(c);
+                            out.push(c as char);
                         } else {
                             break;
                         }
@@ -1326,8 +1336,8 @@ where
                     // U+0029 RIGHT PARENTHESIS ())
                     // EOF
                     // Return.
-                    Some(c @ ')') => {
-                        raw.push(c);
+                    Some(c @ b')') => {
+                        raw.push(c as char);
 
                         break;
                     }
@@ -1340,13 +1350,13 @@ where
                         // ("\)") to be encountered without ending the <bad-url-token>.
                         let escaped = l.read_escape()?;
 
-                        raw.push(c);
+                        raw.push(c as char);
                         raw.push_str(&escaped.1);
                     }
                     // anything else
                     // Do nothing.
                     Some(c) => {
-                        raw.push(c);
+                        raw.push(c as char);
                     }
                 }
             }
@@ -1357,61 +1367,61 @@ where
 }
 
 #[inline(always)]
-fn is_digit(c: char) -> bool {
+fn is_digit(c: u8) -> bool {
     c.is_ascii_digit()
 }
 
 #[inline(always)]
-fn is_hex_digit(c: char) -> bool {
+fn is_hex_digit(c: u8) -> bool {
     match c {
         c if is_digit(c) => true,
-        'A'..='F' => true,
-        'a'..='f' => true,
+        b'A'..=b'F' => true,
+        b'a'..=b'f' => true,
         _ => false,
     }
 }
 
 #[inline(always)]
-fn is_uppercase_letter(c: char) -> bool {
+fn is_uppercase_letter(c: u8) -> bool {
     c.is_ascii_uppercase()
 }
 
 #[inline(always)]
-fn is_lowercase_letter(c: char) -> bool {
+fn is_lowercase_letter(c: u8) -> bool {
     c.is_ascii_lowercase()
 }
 
 #[inline(always)]
-fn is_letter(c: char) -> bool {
+fn is_letter(c: u8) -> bool {
     is_uppercase_letter(c) || is_lowercase_letter(c)
 }
 
 #[inline(always)]
-fn is_non_ascii(c: char) -> bool {
-    c as u32 >= 0x80
+fn is_non_ascii(c: u8) -> bool {
+    c >= 0x80
 }
 
 #[inline(always)]
-fn is_name_start(c: char) -> bool {
-    matches!(c, c if is_letter(c) || is_non_ascii(c) || c == '_' || c == '\x00')
+fn is_name_start(c: u8) -> bool {
+    matches!(c, c if is_letter(c) || is_non_ascii(c) || c == b'_' || c == 0x00)
 }
 
 #[inline(always)]
-fn is_name(c: char) -> bool {
-    is_name_start(c) || matches!(c, c if c.is_ascii_digit() || c == '-')
+fn is_name(c: u8) -> bool {
+    is_name_start(c) || matches!(c, c if c.is_ascii_digit() || c == b'-')
 }
 
 #[inline(always)]
-fn is_non_printable(c: char) -> bool {
-    matches!(c, '\x00'..='\x08' | '\x0B' | '\x0E'..='\x1F' | '\x7F')
+fn is_non_printable(c: u8) -> bool {
+    matches!(c, 0x00..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F)
 }
 
 #[inline(always)]
-fn is_newline(c: char) -> bool {
-    matches!(c, '\n' | '\r' | '\x0C')
+fn is_newline(c: u8) -> bool {
+    matches!(c, b'\n' | b'\r' | 0x0c)
 }
 
 #[inline(always)]
-fn is_whitespace(c: char) -> bool {
-    matches!(c, c if c == ' ' || c == '\t' || is_newline(c))
+fn is_whitespace(c: u8) -> bool {
+    matches!(c, c if c == b' ' || c == b'\t' || is_newline(c))
 }
diff --git a/crates/swc_html_parser/src/lexer/mod.rs b/crates/swc_html_parser/src/lexer/mod.rs
index d9e6b4094f30..8d98f63b9e59 100644
--- a/crates/swc_html_parser/src/lexer/mod.rs
+++ b/crates/swc_html_parser/src/lexer/mod.rs
@@ -102,7 +102,7 @@ where
     I: Input<'a>,
 {
     input: I,
-    cur: Option<char>,
+    cur: Option<u8>,
     cur_pos: BytePos,
     last_token_pos: BytePos,
     finished: bool,
@@ -154,7 +154,7 @@ where
 
         // A leading Byte Order Mark (BOM) causes the character encoding argument to be
         // ignored and will itself be skipped.
-        if lexer.input.is_at_start() && lexer.input.cur() == Some('\u{feff}') {
+        if lexer.input.is_at_start() && lexer.input.cur_as_char() == Some('\u{feff}') {
             unsafe {
                 // Safety: We know that the current character is '\u{feff}'.
                 lexer.input.bump();
@@ -216,7 +216,7 @@ where
     I: Input<'a>,
 {
     #[inline(always)]
-    fn next(&mut self) -> Option<char> {
+    fn next(&mut self) -> Option<u8> {
         self.input.cur()
     }
 
@@ -228,8 +228,8 @@ where
     // Postpone validation for each character for perf reasons and do it in
     // `anything else`
     #[inline(always)]
-    fn validate_input_stream_character(&mut self, c: char) {
-        let code = c as u32;
+    fn validate_input_stream_character(&mut self, c: u8) {
+        let code = (c as char) as u32;
 
         if is_surrogate(code) {
             self.emit_error(ErrorKind::SurrogateInInputStream);
@@ -268,7 +268,7 @@ where
     }
 
     #[inline(always)]
-    fn consume_next_char(&mut self) -> Option<char> {
+    fn consume_next_char(&mut self) -> Option<u8> {
         // The next input character is the first character in the input stream that has
         // not yet been consumed or explicitly ignored by the requirements in this
         // section. Initially, the next input character is the first character in the
@@ -403,51 +403,51 @@ where
         });
     }
 
-    fn append_raw_to_doctype_token(&mut self, c: char) {
+    fn append_raw_to_doctype_token(&mut self, c: u8) {
         let b = self.sub_buf.clone();
         let mut sub_buf = b.borrow_mut();
 
-        let is_cr = c == '\r';
+        let is_cr = c == b'\r';
 
         if is_cr {
-            sub_buf.push(c);
+            sub_buf.push(c as char);
 
-            if self.input.cur() == Some('\n') {
+            if self.input.cur() == Some(b'\n') {
                 unsafe {
-                    // Safety: cur() is Some('\n')
+                    // Safety: cur() is Some(b'\n')
                     self.input.bump();
                 }
 
                 sub_buf.push('\n');
             }
         } else {
-            sub_buf.push(c);
+            sub_buf.push(c as char);
         }
     }
 
     fn append_to_doctype_token(
         &mut self,
-        name: Option<char>,
-        public_id: Option<char>,
-        system_id: Option<char>,
+        name: Option<u8>,
+        public_id: Option<u8>,
+        system_id: Option<u8>,
     ) {
         let b = self.buf.clone();
         let mut buf = b.borrow_mut();
 
         if let Some(name) = name {
-            buf.push(name);
+            buf.push(name as char);
         }
 
         if let Some(public_id) = public_id {
-            buf.push(public_id);
+            buf.push(public_id as char);
         }
 
         if let Some(system_id) = system_id {
-            buf.push(system_id);
+            buf.push(system_id as char);
         }
     }
 
-    fn consume_and_append_to_doctype_token_name<F>(&mut self, c: char, f: F)
+    fn consume_and_append_to_doctype_token_name<F>(&mut self, c: u8, f: F)
     where
         F: Fn(char) -> bool,
     {
@@ -456,8 +456,8 @@ where
         let b = self.sub_buf.clone();
         let mut sub_buf = b.borrow_mut();
 
-        buf.push(c.to_ascii_lowercase());
-        sub_buf.push(c);
+        buf.push((c as char).to_ascii_lowercase());
+        sub_buf.push(c as char);
 
         let value = self.input.uncons_while(f);
 
@@ -465,7 +465,7 @@ where
         sub_buf.push_str(value);
     }
 
-    fn consume_and_append_to_doctype_token_public_id<F>(&mut self, c: char, f: F)
+    fn consume_and_append_to_doctype_token_public_id<F>(&mut self, c: u8, f: F)
     where
         F: Fn(char) -> bool,
     {
@@ -474,23 +474,23 @@ where
         let b = self.sub_buf.clone();
         let mut sub_buf = b.borrow_mut();
 
-        let is_cr = c == '\r';
+        let is_cr = c == b'\r';
 
         if is_cr {
             buf.push('\n');
-            sub_buf.push(c);
+            sub_buf.push(c as char);
 
-            if self.input.cur() == Some('\n') {
+            if self.input.cur() == Some(b'\n') {
                 unsafe {
-                    // Safety: cur() is Some('\n')
+                    // Safety: cur() is Some(b'\n')
                     self.input.bump();
                 }
 
                 sub_buf.push('\n');
             }
         } else {
-            buf.push(c);
-            sub_buf.push(c);
+            buf.push(c as char);
+            sub_buf.push(c as char);
         }
 
         let value = self.input.uncons_while(f);
@@ -499,7 +499,7 @@ where
         sub_buf.push_str(value);
     }
 
-    fn consume_and_append_to_doctype_token_system_id<F>(&mut self, c: char, f: F)
+    fn consume_and_append_to_doctype_token_system_id<F>(&mut self, c: u8, f: F)
     where
         F: Fn(char) -> bool,
     {
@@ -508,23 +508,23 @@ where
         let b = self.sub_buf.clone();
         let mut sub_buf = b.borrow_mut();
 
-        let is_cr = c == '\r';
+        let is_cr = c == b'\r';
 
         if is_cr {
             buf.push('\n');
-            sub_buf.push(c);
+            sub_buf.push(c as char);
 
-            if self.input.cur() == Some('\n') {
+            if self.input.cur() == Some(b'\n') {
                 unsafe {
-                    // Safety: cur() is Some('\n')
+                    // Safety: cur() is Some(b'\n')
                     self.input.bump();
                 }
 
                 sub_buf.push('\n');
             }
         } else {
-            buf.push(c);
-            sub_buf.push(c);
+            buf.push(c as char);
+            sub_buf.push(c as char);
         }
 
         let value = self.input.uncons_while(f);
@@ -639,7 +639,7 @@ where
         });
     }
 
-    fn append_to_tag_token_name(&mut self, c: char, raw_c: char) {
+    fn append_to_tag_token_name(&mut self, c: char, raw_c: u8) {
         if let Some(Token::StartTag { .. } | Token::EndTag { .. }) = &mut self.current_token {
             let b = self.buf.clone();
             let mut buf = b.borrow_mut();
@@ -647,11 +647,11 @@ where
             let mut sub_buf = b.borrow_mut();
 
             buf.push(c);
-            sub_buf.push(raw_c);
+            sub_buf.push(raw_c as char);
         }
     }
 
-    fn consume_and_append_to_tag_token_name<F>(&mut self, c: char, f: F)
+    fn consume_and_append_to_tag_token_name<F>(&mut self, c: u8, f: F)
     where
         F: Fn(char) -> bool,
     {
@@ -660,8 +660,8 @@ where
         let b = self.sub_buf.clone();
         let mut sub_buf = b.borrow_mut();
 
-        buf.push(c.to_ascii_lowercase());
-        sub_buf.push(c);
+        buf.push((c as char).to_ascii_lowercase());
+        sub_buf.push(c as char);
 
         let value = self.input.uncons_while(f);
 
@@ -712,17 +712,17 @@ where
         }
     }
 
-    fn append_to_attribute_token_name(&mut self, c: char, raw_c: char) {
+    fn append_to_attribute_token_name(&mut self, c: u8, raw_c: u8) {
         let b = self.buf.clone();
         let mut buf = b.borrow_mut();
         let b = self.sub_buf.clone();
         let mut sub_buf = b.borrow_mut();
 
-        buf.push(c);
-        sub_buf.push(raw_c);
+        buf.push(c as char);
+        sub_buf.push(raw_c as char);
     }
 
-    fn consume_and_append_to_attribute_token_name<F>(&mut self, c: char, f: F)
+    fn consume_and_append_to_attribute_token_name<F>(&mut self, c: u8, f: F)
     where
         F: FnMut(char) -> bool,
     {
@@ -731,8 +731,8 @@ where
         let b = self.sub_buf.clone();
         let mut sub_buf = b.borrow_mut();
 
-        buf.push(c.to_ascii_lowercase());
-        sub_buf.push(c);
+        buf.push((c as char).to_ascii_lowercase());
+        sub_buf.push(c as char);
 
         let value = self.input.uncons_while(f);
 
@@ -740,7 +740,7 @@ where
         sub_buf.push_str(value);
     }
 
-    fn consume_and_append_to_attribute_token_name_and_temp_buf<F>(&mut self, c: char, f: F)
+    fn consume_and_append_to_attribute_token_name_and_temp_buf<F>(&mut self, c: u8, f: F)
     where
         F: FnMut(char) -> bool,
     {
@@ -749,10 +749,10 @@ where
         let b = self.sub_buf.clone();
         let mut sub_buf = b.borrow_mut();
 
-        buf.push(c.to_ascii_lowercase());
-        sub_buf.push(c);
+        buf.push((c as char).to_ascii_lowercase());
+        sub_buf.push(c as char);
 
-        self.temporary_buffer.push(c);
+        self.temporary_buffer.push(c as char);
 
         let value = self.input.uncons_while(f);
 
@@ -814,9 +814,9 @@ where
             buf.push('\n');
             sub_buf.push('\r');
 
-            if self.input.cur() == Some('\n') {
+            if self.input.cur() == Some(b'\n') {
                 unsafe {
-                    // Safety: cur() is Some('\n')
+                    // Safety: cur() is Some(b'\n')
                     self.input.bump();
                 }
 
@@ -833,7 +833,7 @@ where
         }
     }
 
-    fn consume_and_append_to_attribute_token_value<F>(&mut self, c: char, f: F)
+    fn consume_and_append_to_attribute_token_value<F>(&mut self, c: u8, f: F)
     where
         F: FnMut(char) -> bool,
     {
@@ -842,23 +842,23 @@ where
         let b = self.sub_buf.clone();
         let mut sub_buf = b.borrow_mut();
 
-        let is_cr = c == '\r';
+        let is_cr = c == b'\r';
 
         if is_cr {
             buf.push('\n');
-            sub_buf.push(c);
+            sub_buf.push(c as char);
 
-            if self.input.cur() == Some('\n') {
+            if self.input.cur() == Some(b'\n') {
                 unsafe {
-                    // Safety: cur() is Some('\n')
+                    // Safety: cur() is Some(b'\n')
                     self.input.bump();
                 }
 
                 sub_buf.push('\n');
             }
         } else {
-            buf.push(c);
-            sub_buf.push(c);
+            buf.push(c as char);
+            sub_buf.push(c as char);
         }
 
         let value = self.input.uncons_while(f);
@@ -964,7 +964,7 @@ where
         sub_buf.push(raw_c);
     }
 
-    fn consume_and_append_to_comment_token<F>(&mut self, c: char, f: F)
+    fn consume_and_append_to_comment_token<F>(&mut self, c: u8, f: F)
     where
         F: Fn(char) -> bool,
     {
@@ -973,23 +973,23 @@ where
         let b = self.sub_buf.clone();
         let mut sub_buf = b.borrow_mut();
 
-        let is_cr = c == '\r';
+        let is_cr = c == b'\r';
 
         if is_cr {
             buf.push('\n');
-            sub_buf.push(c);
+            sub_buf.push(c as char);
 
-            if self.input.cur() == Some('\n') {
+            if self.input.cur() == Some(b'\n') {
                 unsafe {
-                    // Safety: cur() is Some('\n')
+                    // Safety: cur() is Some(b'\n')
                     self.input.bump();
                 }
 
                 sub_buf.push('\n');
             }
         } else {
-            buf.push(c);
-            sub_buf.push(c);
+            buf.push(c as char);
+            sub_buf.push(c as char);
         }
 
         let value = self.input.uncons_while(f);
@@ -1018,19 +1018,19 @@ where
     }
 
     #[inline(always)]
-    fn emit_character_token(&mut self, value: char) {
+    fn emit_character_token(&mut self, value: u8) {
         self.emit_token(Token::Character {
-            value,
+            value: value as char,
             raw: Some(Raw::Same),
         });
     }
 
     #[inline(always)]
-    fn emit_character_token_with_raw(&mut self, c: char, raw_c: char) {
+    fn emit_character_token_with_raw(&mut self, c: char, raw_c: u8) {
         let b = self.buf.clone();
         let mut buf = b.borrow_mut();
 
-        buf.push(raw_c);
+        buf.push(raw_c as char);
 
         self.emit_token(Token::Character {
             value: c,
@@ -1040,18 +1040,18 @@ where
         buf.clear();
     }
 
-    fn handle_raw_and_emit_character_token(&mut self, c: char) {
-        let is_cr = c == '\r';
+    fn handle_raw_and_emit_character_token(&mut self, c: u8) {
+        let is_cr = c == b'\r';
 
         if is_cr {
             let b = self.buf.clone();
             let mut buf = b.borrow_mut();
 
-            buf.push(c);
+            buf.push(c as char);
 
-            if self.input.cur() == Some('\n') {
+            if self.input.cur() == Some(b'\n') {
                 unsafe {
-                    // Safety: cur() is Some('\n')
+                    // Safety: cur() is Some(b'\n')
                     self.input.bump();
                 }
                 buf.push('\n');
@@ -1065,7 +1065,7 @@ where
             buf.clear();
         } else {
             self.emit_token(Token::Character {
-                value: c,
+                value: c as char,
                 raw: Some(Raw::Same),
             });
         }
@@ -1103,19 +1103,19 @@ where
                     // U+0026 AMPERSAND (&)
                     // Set the return state to the data state. Switch to the character reference
                     // state.
-                    Some('&') => {
+                    Some(b'&') => {
                         self.return_state = State::Data;
                         self.state = State::CharacterReference;
                     }
                     // U+003C LESS-THAN SIGN (<)
                     // Switch to the tag open state.
-                    Some('<') => {
+                    Some(b'<') => {
                         self.state = State::TagOpen;
                     }
                     // U+0000 NULL
                     // This is an unexpected-null-character parse error. Emit the current input
                     // character as a character token.
-                    Some(c @ '\x00') => {
+                    Some(c @ b'\x00') => {
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
                         self.emit_character_token(c);
                     }
@@ -1141,19 +1141,19 @@ where
                     // U+0026 AMPERSAND (&)
                     // Set the return state to the RCDATA state. Switch to the character
                     // reference state.
-                    Some('&') => {
+                    Some(b'&') => {
                         self.return_state = State::Rcdata;
                         self.state = State::CharacterReference;
                     }
                     // U+003C LESS-THAN SIGN (<)
                     // Switch to the RCDATA less-than sign state.
-                    Some('<') => {
+                    Some(b'<') => {
                         self.state = State::RcdataLessThanSign;
                     }
                     // U+0000 NULL
                     // This is an unexpected-null-character parse error. Emit a U+FFFD
                     // REPLACEMENT CHARACTER character token.
-                    Some(c @ '\x00') => {
+                    Some(c @ b'\x00') => {
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
                         self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
                     }
@@ -1178,11 +1178,11 @@ where
                 match self.consume_next_char() {
                     // U+003C LESS-THAN SIGN (<)
                     // Switch to the RAWTEXT less-than sign state.
-                    Some('<') => self.state = State::RawtextLessThanSign,
+                    Some(b'<') => self.state = State::RawtextLessThanSign,
                     // U+0000 NULL
                     // This is an unexpected-null-character parse error. Emit a U+FFFD
                     // REPLACEMENT CHARACTER character token.
-                    Some(c @ '\x00') => {
+                    Some(c @ b'\x00') => {
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
                         self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
                     }
@@ -1207,11 +1207,11 @@ where
                 match self.consume_next_char() {
                     // U+003C LESS-THAN SIGN (<)
                     // Switch to the script data less-than sign state.
-                    Some('<') => self.state = State::ScriptDataLessThanSign,
+                    Some(b'<') => self.state = State::ScriptDataLessThanSign,
                     // U+0000 NULL
                     // This is an unexpected-null-character parse error. Emit a U+FFFD
                     // REPLACEMENT CHARACTER character token.
-                    Some(c @ '\x00') => {
+                    Some(c @ b'\x00') => {
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
                         self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
                     }
@@ -1237,7 +1237,7 @@ where
                     // U+0000 NULL
                     // This is an unexpected-null-character parse error. Emit a U+FFFD
                     // REPLACEMENT CHARACTER character token.
-                    Some(c @ '\x00') => {
+                    Some(c @ b'\x00') => {
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
                         self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
                     }
@@ -1262,12 +1262,12 @@ where
                 match self.consume_next_char() {
                     // U+002F SOLIDUS (/)
                     // Switch to the end tag open state.
-                    Some('/') => {
+                    Some(b'/') => {
                         self.state = State::EndTagOpen;
                     }
                     // U+0021 EXCLAMATION MARK (!)
                     // Switch to the markup declaration open state.
-                    Some('!') => {
+                    Some(b'!') => {
                         self.state = State::MarkupDeclarationOpen;
                     }
                     // ASCII alpha
@@ -1281,7 +1281,7 @@ where
                     // This is an unexpected-question-mark-instead-of-tag-name parse error.
                     // Create a comment token whose data is the empty string. Reconsume in the
                     // bogus comment state.
-                    Some('?') => {
+                    Some(b'?') => {
                         self.emit_error(ErrorKind::UnexpectedQuestionMarkInsteadOfTagName);
                         self.create_comment_token("<");
                         self.reconsume_in_state(State::BogusComment);
@@ -1291,7 +1291,7 @@ where
                     // character token and an end-of-file token.
                     None => {
                         self.emit_error(ErrorKind::EofBeforeTagName);
-                        self.emit_character_token('<');
+                        self.emit_character_token(b'<');
                         self.emit_token(Token::Eof);
 
                         return Ok(());
@@ -1301,7 +1301,7 @@ where
                     // LESS-THAN SIGN character token. Reconsume in the data state.
                     _ => {
                         self.emit_error(ErrorKind::InvalidFirstCharacterOfTagName);
-                        self.emit_character_token('<');
+                        self.emit_character_token(b'<');
                         self.reconsume_in_state(State::Data);
                     }
                 }
@@ -1319,7 +1319,7 @@ where
                     }
                     // U+003E GREATER-THAN SIGN (>)
                     // This is a missing-end-tag-name parse error. Switch to the data state.
-                    Some('>') => {
+                    Some(b'>') => {
                         self.emit_error(ErrorKind::MissingEndTagName);
                         self.state = State::Data;
                     }
@@ -1329,8 +1329,8 @@ where
                     // token.
                     None => {
                         self.emit_error(ErrorKind::EofBeforeTagName);
-                        self.emit_character_token('<');
-                        self.emit_character_token('/');
+                        self.emit_character_token(b'<');
+                        self.emit_character_token(b'/');
                         self.emit_token(Token::Eof);
 
                         return Ok(());
@@ -1362,13 +1362,13 @@ where
                     }
                     // U+002F SOLIDUS (/)
                     // Switch to the self-closing start tag state.
-                    Some('/') => {
+                    Some(b'/') => {
                         self.finish_tag_token_name();
                         self.state = State::SelfClosingStartTag;
                     }
                     // U+003E GREATER-THAN SIGN (>)
                     // Switch to the data state. Emit the current tag token.
-                    Some('>') => {
+                    Some(b'>') => {
                         self.finish_tag_token_name();
                         self.state = State::Data;
                         self.emit_tag_token();
@@ -1377,12 +1377,12 @@ where
                     // Append the lowercase version of the current input character (add 0x0020
                     // to the character's code point) to the current tag token's tag name.
                     Some(c) if is_ascii_upper_alpha(c) => {
-                        self.consume_and_append_to_tag_token_name(c, is_ascii_upper_alpha);
+                        self.consume_and_append_to_tag_token_name(c, is_ascii_upper_alpha_char);
                     }
                     // U+0000 NULL
                     // This is an unexpected-null-character parse error. Append a U+FFFD
                     // REPLACEMENT CHARACTER character to the current tag token's tag name.
-                    Some(c @ '\x00') => {
+                    Some(c @ b'\x00') => {
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
                         self.append_to_tag_token_name(REPLACEMENT_CHARACTER, c);
                     }
@@ -1406,9 +1406,9 @@ where
 
                             // List of characters from above to stop consumption and a certain
                             // branch took control
-                            !is_spacy(c)
+                            !is_spacy_char(c)
                                 && !matches!(c, '/' | '>' | '\x00')
-                                && !is_ascii_upper_alpha(c)
+                                && !is_ascii_upper_alpha_char(c)
                         });
                     }
                 }
@@ -1420,7 +1420,7 @@ where
                     // U+002F SOLIDUS (/)
                     // Set the temporary buffer to the empty string. Switch to the RCDATA end
                     // tag open state.
-                    Some('/') => {
+                    Some(b'/') => {
                         self.temporary_buffer.clear();
                         self.state = State::RcdataEndTagOpen;
                     }
@@ -1428,7 +1428,7 @@ where
                     // Emit a U+003C LESS-THAN SIGN character token. Reconsume in the RCDATA
                     // state.
                     _ => {
-                        self.emit_character_token('<');
+                        self.emit_character_token(b'<');
                         self.reconsume_in_state(State::Rcdata);
                     }
                 }
@@ -1448,8 +1448,8 @@ where
                     // Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS
                     // character token. Reconsume in the RCDATA state.
                     _ => {
-                        self.emit_character_token('<');
-                        self.emit_character_token('/');
+                        self.emit_character_token(b'<');
+                        self.emit_character_token(b'/');
                         self.reconsume_in_state(State::Rcdata);
                     }
                 }
@@ -1458,8 +1458,8 @@ where
             State::RcdataEndTagName => {
                 let anything_else = |lexer: &mut Lexer<'a, I>| {
                     lexer.finish_tag_token_name();
-                    lexer.emit_character_token('<');
-                    lexer.emit_character_token('/');
+                    lexer.emit_character_token(b'<');
+                    lexer.emit_character_token(b'/');
                     lexer.emit_temporary_buffer_as_character_tokens();
                     lexer.reconsume_in_state(State::Rcdata);
                 };
@@ -1487,7 +1487,7 @@ where
                     // If the current end tag token is an appropriate end tag token, then switch
                     // to the self-closing start tag state. Otherwise, treat it as per the
                     // "anything else" entry below.
-                    Some('/') => {
+                    Some(b'/') => {
                         if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
                             self.finish_tag_token_name();
                             self.state = State::SelfClosingStartTag;
@@ -1499,7 +1499,7 @@ where
                     // If the current end tag token is an appropriate end tag token, then switch
                     // to the data state and emit the current tag token. Otherwise, treat it as
                     // per the "anything else" entry below.
-                    Some('>') => {
+                    Some(b'>') => {
                         if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
                             self.finish_tag_token_name();
                             self.state = State::Data;
@@ -1513,19 +1513,17 @@ where
                     // to the character's code point) to the current tag token's tag name.
                     // Append the current input character to the temporary buffer.
                     Some(c) if is_ascii_upper_alpha(c) => {
-                        self.consume_and_append_to_attribute_token_name_and_temp_buf(
-                            c,
-                            is_ascii_upper_alpha,
-                        );
+                        self.consume_and_append_to_attribute_token_name_and_temp_buf(c, |ch| {
+                            is_ascii_upper_alpha(ch as u8)
+                        });
                     }
                     // ASCII lower alpha
                     // Append the current input character to the current tag token's tag name.
                     // Append the current input character to the temporary buffer.
                     Some(c) if is_ascii_lower_alpha(c) => {
-                        self.consume_and_append_to_attribute_token_name_and_temp_buf(
-                            c,
-                            is_ascii_lower_alpha,
-                        );
+                        self.consume_and_append_to_attribute_token_name_and_temp_buf(c, |ch| {
+                            is_ascii_lower_alpha(ch as u8)
+                        });
                     }
                     // Anything else
                     // Emit a U+003C LESS-THAN SIGN character token, a U+002F SOLIDUS character
@@ -1544,7 +1542,7 @@ where
                     // U+002F SOLIDUS (/)
                     // Set the temporary buffer to the empty string. Switch to the RAWTEXT end
                     // tag open state.
-                    Some('/') => {
+                    Some(b'/') => {
                         self.temporary_buffer.clear();
                         self.state = State::RawtextEndTagOpen;
                     }
@@ -1552,7 +1550,7 @@ where
                     // Emit a U+003C LESS-THAN SIGN character token. Reconsume in the RAWTEXT
                     // state.
                     _ => {
-                        self.emit_character_token('<');
+                        self.emit_character_token(b'<');
                         self.reconsume_in_state(State::Rawtext);
                     }
                 }
@@ -1572,8 +1570,8 @@ where
                     // Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS
                     // character token. Reconsume in the RAWTEXT state.
                     _ => {
-                        self.emit_character_token('<');
-                        self.emit_character_token('/');
+                        self.emit_character_token(b'<');
+                        self.emit_character_token(b'/');
                         self.reconsume_in_state(State::Rawtext);
                     }
                 }
@@ -1582,8 +1580,8 @@ where
             State::RawtextEndTagName => {
                 let anything_else = |lexer: &mut Lexer<'a, I>| {
                     lexer.finish_tag_token_name();
-                    lexer.emit_character_token('<');
-                    lexer.emit_character_token('/');
+                    lexer.emit_character_token(b'<');
+                    lexer.emit_character_token(b'/');
                     lexer.emit_temporary_buffer_as_character_tokens();
                     lexer.reconsume_in_state(State::Rawtext);
                 };
@@ -1611,7 +1609,7 @@ where
                     // If the current end tag token is an appropriate end tag token, then switch
                     // to the self-closing start tag state. Otherwise, treat it as per the
                     // "anything else" entry below.
-                    Some('/') => {
+                    Some(b'/') => {
                         if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
                             self.finish_tag_token_name();
                             self.state = State::SelfClosingStartTag;
@@ -1623,7 +1621,7 @@ where
                     // If the current end tag token is an appropriate end tag token, then switch
                     // to the data state and emit the current tag token. Otherwise, treat it as
                     // per the "anything else" entry below.
-                    Some('>') => {
+                    Some(b'>') => {
                         if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
                             self.finish_tag_token_name();
                             self.state = State::Data;
@@ -1637,19 +1635,17 @@ where
                     // to the character's code point) to the current tag token's tag name.
                     // Append the current input character to the temporary buffer.
                     Some(c) if is_ascii_upper_alpha(c) => {
-                        self.consume_and_append_to_attribute_token_name_and_temp_buf(
-                            c,
-                            is_ascii_upper_alpha,
-                        );
+                        self.consume_and_append_to_attribute_token_name_and_temp_buf(c, |ch| {
+                            is_ascii_upper_alpha(ch as u8)
+                        });
                     }
                     // ASCII lower alpha
                     // Append the current input character to the current tag token's tag name.
                     // Append the current input character to the temporary buffer.
                     Some(c) if is_ascii_lower_alpha(c) => {
-                        self.consume_and_append_to_attribute_token_name_and_temp_buf(
-                            c,
-                            is_ascii_lower_alpha,
-                        );
+                        self.consume_and_append_to_attribute_token_name_and_temp_buf(c, |ch| {
+                            is_ascii_lower_alpha(ch as u8)
+                        });
                     }
                     // Anything else
                     // Emit a U+003C LESS-THAN SIGN character token, a U+002F SOLIDUS character
@@ -1668,23 +1664,23 @@ where
                     // U+002F SOLIDUS (/)
                     // Set the temporary buffer to the empty string. Switch to the script data
                     // end tag open state.
-                    Some('/') => {
+                    Some(b'/') => {
                         self.temporary_buffer.clear();
                         self.state = State::ScriptDataEndTagOpen;
                     }
                     // U+0021 EXCLAMATION MARK (!)
                     // Switch to the script data escape start state. Emit a U+003C LESS-THAN
                     // SIGN character token and a U+0021 EXCLAMATION MARK character token.
-                    Some('!') => {
+                    Some(b'!') => {
                         self.state = State::ScriptDataEscapeStart;
-                        self.emit_character_token('<');
-                        self.emit_character_token('!');
+                        self.emit_character_token(b'<');
+                        self.emit_character_token(b'!');
                     }
                     // Anything else
                     // Emit a U+003C LESS-THAN SIGN character token. Reconsume in the script
                     // data state.
                     _ => {
-                        self.emit_character_token('<');
+                        self.emit_character_token(b'<');
                         self.reconsume_in_state(State::ScriptData);
                     }
                 }
@@ -1704,8 +1700,8 @@ where
                     // Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS
                     // character token. Reconsume in the script data state.
                     _ => {
-                        self.emit_character_token('<');
-                        self.emit_character_token('/');
+                        self.emit_character_token(b'<');
+                        self.emit_character_token(b'/');
                         self.reconsume_in_state(State::ScriptData);
                     }
                 }
@@ -1714,8 +1710,8 @@ where
             State::ScriptDataEndTagName => {
                 let anything_else = |lexer: &mut Lexer<'a, I>| {
                     lexer.finish_tag_token_name();
-                    lexer.emit_character_token('<');
-                    lexer.emit_character_token('/');
+                    lexer.emit_character_token(b'<');
+                    lexer.emit_character_token(b'/');
                     lexer.emit_temporary_buffer_as_character_tokens();
                     lexer.reconsume_in_state(State::ScriptData);
                 };
@@ -1743,7 +1739,7 @@ where
                     // If the current end tag token is an appropriate end tag token, then switch
                     // to the self-closing start tag state. Otherwise, treat it as per the
                     // "anything else" entry below.
-                    Some('/') => {
+                    Some(b'/') => {
                         if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
                             self.finish_tag_token_name();
                             self.state = State::SelfClosingStartTag;
@@ -1755,7 +1751,7 @@ where
                     // If the current end tag token is an appropriate end tag token, then switch
                     // to the data state and emit the current tag token. Otherwise, treat it as
                     // per the "anything else" entry below.
-                    Some('>') => {
+                    Some(b'>') => {
                         if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
                             self.finish_tag_token_name();
                             self.state = State::Data;
@@ -1769,19 +1765,17 @@ where
                     // to the character's code point) to the current tag token's tag name.
                     // Append the current input character to the temporary buffer.
                     Some(c) if is_ascii_upper_alpha(c) => {
-                        self.consume_and_append_to_attribute_token_name_and_temp_buf(
-                            c,
-                            is_ascii_upper_alpha,
-                        );
+                        self.consume_and_append_to_attribute_token_name_and_temp_buf(c, |ch| {
+                            is_ascii_upper_alpha(ch as u8)
+                        });
                     }
                     // ASCII lower alpha
                     // Append the current input character to the current tag token's tag name.
                     // Append the current input character to the temporary buffer.
                     Some(c) if is_ascii_lower_alpha(c) => {
-                        self.consume_and_append_to_attribute_token_name_and_temp_buf(
-                            c,
-                            is_ascii_lower_alpha,
-                        );
+                        self.consume_and_append_to_attribute_token_name_and_temp_buf(c, |ch| {
+                            is_ascii_lower_alpha(ch as u8)
+                        });
                     }
                     // Anything else
                     // Emit a U+003C LESS-THAN SIGN character token, a U+002F SOLIDUS character
@@ -1800,7 +1794,7 @@ where
                     // U+002D HYPHEN-MINUS (-)
                     // Switch to the script data escape start dash state. Emit a U+002D
                     // HYPHEN-MINUS character token.
-                    Some(c @ '-') => {
+                    Some(c @ b'-') => {
                         self.state = State::ScriptDataEscapeStartDash;
                         self.emit_character_token(c);
                     }
@@ -1818,7 +1812,7 @@ where
                     // U+002D HYPHEN-MINUS (-)
                     // Switch to the script data escaped dash dash state. Emit a U+002D
                     // HYPHEN-MINUS character token.
-                    Some(c @ '-') => {
+                    Some(c @ b'-') => {
                         self.state = State::ScriptDataEscapedDashDash;
                         self.emit_character_token(c);
                     }
@@ -1836,19 +1830,19 @@ where
                     // U+002D HYPHEN-MINUS (-)
                     // Switch to the script data escaped dash state. Emit a U+002D HYPHEN-MINUS
                     // character token.
-                    Some(c @ '-') => {
+                    Some(c @ b'-') => {
                         self.state = State::ScriptDataEscapedDash;
                         self.emit_character_token(c);
                     }
                     // U+003C LESS-THAN SIGN (<)
                     // Switch to the script data escaped less-than sign state.
-                    Some('<') => {
+                    Some(b'<') => {
                         self.state = State::ScriptDataEscapedLessThanSign;
                     }
                     // U+0000 NULL
                     // This is an unexpected-null-character parse error. Emit a U+FFFD
                     // REPLACEMENT CHARACTER character token.
-                    Some(c @ '\x00') => {
+                    Some(c @ b'\x00') => {
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
                         self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
                     }
@@ -1876,19 +1870,19 @@ where
                     // U+002D HYPHEN-MINUS (-)
                     // Switch to the script data escaped dash dash state. Emit a U+002D
                     // HYPHEN-MINUS character token.
-                    Some(c @ '-') => {
+                    Some(c @ b'-') => {
                         self.state = State::ScriptDataEscapedDashDash;
                         self.emit_character_token(c);
                     }
                     // U+003C LESS-THAN SIGN (<)
                     // Switch to the script data escaped less-than sign state.
-                    Some('<') => {
+                    Some(b'<') => {
                         self.state = State::ScriptDataEscapedLessThanSign;
                     }
                     // U+0000 NULL
                     // This is an unexpected-null-character parse error. Switch to the script
                     // data escaped state. Emit a U+FFFD REPLACEMENT CHARACTER character token.
-                    Some(c @ '\x00') => {
+                    Some(c @ b'\x00') => {
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
                         self.state = State::ScriptDataEscaped;
                         self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
@@ -1918,25 +1912,25 @@ where
                 match self.consume_next_char() {
                     // U+002D HYPHEN-MINUS (-)
                     // Emit a U+002D HYPHEN-MINUS character token.
-                    Some(c @ '-') => {
+                    Some(c @ b'-') => {
                         self.emit_character_token(c);
                     }
                     // U+003C LESS-THAN SIGN (<)
                     // Switch to the script data escaped less-than sign state.
-                    Some('<') => {
+                    Some(b'<') => {
                         self.state = State::ScriptDataEscapedLessThanSign;
                     }
                     // U+003E GREATER-THAN SIGN (>)
                     // Switch to the script data state. Emit a U+003E GREATER-THAN SIGN
                     // character token.
-                    Some(c @ '>') => {
+                    Some(c @ b'>') => {
                         self.state = State::ScriptData;
                         self.emit_character_token(c);
                     }
                     // U+0000 NULL
                     // This is an unexpected-null-character parse error. Switch to the script
                     // data escaped state. Emit a U+FFFD REPLACEMENT CHARACTER character token.
-                    Some(c @ '\x00') => {
+                    Some(c @ b'\x00') => {
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
                         self.state = State::ScriptDataEscaped;
                         self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
@@ -1967,7 +1961,7 @@ where
                     // U+002F SOLIDUS (/)
                     // Set the temporary buffer to the empty string. Switch to the script data
                     // escaped end tag open state.
-                    Some('/') => {
+                    Some(b'/') => {
                         self.temporary_buffer.clear();
                         self.state = State::ScriptDataEscapedEndTagOpen;
                     }
@@ -1977,14 +1971,14 @@ where
                     // state.
                     Some(c) if is_ascii_alpha(c) => {
                         self.temporary_buffer.clear();
-                        self.emit_character_token('<');
+                        self.emit_character_token(b'<');
                         self.reconsume_in_state(State::ScriptDataDoubleEscapeStart);
                     }
                     // Anything else
                     // Emit a U+003C LESS-THAN SIGN character token. Reconsume in the script
                     // data escaped state.
                     _ => {
-                        self.emit_character_token('<');
+                        self.emit_character_token(b'<');
                         self.reconsume_in_state(State::ScriptDataEscaped);
                     }
                 }
@@ -2004,8 +1998,8 @@ where
                     // Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS
                     // character token. Reconsume in the script data escaped state.
                     _ => {
-                        self.emit_character_token('<');
-                        self.emit_character_token('/');
+                        self.emit_character_token(b'<');
+                        self.emit_character_token(b'/');
                         self.reconsume_in_state(State::ScriptDataEscaped);
                     }
                 }
@@ -2014,8 +2008,8 @@ where
             State::ScriptDataEscapedEndTagName => {
                 let anything_else = |lexer: &mut Lexer<'a, I>| {
                     lexer.finish_tag_token_name();
-                    lexer.emit_character_token('<');
-                    lexer.emit_character_token('/');
+                    lexer.emit_character_token(b'<');
+                    lexer.emit_character_token(b'/');
                     lexer.emit_temporary_buffer_as_character_tokens();
                     lexer.reconsume_in_state(State::ScriptDataEscaped);
                 };
@@ -2043,7 +2037,7 @@ where
                     // If the current end tag token is an appropriate end tag token, then switch
                     // to the self-closing start tag state. Otherwise, treat it as per the
                     // "anything else" entry below.
-                    Some('/') => {
+                    Some(b'/') => {
                         if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
                             self.finish_tag_token_name();
                             self.state = State::SelfClosingStartTag;
@@ -2055,7 +2049,7 @@ where
                     // If the current end tag token is an appropriate end tag token, then switch
                     // to the data state and emit the current tag token. Otherwise, treat it as
                     // per the "anything else" entry below.
-                    Some('>') => {
+                    Some(b'>') => {
                         if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
                             self.finish_tag_token_name();
                             self.state = State::Data;
@@ -2069,19 +2063,17 @@ where
                     // to the character's code point) to the current tag token's tag name.
                     // Append the current input character to the temporary buffer.
                     Some(c) if is_ascii_upper_alpha(c) => {
-                        self.consume_and_append_to_attribute_token_name_and_temp_buf(
-                            c,
-                            is_ascii_upper_alpha,
-                        );
+                        self.consume_and_append_to_attribute_token_name_and_temp_buf(c, |ch| {
+                            is_ascii_upper_alpha(ch as u8)
+                        });
                     }
                     // ASCII lower alpha
                     // Append the current input character to the current tag token's tag name.
                     // Append the current input character to the temporary buffer.
                     Some(c) if is_ascii_lower_alpha(c) => {
-                        self.consume_and_append_to_attribute_token_name_and_temp_buf(
-                            c,
-                            is_ascii_lower_alpha,
-                        );
+                        self.consume_and_append_to_attribute_token_name_and_temp_buf(c, |ch| {
+                            is_ascii_lower_alpha(ch as u8)
+                        });
                     }
                     // Anything else
                     // Emit a U+003C LESS-THAN SIGN character token, a U+002F SOLIDUS character
@@ -2117,7 +2109,7 @@ where
 
                         self.handle_raw_and_emit_character_token(c);
                     }
-                    Some(c @ '/' | c @ '>') => {
+                    Some(c @ b'/' | c @ b'>') => {
                         let is_script = self.temporary_buffer == "script";
 
                         if is_script {
@@ -2133,14 +2125,14 @@ where
                     // to the character's code point) to the temporary buffer. Emit the current
                     // input character as a character token.
                     Some(c) if is_ascii_upper_alpha(c) => {
-                        self.temporary_buffer.push(c.to_ascii_lowercase());
+                        self.temporary_buffer.push(c.to_ascii_lowercase() as char);
                         self.emit_character_token(c);
                     }
                     // ASCII lower alpha
                     // Append the current input character to the temporary buffer. Emit the
                     // current input character as a character token.
                     Some(c) if is_ascii_lower_alpha(c) => {
-                        self.temporary_buffer.push(c);
+                        self.temporary_buffer.push(c as char);
                         self.emit_character_token(c);
                     }
                     // Anything else
@@ -2157,21 +2149,21 @@ where
                     // U+002D HYPHEN-MINUS (-)
                     // Switch to the script data double escaped dash state. Emit a U+002D
                     // HYPHEN-MINUS character token.
-                    Some(c @ '-') => {
+                    Some(c @ b'-') => {
                         self.state = State::ScriptDataDoubleEscapedDash;
                         self.emit_character_token(c);
                     }
                     // U+003C LESS-THAN SIGN (<)
                     // Switch to the script data double escaped less-than sign state. Emit a
                     // U+003C LESS-THAN SIGN character token.
-                    Some(c @ '<') => {
+                    Some(c @ b'<') => {
                         self.state = State::ScriptDataDoubleEscapedLessThanSign;
                         self.emit_character_token(c);
                     }
                     // U+0000 NULL
                     // This is an unexpected-null-character parse error. Emit a U+FFFD
                     // REPLACEMENT CHARACTER character token.
-                    Some(c @ '\x00') => {
+                    Some(c @ b'\x00') => {
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
                         self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
                     }
@@ -2199,14 +2191,14 @@ where
                     // U+002D HYPHEN-MINUS (-)
                     // Switch to the script data double escaped dash dash state. Emit a U+002D
                     // HYPHEN-MINUS character token.
-                    Some(c @ '-') => {
+                    Some(c @ b'-') => {
                         self.state = State::ScriptDataDoubleEscapedDashDash;
                         self.emit_character_token(c);
                     }
                     // U+003C LESS-THAN SIGN (<)
                     // Switch to the script data double escaped less-than sign state. Emit a
                     // U+003C LESS-THAN SIGN character token.
-                    Some(c @ '<') => {
+                    Some(c @ b'<') => {
                         self.state = State::ScriptDataDoubleEscapedLessThanSign;
                         self.emit_character_token(c);
                     }
@@ -2214,7 +2206,7 @@ where
                     // This is an unexpected-null-character parse error. Switch to the script
                     // data double escaped state. Emit a U+FFFD REPLACEMENT CHARACTER character
                     // token.
-                    Some(c @ '\x00') => {
+                    Some(c @ b'\x00') => {
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
                         self.state = State::ScriptDataDoubleEscaped;
                         self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
@@ -2244,20 +2236,20 @@ where
                 match self.consume_next_char() {
                     // U+002D HYPHEN-MINUS (-)
                     // Emit a U+002D HYPHEN-MINUS character token.
-                    Some(c @ '-') => {
+                    Some(c @ b'-') => {
                         self.emit_character_token(c);
                     }
                     // U+003C LESS-THAN SIGN (<)
                     // Switch to the script data double escaped less-than sign state. Emit a
                     // U+003C LESS-THAN SIGN character token.
-                    Some(c @ '<') => {
+                    Some(c @ b'<') => {
                         self.state = State::ScriptDataDoubleEscapedLessThanSign;
                         self.emit_character_token(c);
                     }
                     // U+003E GREATER-THAN SIGN (>)
                     // Switch to the script data state. Emit a U+003E GREATER-THAN SIGN
                     // character token.
-                    Some(c @ '>') => {
+                    Some(c @ b'>') => {
                         self.state = State::ScriptData;
                         self.emit_character_token(c);
                     }
@@ -2265,7 +2257,7 @@ where
                     // This is an unexpected-null-character parse error. Switch to the script
                     // data double escaped state. Emit a U+FFFD REPLACEMENT CHARACTER character
                     // token.
-                    Some(c @ '\x00') => {
+                    Some(c @ b'\x00') => {
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
                         self.state = State::ScriptDataDoubleEscaped;
                         self.emit_character_token_with_raw(REPLACEMENT_CHARACTER, c);
@@ -2296,7 +2288,7 @@ where
                     // U+002F SOLIDUS (/)
                     // Set the temporary buffer to the empty string. Switch to the script data
                     // double escape end state. Emit a U+002F SOLIDUS character token.
-                    Some(c @ '/') => {
+                    Some(c @ b'/') => {
                         self.temporary_buffer.clear();
                         self.state = State::ScriptDataDoubleEscapeEnd;
                         self.emit_character_token(c);
@@ -2332,7 +2324,7 @@ where
 
                         self.handle_raw_and_emit_character_token(c);
                     }
-                    Some(c @ '/' | c @ '>') => {
+                    Some(c @ b'/' | c @ b'>') => {
                         let is_script = self.temporary_buffer == "script";
 
                         if is_script {
@@ -2348,14 +2340,14 @@ where
                     // to the character's code point) to the temporary buffer. Emit the current
                     // input character as a character token.
                     Some(c) if is_ascii_upper_alpha(c) => {
-                        self.temporary_buffer.push(c.to_ascii_lowercase());
+                        self.temporary_buffer.push(c.to_ascii_lowercase() as char);
                         self.emit_character_token(c);
                     }
                     // ASCII lower alpha
                     // Append the current input character to the temporary buffer. Emit the
                     // current input character as a character token.
                     Some(c) if is_ascii_lower_alpha(c) => {
-                        self.temporary_buffer.push(c);
+                        self.temporary_buffer.push(c as char);
 
                         self.emit_character_token(c);
                     }
@@ -2382,7 +2374,7 @@ where
                     // U+003E GREATER-THAN SIGN (>)
                     // EOF
                     // Reconsume in the after attribute name state.
-                    Some('/') | Some('>') | None => {
+                    Some(b'/') | Some(b'>') | None => {
                         self.reconsume_in_state(State::AfterAttributeName);
                     }
                     // U+003D EQUALS SIGN (=)
@@ -2391,7 +2383,7 @@ where
                     // to the current input character, and its value to the empty string. Switch
                     // to the attribute name state.
                     // We set `None` for `value` to support boolean attributes in AST
-                    Some(c @ '=') => {
+                    Some(c @ b'=') => {
                         self.emit_error(ErrorKind::UnexpectedEqualsSignBeforeAttributeName);
                         self.start_new_attribute_token();
                         self.append_to_attribute_token_name(c, c);
@@ -2409,7 +2401,7 @@ where
             }
             // https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
             State::AttributeName => {
-                let anything_else = |lexer: &mut Lexer<'a, I>, c: char| {
+                let anything_else = |lexer: &mut Lexer<'a, I>, c: u8| {
                     lexer.append_to_attribute_token_name(c, c);
                 };
 
@@ -2428,13 +2420,13 @@ where
                         self.skip_whitespaces(c);
                         self.reconsume_in_state(State::AfterAttributeName);
                     }
-                    Some('/' | '>') | None => {
+                    Some(b'/' | b'>') | None => {
                         self.finish_attribute_token_name();
                         self.reconsume_in_state(State::AfterAttributeName);
                     }
                     // U+003D EQUALS SIGN (=)
                     // Switch to the before attribute value state.
-                    Some('=') => {
+                    Some(b'=') => {
                         self.finish_attribute_token_name();
                         self.state = State::BeforeAttributeValue;
                     }
@@ -2442,23 +2434,23 @@ where
                     // Append the lowercase version of the current input character (add 0x0020
                     // to the character's code point) to the current attribute's name.
                     Some(c) if is_ascii_upper_alpha(c) => {
-                        self.consume_and_append_to_attribute_token_name(c, |c| {
-                            is_ascii_upper_alpha(c)
+                        self.consume_and_append_to_attribute_token_name(c, |ch| {
+                            is_ascii_upper_alpha(ch as u8)
                         });
                     }
                     // U+0000 NULL
                     // This is an unexpected-null-character parse error. Append a U+FFFD
                     // REPLACEMENT CHARACTER character to the current attribute's name.
-                    Some(c @ '\x00') => {
+                    Some(c @ b'\x00') => {
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
-                        self.append_to_attribute_token_name(REPLACEMENT_CHARACTER, c);
+                        self.append_to_attribute_token_name(REPLACEMENT_CHARACTER as u8, c);
                     }
                     // U+0022 QUOTATION MARK (")
                     // U+0027 APOSTROPHE (')
                     // U+003C LESS-THAN SIGN (<)
                     // This is an unexpected-character-in-attribute-name parse error. Treat it
                     // as per the "anything else" entry below.
-                    Some(c @ '"') | Some(c @ '\'') | Some(c @ '<') => {
+                    Some(c @ b'"') | Some(c @ b'\'') | Some(c @ b'<') => {
                         self.emit_error(ErrorKind::UnexpectedCharacterInAttributeName);
 
                         anything_else(self, c);
@@ -2467,16 +2459,16 @@ where
                     // Append the current input character to the current attribute's name.
                     Some(c) => {
                         self.validate_input_stream_character(c);
-                        self.consume_and_append_to_attribute_token_name(c, |c| {
-                            if !is_allowed_character(c) {
+                        self.consume_and_append_to_attribute_token_name(c, |ch| {
+                            if !is_allowed_character(ch) {
                                 return false;
                             }
 
                             // List of characters from above to stop consumption and a certain
                             // branch took control
-                            !is_spacy(c)
-                                && !matches!(c, '/' | '>' | '=' | '\x00' | '"' | '\'' | '<')
-                                && !is_ascii_upper_alpha(c)
+                            !is_spacy_char(ch)
+                                && !matches!(ch, '/' | '>' | '=' | '\x00' | '"' | '\'' | '<')
+                                && !is_ascii_upper_alpha_char(ch)
                         });
                     }
                 }
@@ -2505,17 +2497,17 @@ where
                     }
                     // U+002F SOLIDUS (/)
                     // Switch to the self-closing start tag state.
-                    Some('/') => {
+                    Some(b'/') => {
                         self.state = State::SelfClosingStartTag;
                     }
                     // U+003D EQUALS SIGN (=)
                     // Switch to the before attribute value state.
-                    Some('=') => {
+                    Some(b'=') => {
                         self.state = State::BeforeAttributeValue;
                     }
                     // U+003E GREATER-THAN SIGN (>)
                     // Switch to the data state. Emit the current tag token.
-                    Some('>') => {
+                    Some(b'>') => {
                         self.state = State::Data;
                         self.emit_tag_token();
                     }
@@ -2551,20 +2543,20 @@ where
                     }
                     // U+0022 QUOTATION MARK (")
                     // Switch to the attribute value (double-quoted) state.
-                    Some(c @ '"') => {
-                        self.append_to_attribute_token_value(None, Some(c));
+                    Some(c @ b'"') => {
+                        self.append_to_attribute_token_value(None, Some(c as char));
                         self.state = State::AttributeValueDoubleQuoted;
                     }
                     // U+0027 APOSTROPHE (')
                     // Switch to the attribute value (single-quoted) state.
-                    Some(c @ '\'') => {
-                        self.append_to_attribute_token_value(None, Some(c));
+                    Some(c @ b'\'') => {
+                        self.append_to_attribute_token_value(None, Some(c as char));
                         self.state = State::AttributeValueSingleQuoted;
                     }
                     // U+003E GREATER-THAN SIGN (>)
                     // This is a missing-attribute-value parse error. Switch to the data state.
                     // Emit the current tag token.
-                    Some('>') => {
+                    Some(b'>') => {
                         self.emit_error(ErrorKind::MissingAttributeValue);
                         self.state = State::Data;
                         self.emit_tag_token();
@@ -2583,23 +2575,26 @@ where
                     // U+0022 QUOTATION MARK (")
                     // Switch to the after attribute value (quoted) state.
                     // We set value to support empty attributes (i.e. `attr=""`)
-                    Some(c @ '"') => {
-                        self.append_to_attribute_token_value(None, Some(c));
+                    Some(c @ b'"') => {
+                        self.append_to_attribute_token_value(None, Some(c as char));
                         self.state = State::AfterAttributeValueQuoted;
                     }
                     // U+0026 AMPERSAND (&)
                     // Set the return state to the attribute value (double-quoted) state. Switch
                     // to the character reference state.
-                    Some('&') => {
+                    Some(b'&') => {
                         self.return_state = State::AttributeValueDoubleQuoted;
                         self.state = State::CharacterReference;
                     }
                     // U+0000 NULL
                     // This is an unexpected-null-character parse error. Append a U+FFFD
                     // REPLACEMENT CHARACTER character to the current attribute's value.
-                    Some(c @ '\x00') => {
+                    Some(c @ b'\x00') => {
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
-                        self.append_to_attribute_token_value(Some(REPLACEMENT_CHARACTER), Some(c));
+                        self.append_to_attribute_token_value(
+                            Some(REPLACEMENT_CHARACTER),
+                            Some(c as char),
+                        );
                     }
                     // EOF
                     // This is an eof-in-tag parse error. Emit an end-of-file token.
@@ -2613,14 +2608,14 @@ where
                     // Append the current input character to the current attribute's value.
                     Some(c) => {
                         self.validate_input_stream_character(c);
-                        self.consume_and_append_to_attribute_token_value(c, |c| {
-                            if !is_allowed_character(c) {
+                        self.consume_and_append_to_attribute_token_value(c, |ch| {
+                            if !is_allowed_character(ch) {
                                 return false;
                             }
 
                             // List of characters from above to stop consumption and a certain
                             // branch took control, `\r` is in list because of newline normalization
-                            !matches!(c, '"' | '&' | '\x00' | '\r')
+                            !matches!(ch, '"' | '&' | '\x00' | '\r')
                         });
                     }
                 }
@@ -2632,23 +2627,26 @@ where
                     // U+0027 APOSTROPHE (')
                     // Switch to the after attribute value (quoted) state.
                     // We set value to support empty attributes (i.e. `attr=''`)
-                    Some(c @ '\'') => {
-                        self.append_to_attribute_token_value(None, Some(c));
+                    Some(c @ b'\'') => {
+                        self.append_to_attribute_token_value(None, Some(c as char));
                         self.state = State::AfterAttributeValueQuoted;
                     }
                     // U+0026 AMPERSAND (&)
                     // Set the return state to the attribute value (single-quoted) state. Switch
                     // to the character reference state.
-                    Some('&') => {
+                    Some(b'&') => {
                         self.return_state = State::AttributeValueSingleQuoted;
                         self.state = State::CharacterReference;
                     }
                     // U+0000 NULL
                     // This is an unexpected-null-character parse error. Append a U+FFFD
                     // REPLACEMENT CHARACTER character to the current attribute's value.
-                    Some(c @ '\x00') => {
+                    Some(c @ b'\x00') => {
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
-                        self.append_to_attribute_token_value(Some(REPLACEMENT_CHARACTER), Some(c));
+                        self.append_to_attribute_token_value(
+                            Some(REPLACEMENT_CHARACTER),
+                            Some(c as char),
+                        );
                     }
                     // EOF
                     // This is an eof-in-tag parse error. Emit an end-of-file token.
@@ -2662,14 +2660,14 @@ where
                     // Append the current input character to the current attribute's value.
                     Some(c) => {
                         self.validate_input_stream_character(c);
-                        self.consume_and_append_to_attribute_token_value(c, |c| {
-                            if !is_allowed_character(c) {
+                        self.consume_and_append_to_attribute_token_value(c, |ch| {
+                            if !is_allowed_character(ch) {
                                 return false;
                             }
 
                             // List of characters from above to stop consumption and a certain
                             // branch took control, `\r` is in list because of newline normalization
-                            !matches!(c, '\'' | '&' | '\x00' | '\r')
+                            !matches!(ch, '\'' | '&' | '\x00' | '\r')
                         });
                     }
                 }
@@ -2695,13 +2693,13 @@ where
                     // U+0026 AMPERSAND (&)
                     // Set the return state to the attribute value (unquoted) state. Switch to
                     // the character reference state.
-                    Some('&') => {
+                    Some(b'&') => {
                         self.return_state = State::AttributeValueUnquoted;
                         self.state = State::CharacterReference;
                     }
                     // U+003E GREATER-THAN SIGN (>)
                     // Switch to the data state. Emit the current tag token.
-                    Some('>') => {
+                    Some(b'>') => {
                         self.finish_attribute_token_value();
                         self.state = State::Data;
                         self.emit_tag_token();
@@ -2709,9 +2707,12 @@ where
                     // U+0000 NULL
                     // This is an unexpected-null-character parse error. Append a U+FFFD
                     // REPLACEMENT CHARACTER character to the current attribute's value.
-                    Some(c @ '\x00') => {
+                    Some(c @ b'\x00') => {
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
-                        self.append_to_attribute_token_value(Some(REPLACEMENT_CHARACTER), Some(c));
+                        self.append_to_attribute_token_value(
+                            Some(REPLACEMENT_CHARACTER),
+                            Some(c as char),
+                        );
                     }
                     // U+0022 QUOTATION MARK (")
                     // U+0027 APOSTROPHE (')
@@ -2720,11 +2721,11 @@ where
                     // U+0060 GRAVE ACCENT (`)
                     // This is an unexpected-character-in-unquoted-attribute-value parse error.
                     // Treat it as per the "anything else" entry below.
-                    Some(c @ '"') | Some(c @ '\'') | Some(c @ '<') | Some(c @ '=')
-                    | Some(c @ '`') => {
+                    Some(c @ b'"') | Some(c @ b'\'') | Some(c @ b'<') | Some(c @ b'=')
+                    | Some(c @ b'`') => {
                         self.emit_error(ErrorKind::UnexpectedCharacterInUnquotedAttributeValue);
 
-                        anything_else(self, c);
+                        anything_else(self, c as char);
                     }
                     // EOF
                     // This is an eof-in-tag parse error. Emit an end-of-file token.
@@ -2739,16 +2740,16 @@ where
                     // Append the current input character to the current attribute's value.
                     Some(c) => {
                         self.validate_input_stream_character(c);
-                        self.consume_and_append_to_attribute_token_value(c, |c| {
-                            if !is_allowed_character(c) {
+                        self.consume_and_append_to_attribute_token_value(c, |ch| {
+                            if !is_allowed_character(ch) {
                                 return false;
                             }
 
                             // List of characters from above to stop consumption and a certain
                             // branch took control, `\r` is in list because of newline normalization
-                            !is_spacy(c)
+                            !is_spacy_char(ch)
                                 && !matches!(
-                                    c,
+                                    ch,
                                     '&' | '>' | '\x00' | '"' | '\'' | '<' | '=' | '`' | '\r'
                                 )
                         });
@@ -2771,13 +2772,13 @@ where
                     }
                     // U+002F SOLIDUS (/)
                     // Switch to the self-closing start tag state.
-                    Some('/') => {
+                    Some(b'/') => {
                         self.finish_attribute_token_value();
                         self.state = State::SelfClosingStartTag;
                     }
                     // U+003E GREATER-THAN SIGN (>)
                     // Switch to the data state. Emit the current tag token.
-                    Some('>') => {
+                    Some(b'>') => {
                         self.finish_attribute_token_value();
                         self.state = State::Data;
                         self.emit_tag_token();
@@ -2808,7 +2809,7 @@ where
                     // U+003E GREATER-THAN SIGN (>)
                     // Set the self-closing flag of the current tag token. Switch to the data
                     // state. Emit the current tag token.
-                    Some('>') => {
+                    Some(b'>') => {
                         if let Some(
                             Token::StartTag {
                                 is_self_closing, ..
@@ -2847,7 +2848,7 @@ where
                 match self.consume_next_char() {
                     // U+003E GREATER-THAN SIGN (>)
                     // Switch to the data state. Emit the current comment token.
-                    Some('>') => {
+                    Some(b'>') => {
                         self.state = State::Data;
                         self.emit_comment_token(Some(">"));
                     }
@@ -2862,9 +2863,9 @@ where
                     // U+0000 NULL
                     // This is an unexpected-null-character parse error. Append a U+FFFD
                     // REPLACEMENT CHARACTER character to the comment token's data.
-                    Some(c @ '\x00') => {
+                    Some(c @ b'\x00') => {
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
-                        self.append_to_comment_token(REPLACEMENT_CHARACTER, c);
+                        self.append_to_comment_token(REPLACEMENT_CHARACTER, c as char);
                     }
                     // Anything else
                     // Append the current input character to the comment token's data.
@@ -2902,8 +2903,8 @@ where
                     // Two U+002D HYPHEN-MINUS characters (-)
                     // Consume those two characters, create a comment token whose data
                     // is the empty string, and switch to the comment start state.
-                    Some('-') => match self.consume_next_char() {
-                        Some('-') => {
+                    Some(b'-') => match self.consume_next_char() {
+                        Some(b'-') => {
                             self.create_comment_token("<!--");
                             self.state = State::CommentStart;
                         }
@@ -2913,13 +2914,13 @@ where
                     },
                     // ASCII case-insensitive match for the word "DOCTYPE"
                     // Consume those characters and switch to the DOCTYPE state.
-                    Some(d @ 'd' | d @ 'D') => match self.consume_next_char() {
-                        Some(o @ 'o' | o @ 'O') => match self.consume_next_char() {
-                            Some(c @ 'c' | c @ 'C') => match self.consume_next_char() {
-                                Some(t @ 't' | t @ 'T') => match self.consume_next_char() {
-                                    Some(y @ 'y' | y @ 'Y') => match self.consume_next_char() {
-                                        Some(p @ 'p' | p @ 'P') => match self.consume_next_char() {
-                                            Some(e @ 'e' | e @ 'E') => {
+                    Some(d @ b'd' | d @ b'D') => match self.consume_next_char() {
+                        Some(o @ b'o' | o @ b'O') => match self.consume_next_char() {
+                            Some(c @ b'c' | c @ b'C') => match self.consume_next_char() {
+                                Some(t @ b't' | t @ b'T') => match self.consume_next_char() {
+                                    Some(y @ b'y' | y @ b'Y') => match self.consume_next_char() {
+                                        Some(p @ b'p' | p @ b'P') => match self.consume_next_char() {
+                                            Some(e @ b'e' | e @ b'E') => {
                                                 self.state = State::Doctype;
 
                                                 let b = self.sub_buf.clone();
@@ -2927,13 +2928,13 @@ where
 
                                                 sub_buf.push('<');
                                                 sub_buf.push('!');
-                                                sub_buf.push(d);
-                                                sub_buf.push(o);
-                                                sub_buf.push(c);
-                                                sub_buf.push(t);
-                                                sub_buf.push(y);
-                                                sub_buf.push(p);
-                                                sub_buf.push(e);
+                                                sub_buf.push(d as char);
+                                                sub_buf.push(o as char);
+                                                sub_buf.push(c as char);
+                                                sub_buf.push(t as char);
+                                                sub_buf.push(y as char);
+                                                sub_buf.push(p as char);
+                                                sub_buf.push(e as char);
                                             }
                                             _ => {
                                                 anything_else(self);
@@ -2966,13 +2967,13 @@ where
                     // section state. Otherwise, this is a cdata-in-html-content parse
                     // error. Create a comment token whose data is the "[CDATA[" string.
                     // Switch to the bogus comment state.
-                    Some('[') => match self.consume_next_char() {
-                        Some('C') => match self.consume_next_char() {
-                            Some('D') => match self.consume_next_char() {
-                                Some('A') => match self.consume_next_char() {
-                                    Some('T') => match self.consume_next_char() {
-                                        Some('A') => match self.consume_next_char() {
-                                            Some('[') => {
+                    Some(b'[') => match self.consume_next_char() {
+                        Some(b'C') => match self.consume_next_char() {
+                            Some(b'D') => match self.consume_next_char() {
+                                Some(b'A') => match self.consume_next_char() {
+                                    Some(b'T') => match self.consume_next_char() {
+                                        Some(b'A') => match self.consume_next_char() {
+                                            Some(b'[') => {
                                                 if let Some(false) = self.is_adjusted_current_node_is_element_in_html_namespace {
                                                     self.state = State::CdataSection;
                                                 } else {
@@ -3023,13 +3024,13 @@ where
                 match self.consume_next_char() {
                     // U+002D HYPHEN-MINUS (-)
                     // Switch to the comment start dash state.
-                    Some('-') => {
+                    Some(b'-') => {
                         self.state = State::CommentStartDash;
                     }
                     // U+003E GREATER-THAN SIGN (>)
                     // This is an abrupt-closing-of-empty-comment parse error. Switch to the
                     // data state. Emit the current comment token.
-                    Some('>') => {
+                    Some(b'>') => {
                         self.emit_error(ErrorKind::AbruptClosingOfEmptyComment);
                         self.state = State::Data;
                         self.emit_comment_token(Some(">"));
@@ -3047,13 +3048,13 @@ where
                 match self.consume_next_char() {
                     // U+002D HYPHEN-MINUS (-)
                     // Switch to the comment end state.
-                    Some('-') => {
+                    Some(b'-') => {
                         self.state = State::CommentEnd;
                     }
                     // U+003E GREATER-THAN SIGN (>)
                     // This is an abrupt-closing-of-empty-comment parse error. Switch to the
                     // data state. Emit the current comment token.
-                    Some('>') => {
+                    Some(b'>') => {
                         self.emit_error(ErrorKind::AbruptClosingOfEmptyComment);
                         self.state = State::Data;
                         self.emit_comment_token(Some("->"));
@@ -3084,21 +3085,21 @@ where
                     // U+003C LESS-THAN SIGN (<)
                     // Append the current input character to the comment token's data. Switch to
                     // the comment less-than sign state.
-                    Some(c @ '<') => {
-                        self.append_to_comment_token(c, c);
+                    Some(c @ b'<') => {
+                        self.append_to_comment_token(c as char, c as char);
                         self.state = State::CommentLessThanSign;
                     }
                     // U+002D HYPHEN-MINUS (-)
                     // Switch to the comment end dash state.
-                    Some('-') => {
+                    Some(b'-') => {
                         self.state = State::CommentEndDash;
                     }
                     // U+0000 NULL
                     // This is an unexpected-null-character parse error. Append a U+FFFD
                     // REPLACEMENT CHARACTER character to the comment token's data.
-                    Some(c @ '\x00') => {
+                    Some(c @ b'\x00') => {
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
-                        self.append_to_comment_token(REPLACEMENT_CHARACTER, c);
+                        self.append_to_comment_token(REPLACEMENT_CHARACTER, c as char);
                     }
                     // EOF
                     // This is an eof-in-comment parse error. Emit the current comment token.
@@ -3133,14 +3134,14 @@ where
                     // U+0021 EXCLAMATION MARK (!)
                     // Append the current input character to the comment token's data. Switch to
                     // the comment less-than sign bang state.
-                    Some(c @ '!') => {
-                        self.append_to_comment_token(c, c);
+                    Some(c @ b'!') => {
+                        self.append_to_comment_token(c as char, c as char);
                         self.state = State::CommentLessThanSignBang;
                     }
                     // U+003C LESS-THAN SIGN (<)
                     // Append the current input character to the comment token's data.
-                    Some(c @ '<') => {
-                        self.append_to_comment_token(c, c);
+                    Some(c @ b'<') => {
+                        self.append_to_comment_token(c as char, c as char);
                     }
                     // Anything else
                     // Reconsume in the comment state.
@@ -3155,7 +3156,7 @@ where
                 match self.consume_next_char() {
                     // U+002D HYPHEN-MINUS (-)
                     // Switch to the comment less-than sign bang dash state.
-                    Some('-') => {
+                    Some(b'-') => {
                         self.state = State::CommentLessThanSignBangDash;
                     }
                     // Anything else
@@ -3171,7 +3172,7 @@ where
                 match self.consume_next_char() {
                     // U+002D HYPHEN-MINUS (-)
                     // Switch to the comment less-than sign bang dash dash state.
-                    Some('-') => {
+                    Some(b'-') => {
                         self.state = State::CommentLessThanSignBangDashDash;
                     }
                     // Anything else
@@ -3188,7 +3189,7 @@ where
                     // U+003E GREATER-THAN SIGN (>)
                     // EOF
                     // Reconsume in the comment end state.
-                    Some('>') | None => {
+                    Some(b'>') | None => {
                         self.reconsume_in_state(State::CommentEnd);
                     }
                     // Anything else
@@ -3205,7 +3206,7 @@ where
                 match self.consume_next_char() {
                     // U+002D HYPHEN-MINUS (-)
                     // Switch to the comment end state.
-                    Some('-') => {
+                    Some(b'-') => {
                         self.state = State::CommentEnd;
                     }
                     // EOF
@@ -3233,19 +3234,19 @@ where
                 match self.consume_next_char() {
                     // U+003E GREATER-THAN SIGN (>)
                     // Switch to the data state. Emit the current comment token.
-                    Some('>') => {
+                    Some(b'>') => {
                         self.state = State::Data;
                         self.emit_comment_token(Some("-->"));
                     }
                     // U+0021 EXCLAMATION MARK (!)
                     // Switch to the comment end bang state.
-                    Some('!') => {
+                    Some(b'!') => {
                         self.state = State::CommentEndBang;
                     }
                     // U+002D HYPHEN-MINUS (-)
                     // Append a U+002D HYPHEN-MINUS character (-) to the comment token's data.
-                    Some(c @ '-') => {
-                        self.append_to_comment_token(c, c);
+                    Some(c @ b'-') => {
+                        self.append_to_comment_token(c as char, c as char);
                     }
                     // EOF
                     // This is an eof-in-comment parse error. Emit the current comment token.
@@ -3275,16 +3276,16 @@ where
                     // Append two U+002D HYPHEN-MINUS characters (-) and a U+0021 EXCLAMATION
                     // MARK character (!) to the comment token's data. Switch to the comment end
                     // dash state.
-                    Some(c @ '-') => {
-                        self.append_to_comment_token(c, c);
-                        self.append_to_comment_token('-', '-');
+                    Some(c @ b'-') => {
+                        self.append_to_comment_token(c as char, c as char);
+                        self.append_to_comment_token(c as char, c as char);
                         self.append_to_comment_token('!', '!');
                         self.state = State::CommentEndDash;
                     }
                     // U+003E GREATER-THAN SIGN (>)
                     // This is an incorrectly-closed-comment parse error. Switch to the data
                     // state. Emit the current comment token.
-                    Some('>') => {
+                    Some(b'>') => {
                         self.emit_error(ErrorKind::IncorrectlyClosedComment);
                         self.state = State::Data;
                         self.emit_comment_token(Some(">"));
@@ -3326,7 +3327,7 @@ where
                     }
                     // U+003E GREATER-THAN SIGN (>)
                     // Reconsume in the before DOCTYPE name state.
-                    Some('>') => {
+                    Some(b'>') => {
                         self.reconsume_in_state(State::BeforeDoctypeName);
                     }
                     // EOF
@@ -3370,14 +3371,14 @@ where
                     Some(c) if is_ascii_upper_alpha(c) => {
                         self.append_raw_to_doctype_token(c);
                         self.create_doctype_token();
-                        self.set_doctype_token_name(c.to_ascii_lowercase());
+                        self.set_doctype_token_name(c.to_ascii_lowercase() as char);
                         self.state = State::DoctypeName;
                     }
                     // U+0000 NULL
                     // This is an unexpected-null-character parse error. Create a new DOCTYPE
                     // token. Set the token's name to a U+FFFD REPLACEMENT CHARACTER character.
                     // Switch to the DOCTYPE name state.
-                    Some(c @ '\x00') => {
+                    Some(c @ b'\x00') => {
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
                         self.create_doctype_token();
@@ -3388,7 +3389,7 @@ where
                     // This is a missing-doctype-name parse error. Create a new DOCTYPE token.
                     // Set its force-quirks flag to on. Switch to the data state. Emit the
                     // current token.
-                    Some(c @ '>') => {
+                    Some(c @ b'>') => {
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(ErrorKind::MissingDoctypeName);
                         self.create_doctype_token();
@@ -3416,7 +3417,7 @@ where
                         self.validate_input_stream_character(c);
                         self.append_raw_to_doctype_token(c);
                         self.create_doctype_token();
-                        self.set_doctype_token_name(c);
+                        self.set_doctype_token_name(c as char);
                         self.state = State::DoctypeName;
                     }
                 }
@@ -3437,7 +3438,7 @@ where
                     }
                     // U+003E GREATER-THAN SIGN (>)
                     // Switch to the data state. Emit the current DOCTYPE token.
-                    Some(c @ '>') => {
+                    Some(c @ b'>') => {
                         self.append_raw_to_doctype_token(c);
                         self.finish_doctype_token_name();
                         self.state = State::Data;
@@ -3447,15 +3448,15 @@ where
                     // Append the lowercase version of the current input character (add 0x0020
                     // to the character's code point) to the current DOCTYPE token's name.
                     Some(c) if is_ascii_upper_alpha(c) => {
-                        self.consume_and_append_to_doctype_token_name(c, is_ascii_upper_alpha);
+                        self.consume_and_append_to_doctype_token_name(c, is_ascii_upper_alpha_char);
                     }
                     // U+0000 NULL
                     // This is an unexpected-null-character parse error. Append a U+FFFD
                     // REPLACEMENT CHARACTER character to the current DOCTYPE token's name.
-                    Some(c @ '\x00') => {
+                    Some(c @ b'\x00') => {
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
-                        self.append_to_doctype_token(Some(REPLACEMENT_CHARACTER), None, None);
+                        self.append_to_doctype_token(Some(REPLACEMENT_CHARACTER as u8), None, None);
                     }
                     // EOF
                     // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
@@ -3474,12 +3475,14 @@ where
                     // Append the current input character to the current DOCTYPE token's name.
                     Some(c) => {
                         self.validate_input_stream_character(c);
-                        self.consume_and_append_to_doctype_token_name(c, |c| {
-                            if !is_allowed_character(c) {
+                        self.consume_and_append_to_doctype_token_name(c, |ch| {
+                            if !is_allowed_character(ch) {
                                 return false;
                             }
 
-                            !is_spacy(c) && !matches!(c, '>' | '\x00') && !is_ascii_upper_alpha(c)
+                            !is_spacy_char(ch)
+                                && !matches!(ch, '>' | '\x00')
+                                && !is_ascii_upper_alpha_char(ch)
                         });
                     }
                 }
@@ -3500,7 +3503,7 @@ where
                     }
                     // U+003E GREATER-THAN SIGN (>)
                     // Switch to the data state. Emit the current DOCTYPE token.
-                    Some(c @ '>') => {
+                    Some(c @ b'>') => {
                         self.append_raw_to_doctype_token(c);
                         self.state = State::Data;
                         self.emit_doctype_token();
@@ -3534,12 +3537,12 @@ where
                         let b = self.buf.clone();
                         let mut buf = b.borrow_mut();
 
-                        buf.push(c);
+                        buf.push(c as char);
 
                         for _ in 0..5 {
                             match self.consume_next_char() {
                                 Some(c) => {
-                                    buf.push(c);
+                                    buf.push(c as char);
                                 }
                                 _ => {
                                     break;
@@ -3602,7 +3605,7 @@ where
                     // Set the current DOCTYPE token's public identifier to the empty string
                     // (not missing), then switch to the DOCTYPE public identifier
                     // (double-quoted) state.
-                    Some(c @ '"') => {
+                    Some(c @ b'"') => {
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypePublicKeyword);
                         self.set_doctype_token_public_id();
@@ -3613,7 +3616,7 @@ where
                     // Set the current DOCTYPE token's public identifier to the empty string
                     // (not missing), then switch to the DOCTYPE public identifier
                     // (single-quoted) state.
-                    Some(c @ '\'') => {
+                    Some(c @ b'\'') => {
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypePublicKeyword);
                         self.set_doctype_token_public_id();
@@ -3623,7 +3626,7 @@ where
                     // This is a missing-doctype-public-identifier parse error. Set the current
                     // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
                     // the current DOCTYPE token.
-                    Some(c @ '>') => {
+                    Some(c @ b'>') => {
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(ErrorKind::MissingDoctypePublicIdentifier);
                         self.set_doctype_token_force_quirks();
@@ -3669,7 +3672,7 @@ where
                     // Set the current DOCTYPE token's public identifier to the empty string
                     // (not missing), then switch to the DOCTYPE public identifier
                     // (double-quoted) state.
-                    Some(c @ '"') => {
+                    Some(c @ b'"') => {
                         self.append_raw_to_doctype_token(c);
                         self.set_doctype_token_public_id();
                         self.state = State::DoctypePublicIdentifierDoubleQuoted;
@@ -3678,7 +3681,7 @@ where
                     // Set the current DOCTYPE token's public identifier to the empty string
                     // (not missing), then switch to the DOCTYPE public identifier
                     // (single-quoted) state.
-                    Some(c @ '\'') => {
+                    Some(c @ b'\'') => {
                         self.append_raw_to_doctype_token(c);
                         self.set_doctype_token_public_id();
                         self.state = State::DoctypePublicIdentifierSingleQuoted;
@@ -3687,7 +3690,7 @@ where
                     // This is a missing-doctype-public-identifier parse error. Set the current
                     // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
                     // the current DOCTYPE token.
-                    Some(c @ '>') => {
+                    Some(c @ b'>') => {
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(ErrorKind::MissingDoctypePublicIdentifier);
                         self.set_doctype_token_force_quirks();
@@ -3723,7 +3726,7 @@ where
                 match self.consume_next_char() {
                     // U+0022 QUOTATION MARK (")
                     // Switch to the after DOCTYPE public identifier state.
-                    Some(c @ '"') => {
+                    Some(c @ b'"') => {
                         self.append_raw_to_doctype_token(c);
                         self.finish_doctype_token_public_id();
                         self.state = State::AfterDoctypePublicIdentifier;
@@ -3732,16 +3735,16 @@ where
                     // This is an unexpected-null-character parse error. Append a U+FFFD
                     // REPLACEMENT CHARACTER character to the current DOCTYPE token's public
                     // identifier.
-                    Some(c @ '\x00') => {
+                    Some(c @ b'\x00') => {
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
-                        self.append_to_doctype_token(None, Some(REPLACEMENT_CHARACTER), None);
+                        self.append_to_doctype_token(None, Some(REPLACEMENT_CHARACTER as u8), None);
                     }
                     // U+003E GREATER-THAN SIGN (>)
                     // This is an abrupt-doctype-public-identifier parse error. Set the current
                     // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
                     // the current DOCTYPE token.
-                    Some(c @ '>') => {
+                    Some(c @ b'>') => {
                         self.append_raw_to_doctype_token(c);
                         self.finish_doctype_token_public_id();
                         self.emit_error(ErrorKind::AbruptDoctypePublicIdentifier);
@@ -3767,12 +3770,12 @@ where
                     // identifier.
                     Some(c) => {
                         self.validate_input_stream_character(c);
-                        self.consume_and_append_to_doctype_token_public_id(c, |c| {
-                            if !is_allowed_character(c) {
+                        self.consume_and_append_to_doctype_token_public_id(c, |ch| {
+                            if !is_allowed_character(ch) {
                                 return false;
                             }
 
-                            !matches!(c, '"' | '\x00' | '>' | '\r')
+                            !matches!(ch, '"' | '\x00' | '>' | '\r')
                         });
                     }
                 }
@@ -3783,7 +3786,7 @@ where
                 match self.consume_next_char() {
                     // U+0027 APOSTROPHE (')
                     // Switch to the after DOCTYPE public identifier state.
-                    Some(c @ '\'') => {
+                    Some(c @ b'\'') => {
                         self.finish_doctype_token_public_id();
                         self.append_raw_to_doctype_token(c);
                         self.state = State::AfterDoctypePublicIdentifier;
@@ -3792,16 +3795,16 @@ where
                     // This is an unexpected-null-character parse error. Append a U+FFFD
                     // REPLACEMENT CHARACTER character to the current DOCTYPE token's public
                     // identifier.
-                    Some(c @ '\x00') => {
+                    Some(c @ b'\x00') => {
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
-                        self.append_to_doctype_token(None, Some(REPLACEMENT_CHARACTER), None);
+                        self.append_to_doctype_token(None, Some(REPLACEMENT_CHARACTER as u8), None);
                     }
                     // U+003E GREATER-THAN SIGN (>)
                     // This is an abrupt-doctype-public-identifier parse error. Set the current
                     // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
                     // the current DOCTYPE token.
-                    Some(c @ '>') => {
+                    Some(c @ b'>') => {
                         self.finish_doctype_token_public_id();
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(ErrorKind::AbruptDoctypePublicIdentifier);
@@ -3827,12 +3830,12 @@ where
                     // identifier.
                     Some(c) => {
                         self.validate_input_stream_character(c);
-                        self.consume_and_append_to_doctype_token_public_id(c, |c| {
-                            if !is_allowed_character(c) {
+                        self.consume_and_append_to_doctype_token_public_id(c, |ch| {
+                            if !is_allowed_character(ch) {
                                 return false;
                             }
 
-                            !matches!(c, '\'' | '\x00' | '>' | '\r')
+                            !matches!(ch, '\'' | '\x00' | '>' | '\r')
                         });
                     }
                 }
@@ -3852,7 +3855,7 @@ where
                     }
                     // U+003E GREATER-THAN SIGN (>)
                     // Switch to the data state. Emit the current DOCTYPE token.
-                    Some(c @ '>') => {
+                    Some(c @ b'>') => {
                         self.append_raw_to_doctype_token(c);
                         self.state = State::Data;
                         self.emit_doctype_token();
@@ -3862,7 +3865,7 @@ where
                     // parse error. Set the current DOCTYPE token's system
                     // identifier to the empty string (not missing), then switch
                     // to the DOCTYPE system identifier (double-quoted) state.
-                    Some(c @ '"') => {
+                    Some(c @ b'"') => {
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(
                             ErrorKind::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers,
@@ -3875,7 +3878,7 @@ where
                     // parse error. Set the current DOCTYPE token's system
                     // identifier to the empty string (not missing), then switch
                     // to the DOCTYPE system identifier (single-quoted) state.
-                    Some(c @ '\'') => {
+                    Some(c @ b'\'') => {
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(
                             ErrorKind::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers,
@@ -3920,7 +3923,7 @@ where
                     }
                     // U+003E GREATER-THAN SIGN (>)
                     // Switch to the data state. Emit the current DOCTYPE token.
-                    Some(c @ '>') => {
+                    Some(c @ b'>') => {
                         self.append_raw_to_doctype_token(c);
                         self.state = State::Data;
                         self.emit_doctype_token();
@@ -3929,7 +3932,7 @@ where
                     // Set the current DOCTYPE token's system identifier to the empty string
                     // (not missing), then switch to the DOCTYPE system identifier
                     // (double-quoted) state.
-                    Some(c @ '"') => {
+                    Some(c @ b'"') => {
                         self.append_raw_to_doctype_token(c);
                         self.set_doctype_token_system_id();
                         self.state = State::DoctypeSystemIdentifierDoubleQuoted;
@@ -3938,7 +3941,7 @@ where
                     // Set the current DOCTYPE token's system identifier to the empty string
                     // (not missing), then switch to the DOCTYPE system identifier
                     // (single-quoted) state.
-                    Some(c @ '\'') => {
+                    Some(c @ b'\'') => {
                         self.append_raw_to_doctype_token(c);
                         self.set_doctype_token_system_id();
                         self.state = State::DoctypeSystemIdentifierSingleQuoted;
@@ -3984,7 +3987,7 @@ where
                     // Set the current DOCTYPE token's system identifier to the empty string
                     // (not missing), then switch to the DOCTYPE system identifier
                     // (double-quoted) state.
-                    Some(c @ '"') => {
+                    Some(c @ b'"') => {
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypeSystemKeyword);
                         self.set_doctype_token_system_id();
@@ -3995,7 +3998,7 @@ where
                     // Set the current DOCTYPE token's system identifier to the empty string
                     // (not missing), then switch to the DOCTYPE system identifier
                     // (single-quoted) state.
-                    Some(c @ '\'') => {
+                    Some(c @ b'\'') => {
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypeSystemKeyword);
                         self.set_doctype_token_system_id();
@@ -4005,7 +4008,7 @@ where
                     // This is a missing-doctype-system-identifier parse error. Set the current
                     // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
                     // the current DOCTYPE token.
-                    Some(c @ '>') => {
+                    Some(c @ b'>') => {
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(ErrorKind::MissingDoctypeSystemIdentifier);
                         self.set_doctype_token_force_quirks();
@@ -4051,7 +4054,7 @@ where
                     // Set the current DOCTYPE token's system identifier to the empty string
                     // (not missing), then switch to the DOCTYPE system identifier
                     // (double-quoted) state.
-                    Some(c @ '"') => {
+                    Some(c @ b'"') => {
                         self.append_raw_to_doctype_token(c);
                         self.set_doctype_token_system_id();
                         self.state = State::DoctypeSystemIdentifierDoubleQuoted;
@@ -4060,7 +4063,7 @@ where
                     // Set the current DOCTYPE token's system identifier to the empty string
                     // (not missing), then switch to the DOCTYPE system identifier
                     // (single-quoted) state.
-                    Some(c @ '\'') => {
+                    Some(c @ b'\'') => {
                         self.append_raw_to_doctype_token(c);
                         self.set_doctype_token_system_id();
                         self.state = State::DoctypeSystemIdentifierSingleQuoted;
@@ -4069,7 +4072,7 @@ where
                     // This is a missing-doctype-system-identifier parse error. Set the current
                     // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
                     // the current DOCTYPE token.
-                    Some(c @ '>') => {
+                    Some(c @ b'>') => {
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(ErrorKind::EofInDoctype);
                         self.set_doctype_token_force_quirks();
@@ -4105,7 +4108,7 @@ where
                 match self.consume_next_char() {
                     // U+0027 APOSTROPHE (')
                     // Switch to the after DOCTYPE system identifier state.
-                    Some(c @ '"') => {
+                    Some(c @ b'"') => {
                         self.finish_doctype_token_system_id();
                         self.append_raw_to_doctype_token(c);
                         self.state = State::AfterDoctypeSystemIdentifier;
@@ -4114,16 +4117,16 @@ where
                     // This is an unexpected-null-character parse error. Append a U+FFFD
                     // REPLACEMENT CHARACTER character to the current DOCTYPE token's system
                     // identifier.
-                    Some(c @ '\x00') => {
+                    Some(c @ b'\x00') => {
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
-                        self.append_to_doctype_token(None, None, Some(REPLACEMENT_CHARACTER));
+                        self.append_to_doctype_token(None, None, Some(REPLACEMENT_CHARACTER as u8));
                     }
                     // U+003E GREATER-THAN SIGN (>)
                     // This is an abrupt-doctype-system-identifier parse error. Set the current
                     // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
                     // the current DOCTYPE token.
-                    Some(c @ '>') => {
+                    Some(c @ b'>') => {
                         self.finish_doctype_token_system_id();
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(ErrorKind::AbruptDoctypeSystemIdentifier);
@@ -4149,12 +4152,12 @@ where
                     // identifier.
                     Some(c) => {
                         self.validate_input_stream_character(c);
-                        self.consume_and_append_to_doctype_token_system_id(c, |c| {
-                            if !is_allowed_character(c) {
+                        self.consume_and_append_to_doctype_token_system_id(c, |ch| {
+                            if !is_allowed_character(ch) {
                                 return false;
                             }
 
-                            !matches!(c, '"' | '\x00' | '>' | '\r')
+                            !matches!(ch, '"' | '\x00' | '>' | '\r')
                         });
                     }
                 }
@@ -4165,7 +4168,7 @@ where
                 match self.consume_next_char() {
                     // U+0027 APOSTROPHE (')
                     // Switch to the after DOCTYPE system identifier state.
-                    Some(c @ '\'') => {
+                    Some(c @ b'\'') => {
                         self.finish_doctype_token_system_id();
                         self.append_raw_to_doctype_token(c);
                         self.state = State::AfterDoctypeSystemIdentifier;
@@ -4174,16 +4177,16 @@ where
                     // This is an unexpected-null-character parse error. Append a U+FFFD
                     // REPLACEMENT CHARACTER character to the current DOCTYPE token's system
                     // identifier.
-                    Some(c @ '\x00') => {
+                    Some(c @ b'\x00') => {
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
-                        self.append_to_doctype_token(None, None, Some(REPLACEMENT_CHARACTER));
+                        self.append_to_doctype_token(None, None, Some(REPLACEMENT_CHARACTER as u8));
                     }
                     // U+003E GREATER-THAN SIGN (>)
                     // This is an abrupt-doctype-system-identifier parse error. Set the current
                     // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit
                     // the current DOCTYPE token.
-                    Some(c @ '>') => {
+                    Some(c @ b'>') => {
                         self.finish_doctype_token_system_id();
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(ErrorKind::AbruptDoctypeSystemIdentifier);
@@ -4209,12 +4212,12 @@ where
                     // identifier.
                     Some(c) => {
                         self.validate_input_stream_character(c);
-                        self.consume_and_append_to_doctype_token_system_id(c, |c| {
-                            if !is_allowed_character(c) {
+                        self.consume_and_append_to_doctype_token_system_id(c, |ch| {
+                            if !is_allowed_character(ch) {
                                 return false;
                             }
 
-                            !matches!(c, '\'' | '\x00' | '>' | '\r')
+                            !matches!(ch, '\'' | '\x00' | '>' | '\r')
                         });
                     }
                 }
@@ -4233,7 +4236,7 @@ where
                     }
                     // U+003E GREATER-THAN SIGN (>)
                     // Switch to the data state. Emit the current DOCTYPE token.
-                    Some(c @ '>') => {
+                    Some(c @ b'>') => {
                         self.append_raw_to_doctype_token(c);
                         self.state = State::Data;
                         self.emit_doctype_token();
@@ -4266,14 +4269,14 @@ where
                 match self.consume_next_char() {
                     // U+003E GREATER-THAN SIGN (>)
                     // Switch to the data state. Emit the DOCTYPE token.
-                    Some(c @ '>') => {
+                    Some(c @ b'>') => {
                         self.append_raw_to_doctype_token(c);
                         self.state = State::Data;
                         self.emit_doctype_token();
                     }
                     // U+0000 NULL
                     // This is an unexpected-null-character parse error. Ignore the character.
-                    Some(c @ '\x00') => {
+                    Some(c @ b'\x00') => {
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
                     }
@@ -4299,7 +4302,7 @@ where
                 match self.consume_next_char() {
                     // U+005D RIGHT SQUARE BRACKET (])
                     // Switch to the CDATA section bracket state.
-                    Some(']') => {
+                    Some(b']') => {
                         self.state = State::CdataSectionBracket;
                     }
                     // EOF
@@ -4324,14 +4327,14 @@ where
                 match self.consume_next_char() {
                     // U+005D RIGHT SQUARE BRACKET (])
                     // Switch to the CDATA section end state.
-                    Some(']') => {
+                    Some(b']') => {
                         self.state = State::CdataSectionEnd;
                     }
                     // Anything else
                     // Emit a U+005D RIGHT SQUARE BRACKET character token. Reconsume in the
                     // CDATA section state.
                     _ => {
-                        self.emit_character_token(']');
+                        self.emit_character_token(b']');
                         self.reconsume_in_state(State::CdataSection);
                     }
                 }
@@ -4342,20 +4345,20 @@ where
                 match self.consume_next_char() {
                     // U+005D RIGHT SQUARE BRACKET (])
                     // Emit a U+005D RIGHT SQUARE BRACKET character token.
-                    Some(c @ ']') => {
+                    Some(c @ b']') => {
                         self.emit_character_token_with_raw(']', c);
                     }
                     // U+003E GREATER-THAN SIGN character
                     // Switch to the data state.
-                    Some('>') => {
+                    Some(b'>') => {
                         self.state = State::Data;
                     }
                     // Anything else
                     // Emit two U+005D RIGHT SQUARE BRACKET character tokens. Reconsume in the
                     // CDATA section state.
                     _ => {
-                        self.emit_character_token(']');
-                        self.emit_character_token(']');
+                        self.emit_character_token(b']');
+                        self.emit_character_token(b']');
                         self.reconsume_in_state(State::CdataSection);
                     }
                 }
@@ -4377,8 +4380,8 @@ where
                     // U+0023 NUMBER SIGN (#)
                     // Append the current input character to the temporary buffer. Switch to the
                     // numeric character reference state.
-                    Some(c @ '#') => {
-                        self.temporary_buffer.push(c);
+                    Some(c @ b'#') => {
+                        self.temporary_buffer.push(c as char);
                         self.state = State::NumericCharacterReference;
                     }
                     // Anything else
@@ -4409,7 +4412,7 @@ where
 
                 // No need to validate input, because we reset position if nothing was found
                 while let Some(c) = &self.consume_next_char() {
-                    entity_temporary_buffer.push(*c);
+                    entity_temporary_buffer.push(*c as char);
 
                     if let Some(found_entity) = HTML_ENTITIES.get(&entity_temporary_buffer) {
                         entity = Some(found_entity);
@@ -4448,7 +4451,7 @@ where
                 match entity {
                     Some(entity) => {
                         let is_next_equals_sign_or_ascii_alphanumeric = match self.next() {
-                            Some('=') => true,
+                            Some(b'=') => true,
                             Some(c) if c.is_ascii_alphanumeric() => true,
                             _ => false,
                         };
@@ -4514,7 +4517,7 @@ where
                     // Otherwise, emit the current input character as a character token.
                     Some(c) if c.is_ascii_alphanumeric() => {
                         if self.is_consumed_as_part_of_an_attribute() {
-                            self.append_to_attribute_token_value(Some(c), Some(c));
+                            self.append_to_attribute_token_value(Some(c as char), Some(c as char));
                         } else {
                             self.emit_character_token(c);
                         }
@@ -4522,7 +4525,7 @@ where
                     // U+003B SEMICOLON (;)
                     // This is an unknown-named-character-reference parse error. Reconsume in
                     // the return state.
-                    Some(';') => {
+                    Some(b';') => {
                         self.emit_error(ErrorKind::UnknownNamedCharacterReference);
                         self.reconsume_in_state(self.return_state.clone());
                     }
@@ -4543,8 +4546,8 @@ where
                     // U+0058 LATIN CAPITAL LETTER X
                     // Append the current input character to the temporary buffer. Switch to the
                     // hexadecimal character reference start state.
-                    Some(c @ 'x' | c @ 'X') => {
-                        self.temporary_buffer.push(c);
+                    Some(c @ b'x' | c @ b'X') => {
+                        self.temporary_buffer.push(c as char);
                         self.state = State::HexademicalCharacterReferenceStart;
                     }
                     // Anything else
@@ -4604,7 +4607,7 @@ where
                     // to the character reference code.
                     Some(c) if c.is_ascii_digit() => match &mut self.character_reference_code {
                         Some(character_reference_code) => {
-                            character_reference_code.push((16, c as u32 - 0x30, Some(c)));
+                            character_reference_code.push((16, c as u32 - 0x30, Some(c as char)));
                         }
                         _ => {
                             unreachable!();
@@ -4616,7 +4619,7 @@ where
                     // character's code point) to the character reference code.
                     Some(c) if is_upper_hex_digit(c) => match &mut self.character_reference_code {
                         Some(character_reference_code) => {
-                            character_reference_code.push((16, c as u32 - 0x37, Some(c)));
+                            character_reference_code.push((16, c as u32 - 0x37, Some(c as char)));
                         }
                         _ => {
                             unreachable!();
@@ -4628,7 +4631,7 @@ where
                     // character's code point) to the character reference code.
                     Some(c) if is_lower_hex_digit(c) => match &mut self.character_reference_code {
                         Some(character_reference_code) => {
-                            character_reference_code.push((16, c as u32 - 0x57, Some(c)));
+                            character_reference_code.push((16, c as u32 - 0x57, Some(c as char)));
                         }
                         _ => {
                             unreachable!();
@@ -4636,7 +4639,7 @@ where
                     },
                     // U+003B SEMICOLON
                     // Switch to the numeric character reference end state.
-                    Some(';') => {
+                    Some(b';') => {
                         self.state = State::NumericCharacterReferenceEnd;
                     }
                     // Anything else
@@ -4658,7 +4661,7 @@ where
                     // to the character reference code.
                     Some(c) if c.is_ascii_digit() => match &mut self.character_reference_code {
                         Some(character_reference_code) => {
-                            character_reference_code.push((10, c as u32 - 0x30, Some(c)));
+                            character_reference_code.push((10, c as u32 - 0x30, Some(c as char)));
                         }
                         _ => {
                             unreachable!();
@@ -4666,7 +4669,7 @@ where
                     },
                     // U+003B SEMICOLON
                     // Switch to the numeric character reference end state.
-                    Some(';') => self.state = State::NumericCharacterReferenceEnd,
+                    Some(b';') => self.state = State::NumericCharacterReferenceEnd,
                     // Anything else
                     // This is a missing-semicolon-after-character-reference parse error.
                     // Reconsume in the numeric character reference end state.
@@ -4832,7 +4835,7 @@ where
                 raw.push_str(&old_temporary_buffer);
                 raw.push_str(&raw_char_ref);
 
-                if self.cur == Some(';') {
+                if self.cur == Some(b';') {
                     raw.push(';');
                 }
 
@@ -4855,8 +4858,8 @@ where
     }
 
     #[inline(always)]
-    fn skip_whitespaces(&mut self, c: char) {
-        if c == '\r' && self.input.cur() == Some('\n') {
+    fn skip_whitespaces(&mut self, c: u8) {
+        if c == b'\r' && self.input.cur() == Some(b'\n') {
             unsafe {
                 // Safety: cur() is Some
                 self.input.bump();
@@ -4868,8 +4871,13 @@ where
 // By spec '\r` removed before tokenizer, but we keep them to have better AST
 // and don't break logic to ignore characters
 #[inline(always)]
-fn is_spacy(c: char) -> bool {
-    matches!(c, '\x09' | '\x0a' | '\x0d' | '\x0c' | '\x20')
+fn is_spacy(c: u8) -> bool {
+    matches!(c, b'\x09' | b'\x0a' | b'\x0d' | b'\x0c' | b'\x20')
+}
+
+#[inline(always)]
+fn is_spacy_char(c: char) -> bool {
+    is_spacy(c as u8)
 }
 
 #[inline(always)]
@@ -4932,35 +4940,65 @@ fn is_noncharacter(c: u32) -> bool {
 }
 
 #[inline(always)]
-fn is_upper_hex_digit(c: char) -> bool {
-    matches!(c, '0'..='9' | 'A'..='F')
+fn is_upper_hex_digit(c: u8) -> bool {
+    matches!(c, b'0'..=b'9' | b'A'..=b'F')
 }
 
 #[inline(always)]
-fn is_lower_hex_digit(c: char) -> bool {
-    matches!(c, '0'..='9' | 'a'..='f')
+fn is_lower_hex_digit(c: u8) -> bool {
+    matches!(c, b'0'..=b'9' | b'a'..=b'f')
 }
 
 #[inline(always)]
-fn is_ascii_hex_digit(c: char) -> bool {
+fn is_ascii_hex_digit(c: u8) -> bool {
     is_upper_hex_digit(c) || is_lower_hex_digit(c)
 }
 
 #[inline(always)]
-fn is_ascii_upper_alpha(c: char) -> bool {
+fn is_upper_hex_digit_char(c: char) -> bool {
+    is_upper_hex_digit(c as u8)
+}
+
+#[inline(always)]
+fn is_lower_hex_digit_char(c: char) -> bool {
+    is_lower_hex_digit(c as u8)
+}
+
+#[inline(always)]
+fn is_ascii_hex_digit_char(c: char) -> bool {
+    is_ascii_hex_digit(c as u8)
+}
+
+#[inline(always)]
+fn is_ascii_upper_alpha(c: u8) -> bool {
     c.is_ascii_uppercase()
 }
 
 #[inline(always)]
-fn is_ascii_lower_alpha(c: char) -> bool {
+fn is_ascii_lower_alpha(c: u8) -> bool {
     c.is_ascii_lowercase()
 }
 
 #[inline(always)]
-fn is_ascii_alpha(c: char) -> bool {
+fn is_ascii_alpha(c: u8) -> bool {
     is_ascii_upper_alpha(c) || is_ascii_lower_alpha(c)
 }
 
+#[inline(always)]
+fn is_ascii_upper_alpha_char(c: char) -> bool {
+    c.is_ascii_uppercase()
+}
+
+#[inline(always)]
+fn is_ascii_lower_alpha_char(c: char) -> bool {
+    c.is_ascii_lowercase()
+}
+
+#[inline(always)]
+fn is_ascii_alpha_char(c: char) -> bool {
+    is_ascii_upper_alpha_char(c) || is_ascii_lower_alpha_char(c)
+}
+
 #[inline(always)]
 fn is_allowed_control_character(c: u32) -> bool {
     c != 0x00 && is_control(c)

From 98dc922a8fdb50f3f36cb65a3234032e2d84c9f3 Mon Sep 17 00:00:00 2001
From: DongYun Kang <kdy.1997.dev@gmail.com>
Date: Wed, 19 Nov 2025 17:18:18 -0500
Subject: [PATCH 05/20] fix ci?

---
 crates/swc_css_parser/src/lexer/mod.rs  |  46 ++++++--
 crates/swc_html_parser/src/lexer/mod.rs | 147 ++++++++++++++++++++----
 2 files changed, 162 insertions(+), 31 deletions(-)

diff --git a/crates/swc_css_parser/src/lexer/mod.rs b/crates/swc_css_parser/src/lexer/mod.rs
index 048f13b12706..1c29b02e8b68 100644
--- a/crates/swc_css_parser/src/lexer/mod.rs
+++ b/crates/swc_css_parser/src/lexer/mod.rs
@@ -963,6 +963,18 @@ where
     // will return a code point.
     fn read_escape(&mut self) -> LexResult<(char, String)> {
         self.with_sub_buf(|l, buf| {
+            // Get the full character before consuming (for non-ASCII)
+            let cur_byte = l.input.cur();
+            let cur_char = if let Some(b) = cur_byte {
+                if is_non_ascii(b) {
+                    l.input.cur_as_char()
+                } else {
+                    Some(b as char)
+                }
+            } else {
+                None
+            };
+
             // Consume the next input code point.
             match l.consume() {
                 // hex digit
@@ -1027,9 +1039,10 @@ where
                 // anything else
                 // Return the current input code point.
                 Some(c) => {
-                    buf.push(c as char);
+                    let ch = cur_char.unwrap_or(c as char);
+                    buf.push(ch);
 
-                    Ok((c as char, (&**buf).into()))
+                    Ok((ch, (&**buf).into()))
                 }
             }
         })
@@ -1178,20 +1191,37 @@ where
 
             // Repeatedly consume the next input code point from the stream:
             loop {
-                match l.consume() {
+                // For non-ASCII bytes, we need to get the full UTF-8 character before consuming
+                let cur_byte = l.input.cur();
+                let cur_char = if let Some(b) = cur_byte {
+                    if is_non_ascii(b) {
+                        l.input.cur_as_char()
+                    } else {
+                        Some(b as char)
+                    }
+                } else {
+                    None
+                };
+
+                let c = l.consume();
+
+                match c {
                     // name code point
                     // Append the code point to result.
-                    Some(c) if is_name(c) => {
-                        buf.push(c as char);
-                        raw.push(c as char);
+                    Some(byte) if is_name(byte) => {
+                        // Use the full character we got earlier
+                        if let Some(ch) = cur_char {
+                            buf.push(ch);
+                            raw.push(ch);
+                        }
                     }
                     // the stream starts with a valid escape
                     // Consume an escaped code point. Append the returned code point to result.
-                    Some(c) if l.is_valid_escape(None, None) => {
+                    Some(byte) if l.is_valid_escape(None, None) => {
                         let escaped = l.read_escape()?;
 
                         buf.push(escaped.0);
-                        raw.push(c as char);
+                        raw.push(byte as char);
                         raw.push_str(&escaped.1);
                     }
                     // anything else
diff --git a/crates/swc_html_parser/src/lexer/mod.rs b/crates/swc_html_parser/src/lexer/mod.rs
index 8d98f63b9e59..4dc796078e30 100644
--- a/crates/swc_html_parser/src/lexer/mod.rs
+++ b/crates/swc_html_parser/src/lexer/mod.rs
@@ -119,6 +119,9 @@ where
     character_reference_code: Option<Vec<(u8, u32, Option<char>)>>,
     temporary_buffer: String,
     is_adjusted_current_node_is_element_in_html_namespace: Option<bool>,
+    /// The full UTF-8 character corresponding to the current byte (for
+    /// non-ASCII)
+    current_char: Option<char>,
     phantom: std::marker::PhantomData<&'a ()>,
 }
 
@@ -149,6 +152,7 @@ where
             // Do this without a new allocation.
             temporary_buffer: String::with_capacity(33),
             is_adjusted_current_node_is_element_in_html_namespace: None,
+            current_char: None,
             phantom: std::marker::PhantomData,
         };
 
@@ -276,6 +280,17 @@ where
         // consumed.
         let c = self.next();
 
+        // Store the full UTF-8 character before consuming (for helper functions)
+        if let Some(byte) = c {
+            if is_non_ascii(byte) {
+                self.current_char = self.input.cur_as_char();
+            } else {
+                self.current_char = Some(byte as char);
+            }
+        } else {
+            self.current_char = None;
+        }
+
         self.consume();
 
         c
@@ -421,7 +436,13 @@ where
                 sub_buf.push('\n');
             }
         } else {
-            sub_buf.push(c as char);
+            let ch = if is_non_ascii(c) {
+                self.current_char.unwrap_or(c as char)
+            } else {
+                c as char
+            };
+
+            sub_buf.push(ch);
         }
     }
 
@@ -435,15 +456,30 @@ where
         let mut buf = b.borrow_mut();
 
         if let Some(name) = name {
-            buf.push(name as char);
+            let ch = if is_non_ascii(name) {
+                self.input.cur_as_char().unwrap_or(name as char)
+            } else {
+                name as char
+            };
+            buf.push(ch);
         }
 
         if let Some(public_id) = public_id {
-            buf.push(public_id as char);
+            let ch = if is_non_ascii(public_id) {
+                self.input.cur_as_char().unwrap_or(public_id as char)
+            } else {
+                public_id as char
+            };
+            buf.push(ch);
         }
 
         if let Some(system_id) = system_id {
-            buf.push(system_id as char);
+            let ch = if is_non_ascii(system_id) {
+                self.input.cur_as_char().unwrap_or(system_id as char)
+            } else {
+                system_id as char
+            };
+            buf.push(ch);
         }
     }
 
@@ -456,8 +492,14 @@ where
         let b = self.sub_buf.clone();
         let mut sub_buf = b.borrow_mut();
 
-        buf.push((c as char).to_ascii_lowercase());
-        sub_buf.push(c as char);
+        let ch = if is_non_ascii(c) {
+            self.current_char.unwrap_or(c as char)
+        } else {
+            c as char
+        };
+
+        buf.push(ch.to_ascii_lowercase());
+        sub_buf.push(ch);
 
         let value = self.input.uncons_while(f);
 
@@ -489,8 +531,14 @@ where
                 sub_buf.push('\n');
             }
         } else {
-            buf.push(c as char);
-            sub_buf.push(c as char);
+            let ch = if is_non_ascii(c) {
+                self.current_char.unwrap_or(c as char)
+            } else {
+                c as char
+            };
+
+            buf.push(ch);
+            sub_buf.push(ch);
         }
 
         let value = self.input.uncons_while(f);
@@ -523,8 +571,14 @@ where
                 sub_buf.push('\n');
             }
         } else {
-            buf.push(c as char);
-            sub_buf.push(c as char);
+            let ch = if is_non_ascii(c) {
+                self.current_char.unwrap_or(c as char)
+            } else {
+                c as char
+            };
+
+            buf.push(ch);
+            sub_buf.push(ch);
         }
 
         let value = self.input.uncons_while(f);
@@ -660,8 +714,14 @@ where
         let b = self.sub_buf.clone();
         let mut sub_buf = b.borrow_mut();
 
-        buf.push((c as char).to_ascii_lowercase());
-        sub_buf.push(c as char);
+        let ch = if is_non_ascii(c) {
+            self.current_char.unwrap_or(c as char)
+        } else {
+            c as char
+        };
+
+        buf.push(ch.to_ascii_lowercase());
+        sub_buf.push(ch);
 
         let value = self.input.uncons_while(f);
 
@@ -718,8 +778,20 @@ where
         let b = self.sub_buf.clone();
         let mut sub_buf = b.borrow_mut();
 
-        buf.push(c as char);
-        sub_buf.push(raw_c as char);
+        let ch = if is_non_ascii(c) {
+            self.current_char.unwrap_or(c as char)
+        } else {
+            c as char
+        };
+
+        let raw_ch = if is_non_ascii(raw_c) {
+            self.input.cur_as_char().unwrap_or(raw_c as char)
+        } else {
+            raw_c as char
+        };
+
+        buf.push(ch);
+        sub_buf.push(raw_ch);
     }
 
     fn consume_and_append_to_attribute_token_name<F>(&mut self, c: u8, f: F)
@@ -731,8 +803,14 @@ where
         let b = self.sub_buf.clone();
         let mut sub_buf = b.borrow_mut();
 
-        buf.push((c as char).to_ascii_lowercase());
-        sub_buf.push(c as char);
+        let ch = if is_non_ascii(c) {
+            self.current_char.unwrap_or(c as char)
+        } else {
+            c as char
+        };
+
+        buf.push(ch.to_ascii_lowercase());
+        sub_buf.push(ch);
 
         let value = self.input.uncons_while(f);
 
@@ -749,10 +827,16 @@ where
         let b = self.sub_buf.clone();
         let mut sub_buf = b.borrow_mut();
 
-        buf.push((c as char).to_ascii_lowercase());
-        sub_buf.push(c as char);
+        let ch = if is_non_ascii(c) {
+            self.current_char.unwrap_or(c as char)
+        } else {
+            c as char
+        };
+
+        buf.push(ch.to_ascii_lowercase());
+        sub_buf.push(ch);
 
-        self.temporary_buffer.push(c as char);
+        self.temporary_buffer.push(ch);
 
         let value = self.input.uncons_while(f);
 
@@ -857,8 +941,14 @@ where
                 sub_buf.push('\n');
             }
         } else {
-            buf.push(c as char);
-            sub_buf.push(c as char);
+            let ch = if is_non_ascii(c) {
+                self.current_char.unwrap_or(c as char)
+            } else {
+                c as char
+            };
+
+            buf.push(ch);
+            sub_buf.push(ch);
         }
 
         let value = self.input.uncons_while(f);
@@ -988,8 +1078,14 @@ where
                 sub_buf.push('\n');
             }
         } else {
-            buf.push(c as char);
-            sub_buf.push(c as char);
+            let ch = if is_non_ascii(c) {
+                self.current_char.unwrap_or(c as char)
+            } else {
+                c as char
+            };
+
+            buf.push(ch);
+            sub_buf.push(ch);
         }
 
         let value = self.input.uncons_while(f);
@@ -5014,3 +5110,8 @@ fn is_allowed_character(c: char) -> bool {
 
     return true;
 }
+
+#[inline(always)]
+fn is_non_ascii(c: u8) -> bool {
+    c >= 0x80
+}

From bb477af05cb2b03471f7c15ce67f03069f9a0539 Mon Sep 17 00:00:00 2001
From: DongYun Kang <kdy.1997.dev@gmail.com>
Date: Thu, 20 Nov 2025 04:37:06 -0500
Subject: [PATCH 06/20] fix(parser): Fix CI errors after byte conversion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove unnecessary `as u8` cast in ECMAScript lexer (clippy error)
- Fix XML parser to use `cur_as_char()` and byte literals for char comparisons
- Fix HTML parser current_char usage (already in staged changes)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 crates/swc_ecma_parser/src/lexer/mod.rs |  2 +-
 crates/swc_html_parser/src/lexer/mod.rs | 54 +++++++++++++++++++------
 crates/swc_xml_parser/src/lexer/mod.rs  | 26 ++++++------
 3 files changed, 55 insertions(+), 27 deletions(-)

diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs
index a802304c98eb..6aff7668c428 100644
--- a/crates/swc_ecma_parser/src/lexer/mod.rs
+++ b/crates/swc_ecma_parser/src/lexer/mod.rs
@@ -2198,7 +2198,7 @@ impl<'a> Lexer<'a> {
     fn read_str_lit(&mut self) -> LexResult<Token> {
         debug_assert!(self.cur() == Some(b'\'') || self.cur() == Some(b'"'));
         let start = self.cur_pos();
-        let quote = self.cur().unwrap() as u8;
+        let quote = self.cur().unwrap();
 
         self.bump(); // '"' or '\''
 
diff --git a/crates/swc_html_parser/src/lexer/mod.rs b/crates/swc_html_parser/src/lexer/mod.rs
index 4dc796078e30..bc9888eeb177 100644
--- a/crates/swc_html_parser/src/lexer/mod.rs
+++ b/crates/swc_html_parser/src/lexer/mod.rs
@@ -457,7 +457,7 @@ where
 
         if let Some(name) = name {
             let ch = if is_non_ascii(name) {
-                self.input.cur_as_char().unwrap_or(name as char)
+                self.current_char.unwrap_or(name as char)
             } else {
                 name as char
             };
@@ -466,7 +466,7 @@ where
 
         if let Some(public_id) = public_id {
             let ch = if is_non_ascii(public_id) {
-                self.input.cur_as_char().unwrap_or(public_id as char)
+                self.current_char.unwrap_or(public_id as char)
             } else {
                 public_id as char
             };
@@ -475,7 +475,7 @@ where
 
         if let Some(system_id) = system_id {
             let ch = if is_non_ascii(system_id) {
-                self.input.cur_as_char().unwrap_or(system_id as char)
+                self.current_char.unwrap_or(system_id as char)
             } else {
                 system_id as char
             };
@@ -785,7 +785,7 @@ where
         };
 
         let raw_ch = if is_non_ascii(raw_c) {
-            self.input.cur_as_char().unwrap_or(raw_c as char)
+            self.current_char.unwrap_or(raw_c as char)
         } else {
             raw_c as char
         };
@@ -1115,8 +1115,13 @@ where
 
     #[inline(always)]
     fn emit_character_token(&mut self, value: u8) {
+        let ch = if is_non_ascii(value) {
+            self.current_char.unwrap_or(value as char)
+        } else {
+            value as char
+        };
         self.emit_token(Token::Character {
-            value: value as char,
+            value: ch,
             raw: Some(Raw::Same),
         });
     }
@@ -1126,7 +1131,12 @@ where
         let b = self.buf.clone();
         let mut buf = b.borrow_mut();
 
-        buf.push(raw_c as char);
+        let raw_ch = if is_non_ascii(raw_c) {
+            self.current_char.unwrap_or(raw_c as char)
+        } else {
+            raw_c as char
+        };
+        buf.push(raw_ch);
 
         self.emit_token(Token::Character {
             value: c,
@@ -1160,8 +1170,13 @@ where
 
             buf.clear();
         } else {
+            let ch = if is_non_ascii(c) {
+                self.current_char.unwrap_or(c as char)
+            } else {
+                c as char
+            };
             self.emit_token(Token::Character {
-                value: c as char,
+                value: ch,
                 raw: Some(Raw::Same),
             });
         }
@@ -2539,7 +2554,9 @@ where
                     // REPLACEMENT CHARACTER character to the current attribute's name.
                     Some(c @ b'\x00') => {
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
-                        self.append_to_attribute_token_name(REPLACEMENT_CHARACTER as u8, c);
+                        let b = self.buf.clone();
+                        let mut buf = b.borrow_mut();
+                        buf.push(REPLACEMENT_CHARACTER);
                     }
                     // U+0022 QUOTATION MARK (")
                     // U+0027 APOSTROPHE (')
@@ -3552,7 +3569,10 @@ where
                     Some(c @ b'\x00') => {
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
-                        self.append_to_doctype_token(Some(REPLACEMENT_CHARACTER as u8), None, None);
+
+                        let b = self.buf.clone();
+                        let mut buf = b.borrow_mut();
+                        buf.push(REPLACEMENT_CHARACTER);
                     }
                     // EOF
                     // This is an eof-in-doctype parse error. Set the current DOCTYPE token's
@@ -3834,7 +3854,9 @@ where
                     Some(c @ b'\x00') => {
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
-                        self.append_to_doctype_token(None, Some(REPLACEMENT_CHARACTER as u8), None);
+                        let b = self.buf.clone();
+                        let mut buf = b.borrow_mut();
+                        buf.push(REPLACEMENT_CHARACTER);
                     }
                     // U+003E GREATER-THAN SIGN (>)
                     // This is an abrupt-doctype-public-identifier parse error. Set the current
@@ -3894,7 +3916,9 @@ where
                     Some(c @ b'\x00') => {
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
-                        self.append_to_doctype_token(None, Some(REPLACEMENT_CHARACTER as u8), None);
+                        let b = self.buf.clone();
+                        let mut buf = b.borrow_mut();
+                        buf.push(REPLACEMENT_CHARACTER);
                     }
                     // U+003E GREATER-THAN SIGN (>)
                     // This is an abrupt-doctype-public-identifier parse error. Set the current
@@ -4216,7 +4240,9 @@ where
                     Some(c @ b'\x00') => {
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
-                        self.append_to_doctype_token(None, None, Some(REPLACEMENT_CHARACTER as u8));
+                        let b = self.buf.clone();
+                        let mut buf = b.borrow_mut();
+                        buf.push(REPLACEMENT_CHARACTER);
                     }
                     // U+003E GREATER-THAN SIGN (>)
                     // This is an abrupt-doctype-system-identifier parse error. Set the current
@@ -4276,7 +4302,9 @@ where
                     Some(c @ b'\x00') => {
                         self.append_raw_to_doctype_token(c);
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
-                        self.append_to_doctype_token(None, None, Some(REPLACEMENT_CHARACTER as u8));
+                        let b = self.buf.clone();
+                        let mut buf = b.borrow_mut();
+                        buf.push(REPLACEMENT_CHARACTER);
                     }
                     // U+003E GREATER-THAN SIGN (>)
                     // This is an abrupt-doctype-system-identifier parse error. Set the current
diff --git a/crates/swc_xml_parser/src/lexer/mod.rs b/crates/swc_xml_parser/src/lexer/mod.rs
index 95b4c5057614..ebac6ad14726 100644
--- a/crates/swc_xml_parser/src/lexer/mod.rs
+++ b/crates/swc_xml_parser/src/lexer/mod.rs
@@ -173,9 +173,9 @@ where
 
         // A leading Byte Order Mark (BOM) causes the character encoding argument to be
         // ignored and will itself be skipped.
-        if lexer.input.is_at_start() && lexer.input.cur() == Some('\u{feff}') {
+        if lexer.input.is_at_start() && lexer.input.cur_as_char() == Some('\u{feff}') {
             unsafe {
-                // Safety: cur() is Some('\u{feff}')
+                // Safety: cur_as_char() is Some('\u{feff}')
                 lexer.input.bump();
             }
         }
@@ -224,7 +224,7 @@ where
 {
     #[inline(always)]
     fn next(&mut self) -> Option<char> {
-        self.input.cur()
+        self.input.cur_as_char()
     }
 
     // Any occurrences of surrogates are surrogate-in-input-stream parse errors. Any
@@ -249,12 +249,12 @@ where
 
     #[inline(always)]
     fn consume(&mut self) {
-        self.cur = self.input.cur();
+        self.cur = self.input.cur_as_char();
         self.cur_pos = self.input.cur_pos();
 
         if self.cur.is_some() {
             unsafe {
-                // Safety: cur() is Some(c)
+                // Safety: cur_as_char() is Some(c)
                 self.input.bump();
             }
         }
@@ -573,9 +573,9 @@ where
 
                 raw.push(c);
 
-                if self.input.cur() == Some('\n') {
+                if self.input.cur() == Some(b'\n') {
                     unsafe {
-                        // Safety: cur() is Some('\n')
+                        // Safety: cur() is Some(b'\n')
                         self.input.bump();
                     }
 
@@ -895,9 +895,9 @@ where
 
                 raw_c.push(c);
 
-                if self.input.cur() == Some('\n') {
+                if self.input.cur() == Some(b'\n') {
                     unsafe {
-                        // Safety: cur() is Some('\n')
+                        // Safety: cur() is Some(b'\n')
                         self.input.bump();
                     }
 
@@ -962,9 +962,9 @@ where
 
             raw.push(c);
 
-            if self.input.cur() == Some('\n') {
+            if self.input.cur() == Some(b'\n') {
                 unsafe {
-                    // Safety: cur() is Some('\n')
+                    // Safety: cur() is Some(b'\n')
                     self.input.bump();
                 }
 
@@ -3104,9 +3104,9 @@ where
 
     #[inline(always)]
     fn skip_next_lf(&mut self, c: char) {
-        if c == '\r' && self.input.cur() == Some('\n') {
+        if c == '\r' && self.input.cur() == Some(b'\n') {
             unsafe {
-                // Safety: cur() is Some('\n')
+                // Safety: cur() is Some(b'\n')
                 self.input.bump();
             }
         }

From 6331ce0e7b538343ac065a7e0c022d9443a7e04b Mon Sep 17 00:00:00 2001
From: DongYun Kang <kdy.1997.dev@gmail.com>
Date: Thu, 20 Nov 2025 06:37:21 -0500
Subject: [PATCH 07/20] fix(parser): Fix UTF-8 multibyte character handling
 after byte conversion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes issues where multibyte UTF-8 characters (like '⬇' and '😀') were being corrupted during parsing after the char-to-byte conversion. The problem was that individual UTF-8 bytes were being cast directly to char using `as char`, which treats each byte as a separate Unicode code point.

**Changes made:**

1. **CSS Parser** (`swc_css_parser`):
   - Updated string tokenization to use `Input::cur_as_char()` for non-ASCII bytes
   - Fixed URL tokenization to properly decode multibyte UTF-8 sequences
   - Fixed bad URL remnant parsing

2. **HTML Parser** (`swc_html_parser`):
   - Removed truly unused functions (`is_upper_hex_digit_char`, `is_lower_hex_digit_char`, `is_ascii_hex_digit_char`, `is_ascii_lower_alpha_char`, `is_ascii_alpha_char`, `append_to_doctype_token`)
   - Kept `is_ascii_upper_alpha_char` which is still in use
   - HTML parser already had correct UTF-8 handling via `current_char` field

**Implementation:**
Before consuming bytes, we now check if it's non-ASCII (`>= 0x80`) and call `cur_as_char()` to get the full UTF-8 character before advancing the input stream. For ASCII bytes, we continue using the fast path of `as char`.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 crates/swc_css_parser/src/lexer/mod.rs  | 78 +++++++++++++++++++++----
 crates/swc_html_parser/src/lexer/mod.rs | 62 --------------------
 2 files changed, 67 insertions(+), 73 deletions(-)

diff --git a/crates/swc_css_parser/src/lexer/mod.rs b/crates/swc_css_parser/src/lexer/mod.rs
index 1c29b02e8b68..4820b9670d81 100644
--- a/crates/swc_css_parser/src/lexer/mod.rs
+++ b/crates/swc_css_parser/src/lexer/mod.rs
@@ -723,6 +723,18 @@ where
 
             // Repeatedly consume the next input code point from the stream:
             loop {
+                // Get the full character before consuming (for non-ASCII)
+                let cur_byte = l.input.cur();
+                let cur_char = if let Some(b) = cur_byte {
+                    if is_non_ascii(b) {
+                        l.input.cur_as_char()
+                    } else {
+                        Some(b as char)
+                    }
+                } else {
+                    None
+                };
+
                 match l.consume() {
                     // ending code point
                     // Return the <string-token>.
@@ -784,9 +796,11 @@ where
 
                     // Anything else
                     // Append the current input code point to the <string-token>'s value.
-                    Some(c) => {
-                        buf.push(c as char);
-                        raw.push(c as char);
+                    Some(_) => {
+                        if let Some(ch) = cur_char {
+                            buf.push(ch);
+                            raw.push(ch);
+                        }
                     }
                 }
             }
@@ -808,9 +822,15 @@ where
             // Consume as much whitespace as possible.
             while let Some(c) = l.next() {
                 if is_whitespace(c) {
+                    // Get char before consuming
+                    let ch = if is_non_ascii(c) {
+                        l.input.cur_as_char().unwrap_or(c as char)
+                    } else {
+                        c as char
+                    };
                     l.consume();
 
-                    raw.push(c as char);
+                    raw.push(ch);
                 } else {
                     break;
                 }
@@ -818,6 +838,18 @@ where
 
             // Repeatedly consume the next input code point from the stream:
             loop {
+                // Get the full character before consuming (for non-ASCII)
+                let cur_byte = l.input.cur();
+                let cur_char = if let Some(b) = cur_byte {
+                    if is_non_ascii(b) {
+                        l.input.cur_as_char()
+                    } else {
+                        Some(b as char)
+                    }
+                } else {
+                    None
+                };
+
                 match l.consume() {
                     // U+0029 RIGHT PARENTHESIS ())
                     // Return the <url-token>.
@@ -843,13 +875,21 @@ where
                     Some(c) if is_whitespace(c) => {
                         // Consume as much whitespace as possible.
                         let whitespaces: String = l.with_sub_buf(|l, buf| {
-                            buf.push(c as char);
+                            if let Some(ch) = cur_char {
+                                buf.push(ch);
+                            }
 
                             while let Some(c) = l.next() {
                                 if is_whitespace(c) {
+                                    // Get char before consuming
+                                    let ch = if is_non_ascii(c) {
+                                        l.input.cur_as_char().unwrap_or(c as char)
+                                    } else {
+                                        c as char
+                                    };
                                     l.consume();
 
-                                    buf.push(c as char);
+                                    buf.push(ch);
                                 } else {
                                     break;
                                 }
@@ -947,9 +987,11 @@ where
 
                     // anything else
                     // Append the current input code point to the <url-token>'s value.
-                    Some(c) => {
-                        out.push(c as char);
-                        raw.push(c as char);
+                    Some(_) => {
+                        if let Some(ch) = cur_char {
+                            out.push(ch);
+                            raw.push(ch);
+                        }
                     }
                 }
             }
@@ -1362,6 +1404,18 @@ where
         self.with_sub_buf(|l, raw| {
             // Repeatedly consume the next input code point from the stream:
             loop {
+                // Get the full character before consuming (for non-ASCII)
+                let cur_byte = l.input.cur();
+                let cur_char = if let Some(b) = cur_byte {
+                    if is_non_ascii(b) {
+                        l.input.cur_as_char()
+                    } else {
+                        Some(b as char)
+                    }
+                } else {
+                    None
+                };
+
                 match l.consume() {
                     // U+0029 RIGHT PARENTHESIS ())
                     // EOF
@@ -1385,8 +1439,10 @@ where
                     }
                     // anything else
                     // Do nothing.
-                    Some(c) => {
-                        raw.push(c as char);
+                    Some(_) => {
+                        if let Some(ch) = cur_char {
+                            raw.push(ch);
+                        }
                     }
                 }
             }
diff --git a/crates/swc_html_parser/src/lexer/mod.rs b/crates/swc_html_parser/src/lexer/mod.rs
index bc9888eeb177..d7e30e0dc4cc 100644
--- a/crates/swc_html_parser/src/lexer/mod.rs
+++ b/crates/swc_html_parser/src/lexer/mod.rs
@@ -446,43 +446,6 @@ where
         }
     }
 
-    fn append_to_doctype_token(
-        &mut self,
-        name: Option<u8>,
-        public_id: Option<u8>,
-        system_id: Option<u8>,
-    ) {
-        let b = self.buf.clone();
-        let mut buf = b.borrow_mut();
-
-        if let Some(name) = name {
-            let ch = if is_non_ascii(name) {
-                self.current_char.unwrap_or(name as char)
-            } else {
-                name as char
-            };
-            buf.push(ch);
-        }
-
-        if let Some(public_id) = public_id {
-            let ch = if is_non_ascii(public_id) {
-                self.current_char.unwrap_or(public_id as char)
-            } else {
-                public_id as char
-            };
-            buf.push(ch);
-        }
-
-        if let Some(system_id) = system_id {
-            let ch = if is_non_ascii(system_id) {
-                self.current_char.unwrap_or(system_id as char)
-            } else {
-                system_id as char
-            };
-            buf.push(ch);
-        }
-    }
-
     fn consume_and_append_to_doctype_token_name<F>(&mut self, c: u8, f: F)
     where
         F: Fn(char) -> bool,
@@ -5078,21 +5041,6 @@ fn is_ascii_hex_digit(c: u8) -> bool {
     is_upper_hex_digit(c) || is_lower_hex_digit(c)
 }
 
-#[inline(always)]
-fn is_upper_hex_digit_char(c: char) -> bool {
-    is_upper_hex_digit(c as u8)
-}
-
-#[inline(always)]
-fn is_lower_hex_digit_char(c: char) -> bool {
-    is_lower_hex_digit(c as u8)
-}
-
-#[inline(always)]
-fn is_ascii_hex_digit_char(c: char) -> bool {
-    is_ascii_hex_digit(c as u8)
-}
-
 #[inline(always)]
 fn is_ascii_upper_alpha(c: u8) -> bool {
     c.is_ascii_uppercase()
@@ -5113,16 +5061,6 @@ fn is_ascii_upper_alpha_char(c: char) -> bool {
     c.is_ascii_uppercase()
 }
 
-#[inline(always)]
-fn is_ascii_lower_alpha_char(c: char) -> bool {
-    c.is_ascii_lowercase()
-}
-
-#[inline(always)]
-fn is_ascii_alpha_char(c: char) -> bool {
-    is_ascii_upper_alpha_char(c) || is_ascii_lower_alpha_char(c)
-}
-
 #[inline(always)]
 fn is_allowed_control_character(c: u32) -> bool {
     c != 0x00 && is_control(c)

From 803ec47debf6f54eb1a5bce36bf7920c395dd3d6 Mon Sep 17 00:00:00 2001
From: DongYun Kang <kdy.1997.dev@gmail.com>
Date: Thu, 20 Nov 2025 07:08:58 -0500
Subject: [PATCH 08/20] fi xci

---
 crates/swc_ecma_lexer/src/common/lexer/mod.rs |  2 +-
 crates/swc_html_parser/src/lexer/mod.rs       | 16 +++++++++++++---
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/crates/swc_ecma_lexer/src/common/lexer/mod.rs b/crates/swc_ecma_lexer/src/common/lexer/mod.rs
index 0c4da3a6b6c7..3bc438454779 100644
--- a/crates/swc_ecma_lexer/src/common/lexer/mod.rs
+++ b/crates/swc_ecma_lexer/src/common/lexer/mod.rs
@@ -2055,7 +2055,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
     fn read_str_lit(&mut self) -> LexResult<Self::Token> {
         debug_assert!(self.cur() == Some(b'\'') || self.cur() == Some(b'"'));
         let start = self.cur_pos();
-        let quote = self.cur().unwrap() as u8;
+        let quote = self.cur().unwrap();
 
         self.bump(); // '"' or '\''
 
diff --git a/crates/swc_html_parser/src/lexer/mod.rs b/crates/swc_html_parser/src/lexer/mod.rs
index d7e30e0dc4cc..9b2812edd93d 100644
--- a/crates/swc_html_parser/src/lexer/mod.rs
+++ b/crates/swc_html_parser/src/lexer/mod.rs
@@ -233,7 +233,12 @@ where
     // `anything else`
     #[inline(always)]
     fn validate_input_stream_character(&mut self, c: u8) {
-        let code = (c as char) as u32;
+        let ch = if is_non_ascii(c) {
+            self.current_char.unwrap_or(c as char)
+        } else {
+            c as char
+        };
+        let code = ch as u32;
 
         if is_surrogate(code) {
             self.emit_error(ErrorKind::SurrogateInInputStream);
@@ -2515,7 +2520,7 @@ where
                     // U+0000 NULL
                     // This is an unexpected-null-character parse error. Append a U+FFFD
                     // REPLACEMENT CHARACTER character to the current attribute's name.
-                    Some(c @ b'\x00') => {
+                    Some(_c @ b'\x00') => {
                         self.emit_error(ErrorKind::UnexpectedNullCharacter);
                         let b = self.buf.clone();
                         let mut buf = b.borrow_mut();
@@ -3493,7 +3498,12 @@ where
                         self.validate_input_stream_character(c);
                         self.append_raw_to_doctype_token(c);
                         self.create_doctype_token();
-                        self.set_doctype_token_name(c as char);
+                        let ch = if is_non_ascii(c) {
+                            self.current_char.unwrap_or(c as char)
+                        } else {
+                            c as char
+                        };
+                        self.set_doctype_token_name(ch);
                         self.state = State::DoctypeName;
                     }
                 }

From 09283cefd1e3c0c6c7887b05564cf50efeab8ce1 Mon Sep 17 00:00:00 2001
From: DongYun Kang <kdy.1997.dev@gmail.com>
Date: Tue, 25 Nov 2025 21:16:44 +0900
Subject: [PATCH 09/20] bump_bytes

---
 crates/swc_common/src/input.rs            | 11 ++++
 crates/swc_css_parser/src/lexer/mod.rs    | 17 ++++--
 crates/swc_ecma_parser/src/lexer/mod.rs   | 70 ++++++++--------------
 crates/swc_ecma_parser/src/lexer/table.rs | 10 +---
 crates/swc_html_parser/src/lexer/mod.rs   | 71 ++++++++++-------------
 5 files changed, 78 insertions(+), 101 deletions(-)

diff --git a/crates/swc_common/src/input.rs b/crates/swc_common/src/input.rs
index 24457fa94fba..65a8986a5112 100644
--- a/crates/swc_common/src/input.rs
+++ b/crates/swc_common/src/input.rs
@@ -155,6 +155,13 @@ impl<'a> Input<'a> for StringInput<'a> {
         self.last_pos = self.last_pos + BytePos(len as u32);
     }
 
+    #[inline]
+    fn bump_bytes(&mut self, n: usize) {
+        debug_assert!(n <= self.remaining.len());
+        self.remaining = unsafe { self.remaining.get_unchecked(n..) };
+        self.last_pos.0 += n as u32;
+    }
+
     #[inline]
     fn cur_as_ascii(&self) -> Option<u8> {
         let first_byte = *self.remaining.as_bytes().first()?;
@@ -287,6 +294,10 @@ pub trait Input<'a>: Clone {
     /// when the Input is not empty.
     unsafe fn bump(&mut self);
 
+    /// Advances the input by exactly `n` bytes.
+    /// Unlike `bump()`, this does not calculate UTF-8 character boundaries.
+    fn bump_bytes(&mut self, n: usize);
+
     /// Returns the current byte as ASCII if it's valid ASCII (0x00-0x7F).
     /// Returns [None] if it's end of input or if the byte is not ASCII.
     #[inline]
diff --git a/crates/swc_css_parser/src/lexer/mod.rs b/crates/swc_css_parser/src/lexer/mod.rs
index 4820b9670d81..53376b2590f2 100644
--- a/crates/swc_css_parser/src/lexer/mod.rs
+++ b/crates/swc_css_parser/src/lexer/mod.rs
@@ -220,11 +220,18 @@ where
         self.cur = cur;
         self.cur_pos = self.input.last_pos();
 
-        if cur.is_some() {
-            unsafe {
-                // Safety: cur is Some
-                self.input.bump();
-            }
+        if let Some(byte) = cur {
+            // Calculate the number of bytes in this UTF-8 character
+            let len = if byte < 0x80 {
+                1 // ASCII
+            } else if byte < 0xe0 {
+                2 // 2-byte UTF-8
+            } else if byte < 0xf0 {
+                3 // 3-byte UTF-8
+            } else {
+                4 // 4-byte UTF-8
+            };
+            self.input.bump_bytes(len);
         }
 
         cur
diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs
index 6aff7668c428..484ac21d321b 100644
--- a/crates/swc_ecma_parser/src/lexer/mod.rs
+++ b/crates/swc_ecma_parser/src/lexer/mod.rs
@@ -244,17 +244,11 @@ impl<'a> Lexer<'a> {
     fn read_token_plus_minus<const C: u8>(&mut self) -> LexResult<Token> {
         let start = self.cur_pos();
 
-        unsafe {
-            // Safety: cur() is Some(c), if this method is called.
-            self.input.bump();
-        }
+        self.bump();
 
         // '++', '--'
         Ok(if self.input.cur() == Some(C) {
-            unsafe {
-                // Safety: cur() is Some(c)
-                self.input.bump();
-            }
+            self.bump();
 
             // Handle -->
             if self.state.had_line_break && C == b'-' && self.eat(b'>') {
@@ -286,10 +280,7 @@ impl<'a> Lexer<'a> {
         let start = self.cur_pos();
         let had_line_break_before_last = self.had_line_break_before_last();
 
-        unsafe {
-            // Safety: cur() is Some(c) if this method is called.
-            self.input.bump();
-        }
+        self.bump();
 
         Ok(if self.input.eat_byte(b'=') {
             // "=="
@@ -543,9 +534,18 @@ impl<'a> Lexer<'a> {
 
     #[inline(always)]
     fn bump(&mut self) {
-        unsafe {
-            // Safety: Actually this is not safe but this is an internal method.
-            self.input_mut().bump()
+        if let Some(byte) = self.input().cur() {
+            // Calculate the number of bytes in this UTF-8 character
+            let len = if byte < 0x80 {
+                1 // ASCII
+            } else if byte < 0xe0 {
+                2 // 2-byte UTF-8
+            } else if byte < 0xf0 {
+                3 // 3-byte UTF-8
+            } else {
+                4 // 4-byte UTF-8
+            };
+            self.input_mut().bump_bytes(len);
         }
     }
 
@@ -929,10 +929,7 @@ impl<'a> Lexer<'a> {
                     }
 
                     // Ignore this _ character
-                    unsafe {
-                        // Safety: cur() returns Some(c) where c is a valid char
-                        self.input_mut().bump();
-                    }
+                    self.bump();
 
                     continue;
                 }
@@ -1349,10 +1346,7 @@ impl<'a> Lexer<'a> {
     fn read_jsx_str(&mut self, quote: char) -> LexResult<Token> {
         debug_assert!(self.syntax().jsx());
         let start = self.input().cur_pos();
-        unsafe {
-            // Safety: cur() was Some(quote)
-            self.input_mut().bump(); // `quote`
-        }
+        self.bump(); // `quote`
         let mut out = String::new();
         let mut chunk_start = self.input().cur_pos();
         loop {
@@ -1416,10 +1410,7 @@ impl<'a> Lexer<'a> {
 
                 chunk_start = cur_pos + BytePos(ch.len_utf8() as _);
             } else {
-                unsafe {
-                    // Safety: cur() was Some(ch)
-                    self.input_mut().bump();
-                }
+                self.bump();
             }
         }
         let s = unsafe {
@@ -1719,10 +1710,7 @@ impl<'a> Lexer<'a> {
             _ => c,
         };
 
-        unsafe {
-            // Safety: cur() is Some(c) if this method is called.
-            self.input_mut().bump();
-        }
+        self.bump();
 
         Ok(CodePoint::from_u32(c as u32))
     }
@@ -2074,10 +2062,7 @@ impl<'a> Lexer<'a> {
         let had_line_break_before_last = self.had_line_break_before_last();
         let start = self.cur_pos();
 
-        unsafe {
-            // Safety: cur() is Some(c as char)
-            self.input_mut().bump();
-        }
+        self.bump();
         let token = if is_bit_and {
             Token::Ampersand
         } else {
@@ -2096,16 +2081,10 @@ impl<'a> Lexer<'a> {
 
         // '||', '&&'
         if self.input().cur() == Some(C) {
-            unsafe {
-                // Safety: cur() is Some(c)
-                self.input_mut().bump();
-            }
+            self.bump();
 
             if self.input().cur() == Some(b'=') {
-                unsafe {
-                    // Safety: cur() is Some('=')
-                    self.input_mut().bump();
-                }
+                self.bump();
 
                 return Ok(if is_bit_and {
                     Token::LogicalAndEq
@@ -2252,10 +2231,7 @@ impl<'a> Lexer<'a> {
                         self.wtf8_atom(Wtf8::from_str(s))
                     };
 
-                    unsafe {
-                        // Safety: cur is quote
-                        self.input_mut().bump();
-                    }
+                    self.bump(); // cur is quote
 
                     let end = self.cur_pos();
                     let raw = unsafe {
diff --git a/crates/swc_ecma_parser/src/lexer/table.rs b/crates/swc_ecma_parser/src/lexer/table.rs
index 37e33537afbb..066d580fbe83 100644
--- a/crates/swc_ecma_parser/src/lexer/table.rs
+++ b/crates/swc_ecma_parser/src/lexer/table.rs
@@ -48,10 +48,7 @@ const ERR: ByteHandler = |lexer| {
     };
 
     let start = lexer.cur_pos();
-    unsafe {
-        // Safety: Byte handler is only called for non-last characters
-        lexer.input.bump();
-    }
+    lexer.bump();
     lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })?
 };
 
@@ -315,10 +312,7 @@ const UNI: ByteHandler = |lexer| {
     }
 
     let start = lexer.cur_pos();
-    unsafe {
-        // Safety: Byte handler is only called for non-last characters
-        lexer.input.bump();
-    }
+    lexer.bump();
     lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })?
 };
 
diff --git a/crates/swc_html_parser/src/lexer/mod.rs b/crates/swc_html_parser/src/lexer/mod.rs
index 9b2812edd93d..552d2c6934ab 100644
--- a/crates/swc_html_parser/src/lexer/mod.rs
+++ b/crates/swc_html_parser/src/lexer/mod.rs
@@ -159,10 +159,8 @@ where
         // A leading Byte Order Mark (BOM) causes the character encoding argument to be
         // ignored and will itself be skipped.
         if lexer.input.is_at_start() && lexer.input.cur_as_char() == Some('\u{feff}') {
-            unsafe {
-                // Safety: We know that the current character is '\u{feff}'.
-                lexer.input.bump();
-            }
+            // Safety: We know that the current character is '\u{feff}' (3 bytes: EF BB BF).
+            lexer.input.bump_bytes(3);
         }
 
         lexer
@@ -254,11 +252,18 @@ where
         self.cur = self.input.cur();
         self.cur_pos = self.input.cur_pos();
 
-        if self.cur.is_some() {
-            unsafe {
-                // Safety: self.cur is Some()
-                self.input.bump();
-            }
+        if let Some(byte) = self.cur {
+            // Calculate the number of bytes in this UTF-8 character
+            let len = if byte < 0x80 {
+                1 // ASCII
+            } else if byte < 0xe0 {
+                2 // 2-byte UTF-8
+            } else if byte < 0xf0 {
+                3 // 3-byte UTF-8
+            } else {
+                4 // 4-byte UTF-8
+            };
+            self.input.bump_bytes(len);
         }
     }
 
@@ -433,10 +438,8 @@ where
             sub_buf.push(c as char);
 
             if self.input.cur() == Some(b'\n') {
-                unsafe {
-                    // Safety: cur() is Some(b'\n')
-                    self.input.bump();
-                }
+                // Safety: cur() is Some(b'\n'), which is 1 byte
+                self.input.bump_bytes(1);
 
                 sub_buf.push('\n');
             }
@@ -491,10 +494,8 @@ where
             sub_buf.push(c as char);
 
             if self.input.cur() == Some(b'\n') {
-                unsafe {
-                    // Safety: cur() is Some(b'\n')
-                    self.input.bump();
-                }
+                // Safety: cur() is Some(b'\n'), which is 1 byte
+                self.input.bump_bytes(1);
 
                 sub_buf.push('\n');
             }
@@ -531,10 +532,8 @@ where
             sub_buf.push(c as char);
 
             if self.input.cur() == Some(b'\n') {
-                unsafe {
-                    // Safety: cur() is Some(b'\n')
-                    self.input.bump();
-                }
+                // Safety: cur() is Some(b'\n'), which is 1 byte
+                self.input.bump_bytes(1);
 
                 sub_buf.push('\n');
             }
@@ -867,10 +866,8 @@ where
             sub_buf.push('\r');
 
             if self.input.cur() == Some(b'\n') {
-                unsafe {
-                    // Safety: cur() is Some(b'\n')
-                    self.input.bump();
-                }
+                // Safety: cur() is Some(b'\n'), which is 1 byte
+                self.input.bump_bytes(1);
 
                 sub_buf.push('\n');
             }
@@ -901,10 +898,8 @@ where
             sub_buf.push(c as char);
 
             if self.input.cur() == Some(b'\n') {
-                unsafe {
-                    // Safety: cur() is Some(b'\n')
-                    self.input.bump();
-                }
+                // Safety: cur() is Some(b'\n'), which is 1 byte
+                self.input.bump_bytes(1);
 
                 sub_buf.push('\n');
             }
@@ -1038,10 +1033,8 @@ where
             sub_buf.push(c as char);
 
             if self.input.cur() == Some(b'\n') {
-                unsafe {
-                    // Safety: cur() is Some(b'\n')
-                    self.input.bump();
-                }
+                // Safety: cur() is Some(b'\n'), which is 1 byte
+                self.input.bump_bytes(1);
 
                 sub_buf.push('\n');
             }
@@ -1124,10 +1117,8 @@ where
             buf.push(c as char);
 
             if self.input.cur() == Some(b'\n') {
-                unsafe {
-                    // Safety: cur() is Some(b'\n')
-                    self.input.bump();
-                }
+                // Safety: cur() is Some(b'\n'), which is 1 byte
+                self.input.bump_bytes(1);
                 buf.push('\n');
             }
 
@@ -4957,10 +4948,8 @@ where
     #[inline(always)]
     fn skip_whitespaces(&mut self, c: u8) {
         if c == b'\r' && self.input.cur() == Some(b'\n') {
-            unsafe {
-                // Safety: cur() is Some
-                self.input.bump();
-            }
+            // Safety: cur() is Some(b'\n'), which is 1 byte
+            self.input.bump_bytes(1);
         }
     }
 }

From 51a14859e119d4da5e4d619d9e1f1414378b813a Mon Sep 17 00:00:00 2001
From: DongYun Kang <kdy.1997.dev@gmail.com>
Date: Tue, 25 Nov 2025 21:38:43 +0900
Subject: [PATCH 10/20] bump_bytes with proper len

---
 crates/swc_ecma_parser/src/lexer/mod.rs       | 142 +++++++++---------
 crates/swc_ecma_parser/src/lexer/state.rs     |  16 +-
 crates/swc_ecma_parser/src/lexer/table.rs     |   4 +-
 .../swc_ecma_parser/src/lexer/whitespace.rs   |   4 +-
 4 files changed, 82 insertions(+), 84 deletions(-)

diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs
index 484ac21d321b..079ed06814f5 100644
--- a/crates/swc_ecma_parser/src/lexer/mod.rs
+++ b/crates/swc_ecma_parser/src/lexer/mod.rs
@@ -244,11 +244,11 @@ impl<'a> Lexer<'a> {
     fn read_token_plus_minus<const C: u8>(&mut self) -> LexResult<Token> {
         let start = self.cur_pos();
 
-        self.bump();
+        self.bump(1);
 
         // '++', '--'
         Ok(if self.input.cur() == Some(C) {
-            self.bump();
+            self.bump(1);
 
             // Handle -->
             if self.state.had_line_break && C == b'-' && self.eat(b'>') {
@@ -280,7 +280,7 @@ impl<'a> Lexer<'a> {
         let start = self.cur_pos();
         let had_line_break_before_last = self.had_line_break_before_last();
 
-        self.bump();
+        self.bump(1);
 
         Ok(if self.input.eat_byte(b'=') {
             // "=="
@@ -321,7 +321,7 @@ impl Lexer<'_> {
     fn read_token_lt_gt<const C: u8>(&mut self) -> LexResult<Token> {
         let had_line_break_before_last = self.had_line_break_before_last();
         let start = self.cur_pos();
-        self.bump();
+        self.bump(1);
 
         if self.syntax.typescript()
             && self.ctx.contains(Context::InType)
@@ -351,7 +351,7 @@ impl Lexer<'_> {
 
         // '<<', '>>'
         if self.cur() == Some(C) {
-            self.bump();
+            self.bump(1);
             op = if C == b'<' {
                 Token::LShift
             } else {
@@ -360,7 +360,7 @@ impl Lexer<'_> {
 
             //'>>>'
             if C == b'>' && self.cur() == Some(C) {
-                self.bump();
+                self.bump(1);
                 op = Token::ZeroFillRShift;
             }
         }
@@ -412,7 +412,7 @@ impl Lexer<'_> {
     ) -> LexResult<Token> {
         debug_assert!(self.cur() == Some(if started_with_backtick { b'`' } else { b'}' }));
         let mut cooked = Ok(Wtf8Buf::with_capacity(8));
-        self.bump(); // `}` or `\``
+        self.bump(1); // `}` or `\``
         let mut cooked_slice_start = self.cur_pos();
         let raw_slice_start = cooked_slice_start;
         let raw_atom = |this: &mut Self| {
@@ -438,7 +438,7 @@ impl Lexer<'_> {
                 consume_cooked!();
                 let cooked = cooked.map(|cooked| self.atoms.wtf8_atom(&*cooked));
                 let raw = raw_atom(self);
-                self.bump();
+                self.bump(1);
                 return Ok(if started_with_backtick {
                     self.set_token_value(Some(TokenValue::Template { raw, cooked }));
                     Token::NoSubstitutionTemplateLiteral
@@ -485,7 +485,7 @@ impl Lexer<'_> {
                 };
 
                 let c = if c == b'\r' && self.peek() == Some(b'\n') {
-                    self.bump(); // '\r'
+                    self.bump(1); // '\r'
                     '\n'
                 } else {
                     match c_char {
@@ -497,14 +497,14 @@ impl Lexer<'_> {
                     }
                 };
 
-                self.bump();
+                self.bump(c_char.len_utf8());
 
                 if let Ok(ref mut cooked) = cooked {
                     cooked.push_char(c);
                 }
                 cooked_slice_start = self.cur_pos();
             } else {
-                self.bump();
+                self.bump(1);
             }
         }
 
@@ -532,21 +532,13 @@ impl<'a> Lexer<'a> {
         Span { lo: start, hi: end }
     }
 
+    /// Advances the input by `len` bytes.
+    ///
+    /// For ASCII characters, use `bump(1)`.
+    /// For unknown character length, use `c.len_utf8()` where c is a char.
     #[inline(always)]
-    fn bump(&mut self) {
-        if let Some(byte) = self.input().cur() {
-            // Calculate the number of bytes in this UTF-8 character
-            let len = if byte < 0x80 {
-                1 // ASCII
-            } else if byte < 0xe0 {
-                2 // 2-byte UTF-8
-            } else if byte < 0xf0 {
-                3 // 3-byte UTF-8
-            } else {
-                4 // 4-byte UTF-8
-            };
-            self.input_mut().bump_bytes(len);
-        }
+    fn bump(&mut self, len: usize) {
+        self.input_mut().bump_bytes(len);
     }
 
     #[inline(always)]
@@ -929,7 +921,7 @@ impl<'a> Lexer<'a> {
                     }
 
                     // Ignore this _ character
-                    self.bump();
+                    self.bump(1);
 
                     continue;
                 }
@@ -942,7 +934,7 @@ impl<'a> Lexer<'a> {
                 return Ok(total);
             };
 
-            self.bump();
+            self.bump(1);
 
             let (t, cont) = op(total, RADIX, val)?;
 
@@ -1093,7 +1085,7 @@ impl<'a> Lexer<'a> {
         //
         // `.1.a`, `.1e-4.a` are valid,
         if has_dot {
-            self.bump();
+            self.bump(1);
 
             // equal: if START_WITH_DOT { debug_assert!(xxxx) }
             debug_assert!(!START_WITH_DOT || self.cur().is_some_and(|cur| cur.is_ascii_digit()));
@@ -1110,7 +1102,7 @@ impl<'a> Lexer<'a> {
         // 1e+2 = 100
         // 1e-2 = 0.01
         if has_e {
-            self.bump(); // `e`/`E`
+            self.bump(1); // `e`/`E`
 
             let next = match self.cur() {
                 Some(next) => next,
@@ -1121,7 +1113,7 @@ impl<'a> Lexer<'a> {
             };
 
             if next == b'+' || next == b'-' {
-                self.bump(); // remove '+', '-'
+                self.bump(1); // remove '+', '-'
             }
 
             let lazy_integer = self.read_number_no_dot_as_str::<10>()?;
@@ -1191,12 +1183,12 @@ impl<'a> Lexer<'a> {
         let start = self.cur_pos();
 
         debug_assert_eq!(self.cur(), Some(b'0'));
-        self.bump();
+        self.bump(1);
 
         debug_assert!(self
             .cur()
             .is_some_and(|c| matches!(c, b'b' | b'B' | b'o' | b'O' | b'x' | b'X')));
-        self.bump();
+        self.bump(1);
 
         let lazy_integer = self.read_number_no_dot_as_str::<RADIX>()?;
         let has_underscore = lazy_integer.has_underscore;
@@ -1287,7 +1279,7 @@ impl<'a> Lexer<'a> {
         let mut s = SmartString::<LazyCompact>::default();
 
         debug_assert!(self.input().cur().is_some_and(|c| c == b'&'));
-        self.bump();
+        self.bump(1);
 
         let start_pos = self.input().cur_pos();
 
@@ -1296,7 +1288,7 @@ impl<'a> Lexer<'a> {
                 Some(c) => c,
                 None => break,
             };
-            self.bump();
+            self.bump(1);
 
             if c == b';' {
                 if let Some(stripped) = s.strip_prefix('#') {
@@ -1332,10 +1324,10 @@ impl<'a> Lexer<'a> {
     fn read_jsx_new_line(&mut self, normalize_crlf: bool) -> LexResult<Either<&'static str, char>> {
         debug_assert!(self.syntax().jsx());
         let ch = self.input().cur_as_char().unwrap();
-        self.bump();
+        self.bump(ch.len_utf8());
 
         let out = if ch == '\r' && self.input().cur() == Some(b'\n') {
-            self.bump(); // `\n`
+            self.bump(1); // `\n`
             Either::Left(if normalize_crlf { "\n" } else { "\r\n" })
         } else {
             Either::Right(ch)
@@ -1346,7 +1338,7 @@ impl<'a> Lexer<'a> {
     fn read_jsx_str(&mut self, quote: char) -> LexResult<Token> {
         debug_assert!(self.syntax().jsx());
         let start = self.input().cur_pos();
-        self.bump(); // `quote`
+        self.bump(1); // `quote`
         let mut out = String::new();
         let mut chunk_start = self.input().cur_pos();
         loop {
@@ -1367,7 +1359,7 @@ impl<'a> Lexer<'a> {
                 out.push_str(value);
                 out.push('\\');
 
-                self.bump();
+                self.bump(1);
 
                 chunk_start = self.input().cur_pos();
 
@@ -1410,7 +1402,7 @@ impl<'a> Lexer<'a> {
 
                 chunk_start = cur_pos + BytePos(ch.len_utf8() as _);
             } else {
-                self.bump();
+                self.bump(ch.len_utf8());
             }
         }
         let s = unsafe {
@@ -1428,7 +1420,7 @@ impl<'a> Lexer<'a> {
         // it might be at the end of the file when
         // the string literal is unterminated
         if self.input().peek_ahead().is_some() {
-            self.bump();
+            self.bump(1);
         }
 
         let raw = unsafe {
@@ -1506,7 +1498,7 @@ impl<'a> Lexer<'a> {
 
         let mut is_curly = false;
 
-        self.bump(); // 'u'
+        self.bump(1); // 'u'
 
         if self.eat(b'{') {
             is_curly = true;
@@ -1589,8 +1581,8 @@ impl<'a> Lexer<'a> {
         if self.input().cur() != Some(b'#') || self.input().peek() != Some(b'!') {
             return Ok(None);
         }
-        self.bump(); // `#`
-        self.bump(); // `!`
+        self.bump(1); // `#`
+        self.bump(1); // `!`
         let s = self.input_uncons_while(|c| !c.is_line_terminator());
         Ok(Some(self.atom(s)))
     }
@@ -1603,7 +1595,7 @@ impl<'a> Lexer<'a> {
 
         let start = self.cur_pos();
 
-        self.bump(); // '\'
+        self.bump(1); // '\'
 
         let c = match self.cur_as_char() {
             Some(c) => c,
@@ -1619,21 +1611,21 @@ impl<'a> Lexer<'a> {
             'v' => '\u{000b}',
             'f' => '\u{000c}',
             '\r' => {
-                self.bump(); // remove '\r'
+                self.bump(1); // remove '\r'
 
                 self.eat(b'\n');
 
                 return Ok(None);
             }
             '\n' | '\u{2028}' | '\u{2029}' => {
-                self.bump();
+                self.bump(c.len_utf8());
 
                 return Ok(None);
             }
 
             // read hexadecimal escape sequences
             'x' => {
-                self.bump(); // 'x'
+                self.bump(1); // 'x'
 
                 match self.read_int_u32::<16>(2)? {
                     Some(val) => return Ok(CodePoint::from_u32(val)),
@@ -1656,7 +1648,7 @@ impl<'a> Lexer<'a> {
 
             // octal escape sequences
             '0'..='7' => {
-                self.bump();
+                self.bump(1);
 
                 let first_c = if c == '0' {
                     match self.cur() {
@@ -1695,7 +1687,7 @@ impl<'a> Lexer<'a> {
                                     value * 8 + v as u8
                                 };
 
-                                self.bump();
+                                self.bump(1);
                             }
                             _ => return Ok(CodePoint::from_u32(value as u32)),
                         }
@@ -1710,7 +1702,7 @@ impl<'a> Lexer<'a> {
             _ => c,
         };
 
-        self.bump();
+        self.bump(1);
 
         Ok(CodePoint::from_u32(c as u32))
     }
@@ -1726,7 +1718,7 @@ impl<'a> Lexer<'a> {
 
         let start = self.cur_pos();
 
-        self.bump(); // bump '/'
+        self.bump(1); // bump '/'
 
         let slice_start = self.cur_pos();
 
@@ -1758,7 +1750,7 @@ impl<'a> Lexer<'a> {
                 escaped = c == b'\\';
             }
 
-            self.bump();
+            self.bump(1);
         }
 
         let content = {
@@ -1776,7 +1768,7 @@ impl<'a> Lexer<'a> {
             ));
         }
 
-        self.bump(); // '/'
+        self.bump(1); // '/'
 
         // Spec says "It is a Syntax Error if IdentifierPart contains a Unicode escape
         // sequence." TODO: check for escape
@@ -1806,7 +1798,7 @@ impl<'a> Lexer<'a> {
         if let Some(c) = self.input().cur_as_ascii() {
             if Ident::is_valid_ascii_start(c) {
                 // Advance past first byte
-                self.bump();
+                self.bump(1);
 
                 // Use byte_search to quickly scan to end of ASCII identifier
                 let next_byte = byte_search! {
@@ -1861,10 +1853,10 @@ impl<'a> Lexer<'a> {
         loop {
             if let Some(c) = self.input().cur_as_ascii() {
                 if Ident::is_valid_ascii_continue(c) {
-                    self.bump();
+                    self.bump(1);
                     continue;
                 } else if first && Ident::is_valid_ascii_start(c) {
-                    self.bump();
+                    self.bump(1);
                     first = false;
                     continue;
                 }
@@ -1874,7 +1866,7 @@ impl<'a> Lexer<'a> {
                     first = false;
                     has_escape = true;
                     let start = self.cur_pos();
-                    self.bump();
+                    self.bump(1);
 
                     if !self.is(b'u') {
                         self.error_span(pos_span(start), SyntaxError::ExpectedUnicodeEscape)?
@@ -1926,10 +1918,10 @@ impl<'a> Lexer<'a> {
                 break;
             } else if let Some(c) = self.input().cur_as_char() {
                 if Ident::is_valid_non_ascii_continue(c) {
-                    self.bump();
+                    self.bump(c.len_utf8());
                     continue;
                 } else if first && Ident::is_valid_non_ascii_start(c) {
-                    self.bump();
+                    self.bump(c.len_utf8());
                     first = false;
                     continue;
                 }
@@ -1959,7 +1951,7 @@ impl<'a> Lexer<'a> {
     fn read_token_number_sign(&mut self) -> LexResult<Token> {
         debug_assert!(self.cur().is_some_and(|c| c == b'#'));
 
-        self.bump(); // '#'
+        self.bump(1); // '#'
 
         // `#` can also be a part of shebangs, however they should have been
         // handled by `read_shebang()`
@@ -1979,7 +1971,7 @@ impl<'a> Lexer<'a> {
         let next = match self.input().peek() {
             Some(next) => next,
             None => {
-                self.bump(); // '.'
+                self.bump(1); // '.'
                 return Ok(Token::Dot);
             }
         };
@@ -1990,11 +1982,11 @@ impl<'a> Lexer<'a> {
             });
         }
 
-        self.bump(); // 1st `.`
+        self.bump(1); // 1st `.`
 
         if next == b'.' && self.input().peek() == Some(b'.') {
-            self.bump(); // 2nd `.`
-            self.bump(); // 3rd `.`
+            self.bump(1); // 2nd `.`
+            self.bump(1); // 3rd `.`
 
             return Ok(Token::DotDotDot);
         }
@@ -2007,7 +1999,7 @@ impl<'a> Lexer<'a> {
     /// This is extracted as a method to reduce size of `read_token`.
     fn read_token_question_mark(&mut self) -> LexResult<Token> {
         debug_assert!(self.cur().is_some_and(|c| c == b'?'));
-        self.bump();
+        self.bump(1);
         if self.input_mut().eat_byte(b'?') {
             if self.input_mut().eat_byte(b'=') {
                 Ok(Token::NullishEq)
@@ -2024,7 +2016,7 @@ impl<'a> Lexer<'a> {
     /// This is extracted as a method to reduce size of `read_token`.
     fn read_token_colon(&mut self) -> LexResult<Token> {
         debug_assert!(self.cur().is_some_and(|c| c == b':'));
-        self.bump(); // ':'
+        self.bump(1); // ':'
         Ok(Token::Colon)
     }
 
@@ -2062,7 +2054,7 @@ impl<'a> Lexer<'a> {
         let had_line_break_before_last = self.had_line_break_before_last();
         let start = self.cur_pos();
 
-        self.bump();
+        self.bump(1);
         let token = if is_bit_and {
             Token::Ampersand
         } else {
@@ -2081,10 +2073,10 @@ impl<'a> Lexer<'a> {
 
         // '||', '&&'
         if self.input().cur() == Some(C) {
-            self.bump();
+            self.bump(1);
 
             if self.input().cur() == Some(b'=') {
-                self.bump();
+                self.bump(1);
 
                 return Ok(if is_bit_and {
                     Token::LogicalAndEq
@@ -2120,7 +2112,7 @@ impl<'a> Lexer<'a> {
     /// This is extracted as a method to reduce size of `read_token`.
     fn read_token_mul_mod<const IS_MUL: bool>(&mut self) -> LexResult<Token> {
         debug_assert!(self.cur().is_some_and(|c| c == b'*' || c == b'%'));
-        self.bump();
+        self.bump(1);
         let token = if IS_MUL {
             if self.input_mut().eat_byte(b'*') {
                 // `**`
@@ -2148,7 +2140,7 @@ impl<'a> Lexer<'a> {
 
     fn read_slash(&mut self) -> LexResult<Token> {
         debug_assert_eq!(self.cur(), Some(b'/'));
-        self.bump(); // '/'
+        self.bump(1); // '/'
         Ok(if self.eat(b'=') {
             Token::DivEq
         } else {
@@ -2179,7 +2171,7 @@ impl<'a> Lexer<'a> {
         let start = self.cur_pos();
         let quote = self.cur().unwrap();
 
-        self.bump(); // '"' or '\''
+        self.bump(1); // '"' or '\''
 
         let mut slice_start = self.input().cur_pos();
 
@@ -2231,7 +2223,7 @@ impl<'a> Lexer<'a> {
                         self.wtf8_atom(Wtf8::from_str(s))
                     };
 
-                    self.bump(); // cur is quote
+                    self.bump(1); // cur is quote
 
                     let end = self.cur_pos();
                     let raw = unsafe {
@@ -2286,7 +2278,7 @@ impl<'a> Lexer<'a> {
                         self,
                     ));
                 }
-                _ => self.bump(),
+                _ => self.bump(1),
             }
         }
     }
@@ -2324,7 +2316,7 @@ impl<'a> Lexer<'a> {
         // Fast path: try to scan ASCII identifier using byte_search
         // Performance optimization: check if first char disqualifies as keyword
         // Advance past first byte
-        self.bump();
+        self.bump(1);
 
         // Use byte_search to quickly scan to end of ASCII identifier
         let next_byte = byte_search! {
diff --git a/crates/swc_ecma_parser/src/lexer/state.rs b/crates/swc_ecma_parser/src/lexer/state.rs
index e52ebdd07227..dd02ff45f464 100644
--- a/crates/swc_ecma_parser/src/lexer/state.rs
+++ b/crates/swc_ecma_parser/src/lexer/state.rs
@@ -248,7 +248,7 @@ impl crate::input::Tokens for Lexer<'_> {
         while let Some(ch) = self.input().cur() {
             if ch == b'-' {
                 v.push(ch as char);
-                self.bump();
+                self.bump(1);
             } else {
                 let old_pos = self.cur_pos();
                 v.push_str(&self.scan_identifier_parts());
@@ -464,7 +464,13 @@ impl Lexer<'_> {
                     chunk_start = self.input.cur_pos();
                 }
             } else {
-                self.bump();
+                let len = if ch < 0x80 {
+                    1 // ASCII
+                } else {
+                    // For multi-byte UTF-8, get the full character
+                    self.input().cur_as_char().unwrap().len_utf8()
+                };
+                self.bump(len);
             }
         }
 
@@ -514,12 +520,12 @@ impl Lexer<'_> {
                     v.push(ch as char);
                     self.input_mut().bump_bytes(1);
                 } else if ch == b'\\' {
-                    self.bump(); // bump '\'
+                    self.bump(1); // bump '\'
                     if !self.is(b'u') {
                         self.emit_error(self.cur_pos(), SyntaxError::InvalidUnicodeEscape);
                         continue;
                     }
-                    self.bump(); // bump 'u'
+                    self.bump(1); // bump 'u'
                     let Ok(value) = self.read_unicode_escape() else {
                         self.emit_error(self.cur_pos(), SyntaxError::InvalidUnicodeEscape);
                         break;
@@ -538,7 +544,7 @@ impl Lexer<'_> {
                 if let Some(c) = self.input().cur_as_char() {
                     if c.is_ident_part() {
                         v.push(c);
-                        self.bump();
+                        self.bump(c.len_utf8());
                     } else {
                         break;
                     }
diff --git a/crates/swc_ecma_parser/src/lexer/table.rs b/crates/swc_ecma_parser/src/lexer/table.rs
index 066d580fbe83..76e8e95f2dfb 100644
--- a/crates/swc_ecma_parser/src/lexer/table.rs
+++ b/crates/swc_ecma_parser/src/lexer/table.rs
@@ -48,7 +48,7 @@ const ERR: ByteHandler = |lexer| {
     };
 
     let start = lexer.cur_pos();
-    lexer.bump();
+    lexer.bump(1);
     lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })?
 };
 
@@ -312,7 +312,7 @@ const UNI: ByteHandler = |lexer| {
     }
 
     let start = lexer.cur_pos();
-    lexer.bump();
+    lexer.bump(1);
     lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })?
 };
 
diff --git a/crates/swc_ecma_parser/src/lexer/whitespace.rs b/crates/swc_ecma_parser/src/lexer/whitespace.rs
index e423fc7457ed..46cd7319ec73 100644
--- a/crates/swc_ecma_parser/src/lexer/whitespace.rs
+++ b/crates/swc_ecma_parser/src/lexer/whitespace.rs
@@ -133,10 +133,10 @@ const UNI: ByteHandler = |lexer| {
     };
 
     if is_irregular_whitespace(c) {
-        lexer.bump();
+        lexer.bump(c.len_utf8());
         true
     } else if is_irregular_line_terminator(c) {
-        lexer.bump();
+        lexer.bump(c.len_utf8());
         lexer.state.mark_had_line_break();
         true
     } else {

From 49dce8a6cb88ed68830231850627a6e2e8f3da2b Mon Sep 17 00:00:00 2001
From: DongYun Kang <kdy.1997.dev@gmail.com>
Date: Wed, 26 Nov 2025 17:46:43 +0900
Subject: [PATCH 11/20] swc_html_parser

---
 crates/swc_html_parser/src/lexer/mod.rs | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/crates/swc_html_parser/src/lexer/mod.rs b/crates/swc_html_parser/src/lexer/mod.rs
index 552d2c6934ab..f4b9facfa1ba 100644
--- a/crates/swc_html_parser/src/lexer/mod.rs
+++ b/crates/swc_html_parser/src/lexer/mod.rs
@@ -248,21 +248,11 @@ where
     }
 
     #[inline(always)]
-    fn consume(&mut self) {
+    fn consume(&mut self, len: usize) {
         self.cur = self.input.cur();
         self.cur_pos = self.input.cur_pos();
 
-        if let Some(byte) = self.cur {
-            // Calculate the number of bytes in this UTF-8 character
-            let len = if byte < 0x80 {
-                1 // ASCII
-            } else if byte < 0xe0 {
-                2 // 2-byte UTF-8
-            } else if byte < 0xf0 {
-                3 // 3-byte UTF-8
-            } else {
-                4 // 4-byte UTF-8
-            };
+        if self.cur.is_some() {
             self.input.bump_bytes(len);
         }
     }
@@ -291,17 +281,20 @@ where
         let c = self.next();
 
         // Store the full UTF-8 character before consuming (for helper functions)
-        if let Some(byte) = c {
+        let len = if let Some(byte) = c {
             if is_non_ascii(byte) {
                 self.current_char = self.input.cur_as_char();
+                self.current_char.map(|c| c.len_utf8()).unwrap_or(1)
             } else {
                 self.current_char = Some(byte as char);
+                1
             }
         } else {
             self.current_char = None;
-        }
+            1
+        };
 
-        self.consume();
+        self.consume(len);
 
         c
     }

From 68bf6d906b9b4d7e0642ea279930fd732fe34630 Mon Sep 17 00:00:00 2001
From: DongYun Kang <kdy.1997.dev@gmail.com>
Date: Wed, 26 Nov 2025 17:59:08 +0900
Subject: [PATCH 12/20] consume(len)

---
 crates/swc_css_parser/src/lexer/mod.rs | 160 +++++++++++++++++--------
 1 file changed, 111 insertions(+), 49 deletions(-)

diff --git a/crates/swc_css_parser/src/lexer/mod.rs b/crates/swc_css_parser/src/lexer/mod.rs
index 53376b2590f2..33960362f90f 100644
--- a/crates/swc_css_parser/src/lexer/mod.rs
+++ b/crates/swc_css_parser/src/lexer/mod.rs
@@ -214,23 +214,13 @@ where
     }
 
     #[inline(always)]
-    fn consume(&mut self) -> Option<u8> {
+    fn consume(&mut self, len: usize) -> Option<u8> {
         let cur = self.input.cur();
 
         self.cur = cur;
         self.cur_pos = self.input.last_pos();
 
-        if let Some(byte) = cur {
-            // Calculate the number of bytes in this UTF-8 character
-            let len = if byte < 0x80 {
-                1 // ASCII
-            } else if byte < 0xe0 {
-                2 // 2-byte UTF-8
-            } else if byte < 0xf0 {
-                3 // 3-byte UTF-8
-            } else {
-                4 // 4-byte UTF-8
-            };
+        if cur.is_some() {
             self.input.bump_bytes(len);
         }
 
@@ -265,7 +255,16 @@ where
         }
 
         // Consume the next input code point.
-        match self.consume() {
+        let byte_len = if let Some(b) = self.input.cur() {
+            if b < 0x80 {
+                1 // ASCII
+            } else {
+                self.input.cur_as_char().map(|c| c.len_utf8()).unwrap_or(1)
+            }
+        } else {
+            1
+        };
+        match self.consume(byte_len) {
             // whitespace
             // Consume as much whitespace as possible. Return a <whitespace-token>.
             Some(c) if is_whitespace(c) => self.with_buf(|l, buf| {
@@ -276,7 +275,7 @@ where
 
                     match c {
                         Some(c) if is_whitespace(c) => {
-                            l.consume();
+                            l.consume(1);
 
                             buf.push(c as char);
                         }
@@ -364,8 +363,8 @@ where
                 // Otherwise, if the next 2 input code points are U+002D HYPHEN-MINUS U+003E
                 // GREATER-THAN SIGN (->), consume them and return a <CDC-token>.
                 else if self.next() == Some(b'-') && self.next_next() == Some(b'>') {
-                    self.consume();
-                    self.consume();
+                    self.consume(1); // -
+                    self.consume(1); // >
 
                     return Ok(Token::CDC);
                 }
@@ -410,9 +409,9 @@ where
                     && self.next_next() == Some(b'-')
                     && self.next_next_next() == Some(b'-')
                 {
-                    self.consume(); // !
-                    self.consume(); // -
-                    self.consume(); // -
+                    self.consume(1); // !
+                    self.consume(1); // -
+                    self.consume(1); // -
 
                     return Ok(tok!("<!--"));
                 }
@@ -511,13 +510,22 @@ where
             let cmt_start = self.input.last_pos();
 
             while self.next() == Some(b'/') && self.next_next() == Some(b'*') {
-                self.consume(); // '*'
-                self.consume(); // '/'
+                self.consume(1); // '/'
+                self.consume(1); // '*'
 
                 loop {
-                    match self.consume() {
+                    let byte_len = if let Some(b) = self.input.cur() {
+                        if b < 0x80 {
+                            1 // ASCII
+                        } else {
+                            self.input.cur_as_char().map(|c| c.len_utf8()).unwrap_or(1)
+                        }
+                    } else {
+                        1
+                    };
+                    match self.consume(byte_len) {
                         Some(b'*') if self.next() == Some(b'/') => {
-                            self.consume(); // '/'
+                            self.consume(1); // '/'
 
                             if self.comments.is_some() {
                                 let last_pos = self.input.last_pos();
@@ -553,13 +561,22 @@ where
             && self.next_next() == Some(b'/')
         {
             while self.next() == Some(b'/') && self.next_next() == Some(b'/') {
-                self.consume(); // '/'
-                self.consume(); // '/'
+                self.consume(1); // '/'
+                self.consume(1); // '/'
 
                 let start_of_content = self.input.last_pos();
 
                 loop {
-                    match self.consume() {
+                    let byte_len = if let Some(b) = self.input.cur() {
+                        if b < 0x80 {
+                            1 // ASCII
+                        } else {
+                            self.input.cur_as_char().map(|c| c.len_utf8()).unwrap_or(1)
+                        }
+                    } else {
+                        1
+                    };
+                    match self.consume(byte_len) {
                         Some(c) if is_newline(c) => {
                             if self.comments.is_some() {
                                 let last_pos = self.input.last_pos();
@@ -618,7 +635,7 @@ where
         // Otherwise, if the next input code point is U+0025 PERCENTAGE SIGN (%), consume it. Create
         // a <percentage-token> with the same value as number, and return it.
         else if next_first == Some(b'%') {
-            self.consume();
+            self.consume(1);
 
             return Ok(Token::Percentage {
                 value: number.0,
@@ -645,7 +662,7 @@ where
         // If string's value is an ASCII case-insensitive match for "url", and the next
         // input code point is U+0028 LEFT PARENTHESIS ((), consume it.
         if matches_eq_ignore_ascii_case!(ident_sequence.0, "url") && self.next() == Some(b'(') {
-            self.consume();
+            self.consume(1);
 
             let start_whitespace = self.input.last_pos();
 
@@ -654,7 +671,7 @@ where
             let whitespaces = self.with_buf(|l, buf| {
                 while let (Some(next), Some(next_next)) = (l.next(), l.next_next()) {
                     if is_whitespace(next) && is_whitespace(next_next) {
-                        l.consume();
+                        l.consume(1);
 
                         buf.push(next as char);
                     } else {
@@ -698,7 +715,7 @@ where
         // Otherwise, if the next input code point is U+0028 LEFT PARENTHESIS ((), consume it.
         // Create a <function-token> with its value set to string and return it.
         else if self.next() == Some(b'(') {
-            self.consume();
+            self.consume(1);
 
             return Ok(Token::Function {
                 value: ident_sequence.0,
@@ -741,8 +758,17 @@ where
                 } else {
                     None
                 };
+                let byte_len = if let Some(b) = cur_byte {
+                    if b < 0x80 {
+                        1 // ASCII
+                    } else {
+                        cur_char.map(|c| c.len_utf8()).unwrap_or(1)
+                    }
+                } else {
+                    1
+                };
 
-                match l.consume() {
+                match l.consume(byte_len) {
                     // ending code point
                     // Return the <string-token>.
                     Some(c) if c == ending_code_point.unwrap() => {
@@ -784,7 +810,7 @@ where
                         }
                         // Otherwise, if the next input code point is a newline, consume it.
                         else if l.next().is_some() && is_newline(l.next().unwrap()) {
-                            l.consume();
+                            l.consume(1);
 
                             raw.push(c as char);
                             raw.push(next.unwrap() as char);
@@ -835,7 +861,7 @@ where
                     } else {
                         c as char
                     };
-                    l.consume();
+                    l.consume(1);
 
                     raw.push(ch);
                 } else {
@@ -856,8 +882,17 @@ where
                 } else {
                     None
                 };
+                let byte_len = if let Some(b) = cur_byte {
+                    if b < 0x80 {
+                        1 // ASCII
+                    } else {
+                        cur_char.map(|c| c.len_utf8()).unwrap_or(1)
+                    }
+                } else {
+                    1
+                };
 
-                match l.consume() {
+                match l.consume(byte_len) {
                     // U+0029 RIGHT PARENTHESIS ())
                     // Return the <url-token>.
                     Some(b')') => {
@@ -894,7 +929,7 @@ where
                                     } else {
                                         c as char
                                     };
-                                    l.consume();
+                                    l.consume(1);
 
                                     buf.push(ch);
                                 } else {
@@ -910,7 +945,7 @@ where
                         // encountered, this is a parse error);
                         match l.next() {
                             Some(b')') => {
-                                l.consume();
+                                l.consume(1);
 
                                 raw.push_str(&whitespaces);
 
@@ -1023,9 +1058,18 @@ where
             } else {
                 None
             };
+            let byte_len = if let Some(b) = cur_byte {
+                if b < 0x80 {
+                    1 // ASCII
+                } else {
+                    cur_char.map(|c| c.len_utf8()).unwrap_or(1)
+                }
+            } else {
+                1
+            };
 
             // Consume the next input code point.
-            match l.consume() {
+            match l.consume(byte_len) {
                 // hex digit
                 Some(c) if is_hex_digit(c) => {
                     let mut hex = (c as char).to_digit(16).unwrap();
@@ -1041,7 +1085,7 @@ where
                             None => break,
                         };
 
-                        l.consume();
+                        l.consume(1);
 
                         buf.push(next.unwrap() as char);
                         hex = hex * 16 + digit;
@@ -1052,7 +1096,7 @@ where
 
                     if let Some(next) = next {
                         if is_whitespace(next) {
-                            l.consume();
+                            l.consume(1);
 
                             buf.push(next as char);
                         }
@@ -1251,8 +1295,17 @@ where
                 } else {
                     None
                 };
+                let byte_len = if let Some(b) = cur_byte {
+                    if b < 0x80 {
+                        1 // ASCII
+                    } else {
+                        cur_char.map(|c| c.len_utf8()).unwrap_or(1)
+                    }
+                } else {
+                    1
+                };
 
-                let c = l.consume();
+                let c = l.consume(byte_len);
 
                 match c {
                     // name code point
@@ -1299,7 +1352,7 @@ where
             let next = l.next();
 
             if next == Some(b'+') || next == Some(b'-') {
-                l.consume();
+                l.consume(1);
 
                 out.push(next.unwrap() as char);
             }
@@ -1307,7 +1360,7 @@ where
             // While the next input code point is a digit, consume it and append it to repr.
             while let Some(c) = l.next() {
                 if c.is_ascii_digit() {
-                    l.consume();
+                    l.consume(1);
 
                     out.push(c as char);
                 } else {
@@ -1323,8 +1376,8 @@ where
                 if let Some(n) = l.next_next() {
                     if n.is_ascii_digit() {
                         // Consume them.
-                        l.consume();
-                        l.consume();
+                        l.consume(1);
+                        l.consume(1);
 
                         // Append them to repr.
                         out.push(next.unwrap() as char);
@@ -1337,7 +1390,7 @@ where
                         // repr.
                         while let Some(c) = l.next() {
                             if c.is_ascii_digit() {
-                                l.consume();
+                                l.consume(1);
 
                                 out.push(c as char);
                             } else {
@@ -1364,8 +1417,8 @@ where
                     || next_next.is_some() && next_next.unwrap().is_ascii_digit()
                 {
                     // Consume them.
-                    l.consume();
-                    l.consume();
+                    l.consume(1);
+                    l.consume(1);
 
                     // Append them to repr.
                     out.push(next.unwrap() as char);
@@ -1378,7 +1431,7 @@ where
                     // to repr.
                     while let Some(c) = l.next() {
                         if c.is_ascii_digit() {
-                            l.consume();
+                            l.consume(1);
 
                             out.push(c as char);
                         } else {
@@ -1422,8 +1475,17 @@ where
                 } else {
                     None
                 };
+                let byte_len = if let Some(b) = cur_byte {
+                    if b < 0x80 {
+                        1 // ASCII
+                    } else {
+                        cur_char.map(|c| c.len_utf8()).unwrap_or(1)
+                    }
+                } else {
+                    1
+                };
 
-                match l.consume() {
+                match l.consume(byte_len) {
                     // U+0029 RIGHT PARENTHESIS ())
                     // EOF
                     // Return.

From 7b53e3a1a8deb65cd40bc61a6567f985dec7803d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Donny/=EA=B0=95=EB=8F=99=EC=9C=A4?= <kdy.1997.dev@gmail.com>
Date: Wed, 26 Nov 2025 18:00:17 +0900
Subject: [PATCH 13/20] Create eleven-carrots-raise.md

---
 .changeset/eleven-carrots-raise.md | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 .changeset/eleven-carrots-raise.md

diff --git a/.changeset/eleven-carrots-raise.md b/.changeset/eleven-carrots-raise.md
new file mode 100644
index 000000000000..cc405f249063
--- /dev/null
+++ b/.changeset/eleven-carrots-raise.md
@@ -0,0 +1,5 @@
+---
+swc_common: major
+---
+
+perf(parser): Make all parsers work by byte instead of char 

From f6e3949b3a06c441c9567383ddbce1fd302147d9 Mon Sep 17 00:00:00 2001
From: DongYun Kang <kdy.1997.dev@gmail.com>
Date: Wed, 26 Nov 2025 18:10:40 +0900
Subject: [PATCH 14/20] Patch input

---
 crates/swc_common/src/input.rs | 69 ++++++----------------------------
 1 file changed, 11 insertions(+), 58 deletions(-)

diff --git a/crates/swc_common/src/input.rs b/crates/swc_common/src/input.rs
index 65a8986a5112..56f6bb8418b4 100644
--- a/crates/swc_common/src/input.rs
+++ b/crates/swc_common/src/input.rs
@@ -1,7 +1,5 @@
 use std::str;
 
-use debug_unreachable::debug_unreachable;
-
 use crate::syntax_pos::{BytePos, SourceFile};
 
 pub type SourceFileInput<'a> = StringInput<'a>;
@@ -73,25 +71,6 @@ impl<'a> StringInput<'a> {
         ret
     }
 
-    #[inline]
-    pub fn bump_bytes(&mut self, n: usize) {
-        debug_assert!(n <= self.remaining.len());
-        self.remaining = unsafe { self.remaining.get_unchecked(n..) };
-        self.last_pos.0 += n as u32;
-    }
-
-    #[inline]
-    pub fn bump_one(&mut self) {
-        if !self.remaining.is_empty() {
-            self.remaining = unsafe { self.remaining.get_unchecked(1..) };
-            self.last_pos.0 += 1;
-        } else {
-            unsafe {
-                debug_unreachable!("bump should not be called when cur() == None");
-            }
-        }
-    }
-
     pub fn start_pos(&self) -> BytePos {
         self.orig_start
     }
@@ -130,33 +109,7 @@ impl<'a> Input<'a> for StringInput<'a> {
     }
 
     #[inline]
-    unsafe fn bump(&mut self) {
-        let bytes = self.remaining.as_bytes();
-        if bytes.is_empty() {
-            unsafe {
-                debug_unreachable!("bump should not be called when cur() == None");
-            }
-        }
-
-        let first_byte = unsafe { *bytes.get_unchecked(0) };
-
-        // Calculate the number of bytes in this UTF-8 character
-        let len = if first_byte < 0x80 {
-            1 // ASCII
-        } else if first_byte < 0xe0 {
-            2 // 2-byte UTF-8
-        } else if first_byte < 0xf0 {
-            3 // 3-byte UTF-8
-        } else {
-            4 // 4-byte UTF-8
-        };
-
-        self.remaining = unsafe { self.remaining.get_unchecked(len..) };
-        self.last_pos = self.last_pos + BytePos(len as u32);
-    }
-
-    #[inline]
-    fn bump_bytes(&mut self, n: usize) {
+    unsafe fn bump_bytes(&mut self, n: usize) {
         debug_assert!(n <= self.remaining.len());
         self.remaining = unsafe { self.remaining.get_unchecked(n..) };
         self.last_pos.0 += n as u32;
@@ -288,15 +241,15 @@ pub trait Input<'a>: Clone {
     /// Returns the byte after the next byte without consuming anything.
     fn peek_ahead(&self) -> Option<u8>;
 
-    /// # Safety
-    ///
-    /// This should be called only when `cur()` returns `Some`. i.e.
-    /// when the Input is not empty.
-    unsafe fn bump(&mut self);
-
     /// Advances the input by exactly `n` bytes.
     /// Unlike `bump()`, this does not calculate UTF-8 character boundaries.
-    fn bump_bytes(&mut self, n: usize);
+    ///
+    /// # Safety
+    ///
+    /// - This should be called only when `cur()` returns `Some`. i.e. when the
+    ///   Input is not empty.
+    /// - `n` should be the number of bytes of the current character.
+    unsafe fn bump_bytes(&mut self, n: usize);
 
     /// Returns the current byte as ASCII if it's valid ASCII (0x00-0x7F).
     /// Returns [None] if it's end of input or if the byte is not ASCII.
@@ -356,7 +309,7 @@ pub trait Input<'a>: Clone {
         if self.is_byte(c) {
             unsafe {
                 // Safety: We are sure that the input is not empty
-                self.bump();
+                self.bump_bytes(1);
             }
             true
         } else {
@@ -419,13 +372,13 @@ mod tests {
             assert_eq!(i.cur(), Some(b'/'));
 
             unsafe {
-                i.bump();
+                i.bump_bytes(1);
             }
             assert_eq!(i.last_pos, BytePos(5));
             assert_eq!(i.cur(), Some(b'd'));
 
             unsafe {
-                i.bump();
+                i.bump_bytes(1);
             }
             assert_eq!(i.last_pos, BytePos(6));
             assert_eq!(i.cur(), None);

From 8dc09ec105690098720b1938e4e0164d38b550aa Mon Sep 17 00:00:00 2001
From: DongYun Kang <kdy.1997.dev@gmail.com>
Date: Wed, 26 Nov 2025 18:10:47 +0900
Subject: [PATCH 15/20] fix some

---
 crates/swc_ecma_lexer/src/common/lexer/mod.rs | 8 --------
 crates/swc_html_parser/src/lexer/mod.rs       | 6 ++++--
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/crates/swc_ecma_lexer/src/common/lexer/mod.rs b/crates/swc_ecma_lexer/src/common/lexer/mod.rs
index 3bc438454779..f8a39102059f 100644
--- a/crates/swc_ecma_lexer/src/common/lexer/mod.rs
+++ b/crates/swc_ecma_lexer/src/common/lexer/mod.rs
@@ -151,14 +151,6 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
         Span { lo: start, hi: end }
     }
 
-    #[inline(always)]
-    fn bump(&mut self) {
-        unsafe {
-            // Safety: Actually this is not safe but this is an internal method.
-            self.input_mut().bump()
-        }
-    }
-
     #[inline(always)]
     fn is(&self, c: u8) -> bool {
         self.input().is_byte(c)
diff --git a/crates/swc_html_parser/src/lexer/mod.rs b/crates/swc_html_parser/src/lexer/mod.rs
index f4b9facfa1ba..6db611dc0250 100644
--- a/crates/swc_html_parser/src/lexer/mod.rs
+++ b/crates/swc_html_parser/src/lexer/mod.rs
@@ -159,8 +159,10 @@ where
         // A leading Byte Order Mark (BOM) causes the character encoding argument to be
         // ignored and will itself be skipped.
         if lexer.input.is_at_start() && lexer.input.cur_as_char() == Some('\u{feff}') {
-            // Safety: We know that the current character is '\u{feff}' (3 bytes: EF BB BF).
-            lexer.input.bump_bytes(3);
+            unsafe {
+                // Safety: We know that the current character is '\u{feff}' (3 bytes: EF BB BF).
+                lexer.input.bump_bytes(3);
+            }
         }
 
         lexer

From 5812a90e96b10faf05286651fae1ee3ebd0edfc5 Mon Sep 17 00:00:00 2001
From: DongYun Kang <kdy.1997.dev@gmail.com>
Date: Wed, 26 Nov 2025 18:18:04 +0900
Subject: [PATCH 16/20] fix

---
 crates/swc_css_parser/src/lexer/mod.rs        |  4 +-
 crates/swc_ecma_parser/src/lexer/mod.rs       | 20 +++++--
 crates/swc_ecma_parser/src/lexer/search.rs    |  8 ++-
 crates/swc_ecma_parser/src/lexer/state.rs     |  4 +-
 crates/swc_ecma_parser/src/lexer/table.rs     | 12 +++--
 .../swc_ecma_parser/src/lexer/whitespace.rs   |  2 +
 crates/swc_html_parser/src/lexer/mod.rs       | 52 +++++++++++++------
 crates/swc_xml_parser/src/lexer/mod.rs        | 16 +++---
 8 files changed, 81 insertions(+), 37 deletions(-)

diff --git a/crates/swc_css_parser/src/lexer/mod.rs b/crates/swc_css_parser/src/lexer/mod.rs
index 33960362f90f..3c906fdb6f98 100644
--- a/crates/swc_css_parser/src/lexer/mod.rs
+++ b/crates/swc_css_parser/src/lexer/mod.rs
@@ -221,7 +221,9 @@ where
         self.cur_pos = self.input.last_pos();
 
         if cur.is_some() {
-            self.input.bump_bytes(len);
+            unsafe {
+                self.input.bump_bytes(len);
+            }
         }
 
         cur
diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs
index 079ed06814f5..28249cba6bf3 100644
--- a/crates/swc_ecma_parser/src/lexer/mod.rs
+++ b/crates/swc_ecma_parser/src/lexer/mod.rs
@@ -450,7 +450,9 @@ impl Lexer<'_> {
                 consume_cooked!();
                 let cooked = cooked.map(|cooked| self.atoms.wtf8_atom(&*cooked));
                 let raw = raw_atom(self);
-                self.input.bump_bytes(2);
+                unsafe {
+                    self.input.bump_bytes(2);
+                }
                 return Ok(if started_with_backtick {
                     self.set_token_value(Some(TokenValue::Template { raw, cooked }));
                     Token::TemplateHead
@@ -538,7 +540,9 @@ impl<'a> Lexer<'a> {
     /// For unknown character length, use `c.len_utf8()` where c is a char.
     #[inline(always)]
     fn bump(&mut self, len: usize) {
-        self.input_mut().bump_bytes(len);
+        unsafe {
+            self.input_mut().bump_bytes(len);
+        }
     }
 
     #[inline(always)]
@@ -642,7 +646,9 @@ impl<'a> Lexer<'a> {
     fn skip_line_comment(&mut self, start_skip: usize) {
         // Position after the initial `//` (or similar)
         let start = self.cur_pos();
-        self.input_mut().bump_bytes(start_skip);
+        unsafe {
+            self.input_mut().bump_bytes(start_skip);
+        }
         let slice_start = self.cur_pos();
 
         // foo // comment for foo
@@ -755,7 +761,9 @@ impl<'a> Lexer<'a> {
         debug_assert_eq!(self.peek(), Some(b'*'));
 
         // Consume initial "/*"
-        self.input_mut().bump_bytes(2);
+        unsafe {
+            self.input_mut().bump_bytes(2);
+        }
 
         // jsdoc
         let slice_start = self.cur_pos();
@@ -1468,7 +1476,9 @@ impl<'a> Lexer<'a> {
         let before_second = self.input().cur_pos();
 
         // Bump `\u`
-        self.input_mut().bump_bytes(2);
+        unsafe {
+            self.input_mut().bump_bytes(2);
+        }
 
         let Some(low) = self.read_int_u32::<16>(4)? else {
             return Ok(None);
diff --git a/crates/swc_ecma_parser/src/lexer/search.rs b/crates/swc_ecma_parser/src/lexer/search.rs
index 711c7998fb96..7edcacf0cdde 100644
--- a/crates/swc_ecma_parser/src/lexer/search.rs
+++ b/crates/swc_ecma_parser/src/lexer/search.rs
@@ -134,7 +134,9 @@ macro_rules! byte_search {
 
                     // We don't find a matched byte in the remaining,
                     // which also means we have reached the end of the input.
-                    $lexer.input_mut().bump_bytes(len);
+                    unsafe {
+                        $lexer.input_mut().bump_bytes(len);
+                    }
                     $eof_handler
                 }
             };
@@ -149,7 +151,9 @@ macro_rules! byte_search {
             break $byte;
         };
 
-        $lexer.input_mut().bump_bytes($pos);
+        unsafe {
+            $lexer.input_mut().bump_bytes($pos);
+        }
         $byte
     }};
 }
diff --git a/crates/swc_ecma_parser/src/lexer/state.rs b/crates/swc_ecma_parser/src/lexer/state.rs
index dd02ff45f464..77f47a354944 100644
--- a/crates/swc_ecma_parser/src/lexer/state.rs
+++ b/crates/swc_ecma_parser/src/lexer/state.rs
@@ -518,7 +518,9 @@ impl Lexer<'_> {
             if ch <= 0x7f {
                 if ch.is_ident_part() {
                     v.push(ch as char);
-                    self.input_mut().bump_bytes(1);
+                    unsafe {
+                        self.input_mut().bump_bytes(1);
+                    }
                 } else if ch == b'\\' {
                     self.bump(1); // bump '\'
                     if !self.is(b'u') {
diff --git a/crates/swc_ecma_parser/src/lexer/table.rs b/crates/swc_ecma_parser/src/lexer/table.rs
index 76e8e95f2dfb..8a9256f48bec 100644
--- a/crates/swc_ecma_parser/src/lexer/table.rs
+++ b/crates/swc_ecma_parser/src/lexer/table.rs
@@ -337,7 +337,9 @@ const PIP: ByteHandler = |lexer| lexer.read_token_logical::<b'|'>();
 macro_rules! single_char {
     ($name:ident, $c:literal, $token:ident) => {
         const $name: ByteHandler = |lexer| {
-            lexer.input.bump_one();
+            unsafe {
+                lexer.input.bump_bytes(1);
+            }
             Ok(Token::$token)
         };
     };
@@ -364,9 +366,13 @@ single_char!(BEC, b'}', RBrace);
 /// `^`
 const CRT: ByteHandler = |lexer| {
     // Bitwise xor
-    lexer.input.bump_one();
+    unsafe {
+        lexer.input.bump_bytes(1);
+    }
     Ok(if lexer.input.cur_as_ascii() == Some(b'=') {
-        lexer.input.bump_one();
+        unsafe {
+            lexer.input.bump_bytes(1);
+        }
         Token::BitXorEq
     } else {
         Token::Caret
diff --git a/crates/swc_ecma_parser/src/lexer/whitespace.rs b/crates/swc_ecma_parser/src/lexer/whitespace.rs
index 46cd7319ec73..ab13ac6d1c53 100644
--- a/crates/swc_ecma_parser/src/lexer/whitespace.rs
+++ b/crates/swc_ecma_parser/src/lexer/whitespace.rs
@@ -1,3 +1,5 @@
+use swc_common::input::Input;
+
 use crate::{byte_search, lexer::search::SafeByteMatchTable, safe_byte_match_table, Lexer};
 
 /// U+000B VERTICAL TAB, abbreviated `<VT>`.
diff --git a/crates/swc_html_parser/src/lexer/mod.rs b/crates/swc_html_parser/src/lexer/mod.rs
index 6db611dc0250..323c047ac32b 100644
--- a/crates/swc_html_parser/src/lexer/mod.rs
+++ b/crates/swc_html_parser/src/lexer/mod.rs
@@ -255,7 +255,9 @@ where
         self.cur_pos = self.input.cur_pos();
 
         if self.cur.is_some() {
-            self.input.bump_bytes(len);
+            unsafe {
+                self.input.bump_bytes(len);
+            }
         }
     }
 
@@ -433,8 +435,10 @@ where
             sub_buf.push(c as char);
 
             if self.input.cur() == Some(b'\n') {
-                // Safety: cur() is Some(b'\n'), which is 1 byte
-                self.input.bump_bytes(1);
+                unsafe {
+                    // Safety: cur() is Some(b'\n'), which is 1 byte
+                    self.input.bump_bytes(1);
+                }
 
                 sub_buf.push('\n');
             }
@@ -489,8 +493,10 @@ where
             sub_buf.push(c as char);
 
             if self.input.cur() == Some(b'\n') {
-                // Safety: cur() is Some(b'\n'), which is 1 byte
-                self.input.bump_bytes(1);
+                unsafe {
+                    // Safety: cur() is Some(b'\n'), which is 1 byte
+                    self.input.bump_bytes(1);
+                }
 
                 sub_buf.push('\n');
             }
@@ -527,8 +533,10 @@ where
             sub_buf.push(c as char);
 
             if self.input.cur() == Some(b'\n') {
-                // Safety: cur() is Some(b'\n'), which is 1 byte
-                self.input.bump_bytes(1);
+                unsafe {
+                    // Safety: cur() is Some(b'\n'), which is 1 byte
+                    self.input.bump_bytes(1);
+                }
 
                 sub_buf.push('\n');
             }
@@ -861,8 +869,10 @@ where
             sub_buf.push('\r');
 
             if self.input.cur() == Some(b'\n') {
-                // Safety: cur() is Some(b'\n'), which is 1 byte
-                self.input.bump_bytes(1);
+                unsafe {
+                    // Safety: cur() is Some(b'\n'), which is 1 byte
+                    self.input.bump_bytes(1);
+                }
 
                 sub_buf.push('\n');
             }
@@ -893,8 +903,10 @@ where
             sub_buf.push(c as char);
 
             if self.input.cur() == Some(b'\n') {
-                // Safety: cur() is Some(b'\n'), which is 1 byte
-                self.input.bump_bytes(1);
+                unsafe {
+                    // Safety: cur() is Some(b'\n'), which is 1 byte
+                    self.input.bump_bytes(1);
+                }
 
                 sub_buf.push('\n');
             }
@@ -1028,8 +1040,10 @@ where
             sub_buf.push(c as char);
 
             if self.input.cur() == Some(b'\n') {
-                // Safety: cur() is Some(b'\n'), which is 1 byte
-                self.input.bump_bytes(1);
+                unsafe {
+                    // Safety: cur() is Some(b'\n'), which is 1 byte
+                    self.input.bump_bytes(1);
+                }
 
                 sub_buf.push('\n');
             }
@@ -1112,8 +1126,10 @@ where
             buf.push(c as char);
 
             if self.input.cur() == Some(b'\n') {
-                // Safety: cur() is Some(b'\n'), which is 1 byte
-                self.input.bump_bytes(1);
+                unsafe {
+                    // Safety: cur() is Some(b'\n'), which is 1 byte
+                    self.input.bump_bytes(1);
+                }
                 buf.push('\n');
             }
 
@@ -4943,8 +4959,10 @@ where
     #[inline(always)]
     fn skip_whitespaces(&mut self, c: u8) {
         if c == b'\r' && self.input.cur() == Some(b'\n') {
-            // Safety: cur() is Some(b'\n'), which is 1 byte
-            self.input.bump_bytes(1);
+            unsafe {
+                // Safety: cur() is Some(b'\n'), which is 1 byte
+                self.input.bump_bytes(1);
+            }
         }
     }
 }
diff --git a/crates/swc_xml_parser/src/lexer/mod.rs b/crates/swc_xml_parser/src/lexer/mod.rs
index ebac6ad14726..109a19785a66 100644
--- a/crates/swc_xml_parser/src/lexer/mod.rs
+++ b/crates/swc_xml_parser/src/lexer/mod.rs
@@ -175,8 +175,8 @@ where
         // ignored and will itself be skipped.
         if lexer.input.is_at_start() && lexer.input.cur_as_char() == Some('\u{feff}') {
             unsafe {
-                // Safety: cur_as_char() is Some('\u{feff}')
-                lexer.input.bump();
+                // Safety: cur_as_char() is Some('\u{feff}'), which is 3 bytes (EF BB BF)
+                lexer.input.bump_bytes(3);
             }
         }
 
@@ -252,10 +252,10 @@ where
         self.cur = self.input.cur_as_char();
         self.cur_pos = self.input.cur_pos();
 
-        if self.cur.is_some() {
+        if let Some(c) = self.cur {
             unsafe {
                 // Safety: cur_as_char() is Some(c)
-                self.input.bump();
+                self.input.bump_bytes(c.len_utf8());
             }
         }
     }
@@ -576,7 +576,7 @@ where
                 if self.input.cur() == Some(b'\n') {
                     unsafe {
                         // Safety: cur() is Some(b'\n')
-                        self.input.bump();
+                        self.input.bump_bytes(1);
                     }
 
                     raw.push('\n');
@@ -898,7 +898,7 @@ where
                 if self.input.cur() == Some(b'\n') {
                     unsafe {
                         // Safety: cur() is Some(b'\n')
-                        self.input.bump();
+                        self.input.bump_bytes(1);
                     }
 
                     raw_c.push('\n');
@@ -965,7 +965,7 @@ where
             if self.input.cur() == Some(b'\n') {
                 unsafe {
                     // Safety: cur() is Some(b'\n')
-                    self.input.bump();
+                    self.input.bump_bytes(1);
                 }
 
                 raw.push('\n');
@@ -3107,7 +3107,7 @@ where
         if c == '\r' && self.input.cur() == Some(b'\n') {
             unsafe {
                 // Safety: cur() is Some(b'\n')
-                self.input.bump();
+                self.input.bump_bytes(1);
             }
         }
     }

From 57ed77a9f1537ffb6820ccd1917b01c9be09c060 Mon Sep 17 00:00:00 2001
From: DongYun Kang <kdy.1997.dev@gmail.com>
Date: Thu, 27 Nov 2025 07:23:42 +0900
Subject: [PATCH 17/20] refactor(lexer): Make bump_bytes unsafe and standardize
 byte operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wraps all bump_bytes() calls in unsafe blocks with safety comments and adds a safe bump() helper method for single-byte advances.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 crates/swc_ecma_lexer/src/common/lexer/mod.rs | 59 ++++++++++---------
 .../swc_ecma_lexer/src/common/lexer/search.rs | 12 +++-
 crates/swc_ecma_lexer/src/input.rs            |  2 +-
 crates/swc_ecma_lexer/src/lexer/jsx.rs        |  8 +--
 crates/swc_ecma_lexer/src/lexer/mod.rs        | 10 ++--
 crates/swc_ecma_lexer/src/lexer/state.rs      |  4 +-
 crates/swc_ecma_lexer/src/lexer/table.rs      | 20 +++++--
 7 files changed, 66 insertions(+), 49 deletions(-)

diff --git a/crates/swc_ecma_lexer/src/common/lexer/mod.rs b/crates/swc_ecma_lexer/src/common/lexer/mod.rs
index f8a39102059f..9861e1f3662b 100644
--- a/crates/swc_ecma_lexer/src/common/lexer/mod.rs
+++ b/crates/swc_ecma_lexer/src/common/lexer/mod.rs
@@ -186,6 +186,13 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
         self.input().cur_as_char()
     }
 
+    #[inline(always)]
+    fn bump(&mut self) {
+        unsafe {
+            self.input_mut().bump_bytes(1);
+        }
+    }
+
     #[inline(always)]
     fn cur_pos(&self) -> BytePos {
         self.input().cur_pos()
@@ -252,7 +259,9 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
     fn skip_line_comment(&mut self, start_skip: usize) {
         // Position after the initial `//` (or similar)
         let start = self.cur_pos();
-        self.input_mut().bump_bytes(start_skip);
+        unsafe {
+            self.input_mut().bump_bytes(start_skip);
+        }
         let slice_start = self.cur_pos();
 
         // foo // comment for foo
@@ -365,7 +374,9 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
         debug_assert_eq!(self.peek(), Some(b'*'));
 
         // Consume initial "/*"
-        self.input_mut().bump_bytes(2);
+        unsafe {
+            self.input_mut().bump_bytes(2);
+        }
 
         // jsdoc
         let slice_start = self.cur_pos();
@@ -416,7 +427,9 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
                 b'*' => {
                     if self.peek() == Some(b'/') {
                         // Consume "*/"
-                        self.input_mut().bump_bytes(2);
+                        unsafe {
+                            self.input_mut().bump_bytes(2);
+                        }
 
                         if should_mark_had_line_break {
                             self.state_mut().mark_had_line_break();
@@ -506,7 +519,9 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
                 (skip.offset, skip.newline)
             };
 
-            self.input_mut().bump_bytes(offset as usize);
+            unsafe {
+                self.input_mut().bump_bytes(offset as usize);
+            }
             if newline {
                 self.state_mut().mark_had_line_break();
             }
@@ -603,10 +618,8 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
                     }
 
                     // Ignore this _ character
-                    unsafe {
                         // Safety: cur() returns Some(c) where c is a valid char
-                        self.input_mut().bump();
-                    }
+                        self.bump();
 
                     continue;
                 }
@@ -1053,10 +1066,8 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
     fn read_jsx_str(&mut self, quote: char) -> LexResult<Self::Token> {
         debug_assert!(self.syntax().jsx());
         let start = self.input().cur_pos();
-        unsafe {
             // Safety: cur() was Some(quote)
-            self.input_mut().bump(); // `quote`
-        }
+            self.bump(); // `quote`
         let mut out = String::new();
         let mut chunk_start = self.input().cur_pos();
         loop {
@@ -1120,10 +1131,8 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
 
                 chunk_start = cur_pos + BytePos(ch.len_utf8() as _);
             } else {
-                unsafe {
                     // Safety: cur() was Some(ch)
-                    self.input_mut().bump();
-                }
+                    self.bump();
             }
         }
         let cur_pos = self.input().cur_pos();
@@ -1191,7 +1200,9 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
         let before_second = self.input().cur_pos();
 
         // Bump `\u`
-        self.input_mut().bump_bytes(2);
+        unsafe {
+            self.input_mut().bump_bytes(2);
+        }
 
         let Some(low) = self.read_int_u32::<16>(4)? else {
             return Ok(None);
@@ -1557,10 +1568,8 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
             _ => c,
         };
 
-        unsafe {
             // Safety: cur() is Some(c) if this method is called.
-            self.input_mut().bump();
-        }
+            self.bump();
 
         Ok(CodePoint::from_u32(c as u32))
     }
@@ -1921,10 +1930,8 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
         let had_line_break_before_last = self.had_line_break_before_last();
         let start = self.cur_pos();
 
-        unsafe {
             // Safety: cur() is Some(c as char)
-            self.input_mut().bump();
-        }
+            self.bump();
         let token = if is_bit_and {
             Self::Token::BIT_AND
         } else {
@@ -1943,16 +1950,12 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
 
         // '||', '&&'
         if self.input().cur() == Some(C) {
-            unsafe {
                 // Safety: cur() is Some(c)
-                self.input_mut().bump();
-            }
+                self.bump();
 
             if self.input().cur() == Some(b'=') {
-                unsafe {
                     // Safety: cur() is Some('=')
-                    self.input_mut().bump();
-                }
+                    self.bump();
 
                 return Ok(if is_bit_and {
                     Self::Token::LOGICAL_AND_EQ
@@ -2101,10 +2104,8 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
                         self.wtf8_atom(Wtf8::from_str(s))
                     };
 
-                    unsafe {
                         // Safety: cur is quote
-                        self.input_mut().bump();
-                    }
+                        self.bump();
 
                     let end = self.cur_pos();
                     let raw = unsafe {
diff --git a/crates/swc_ecma_lexer/src/common/lexer/search.rs b/crates/swc_ecma_lexer/src/common/lexer/search.rs
index 4fa467f61624..356ea44cc306 100644
--- a/crates/swc_ecma_lexer/src/common/lexer/search.rs
+++ b/crates/swc_ecma_lexer/src/common/lexer/search.rs
@@ -121,17 +121,23 @@ macro_rules! byte_search {
                     let $pos = i; // Index within current slice
                     if $should_continue {
                         // Continue searching from next position
-                        $lexer.input_mut().bump_bytes(i + 1);
+                        unsafe {
+                            $lexer.input_mut().bump_bytes(i + 1);
+                        }
                         continue;
                     } else {
-                        $lexer.input_mut().bump_bytes(i);
+                        unsafe {
+                            $lexer.input_mut().bump_bytes(i);
+                        }
                         break $byte;
                     }
                 }
                 None => {
                     // Consume remainder then run handler.
                     let len = $lexer.input().as_str().len();
-                    $lexer.input_mut().bump_bytes(len);
+                    unsafe {
+                        $lexer.input_mut().bump_bytes(len);
+                    }
                     $eof_handler
                 }
             }
diff --git a/crates/swc_ecma_lexer/src/input.rs b/crates/swc_ecma_lexer/src/input.rs
index a7878a3b7992..154dc9622bbe 100644
--- a/crates/swc_ecma_lexer/src/input.rs
+++ b/crates/swc_ecma_lexer/src/input.rs
@@ -426,7 +426,7 @@ impl<'a, I: Tokens<TokenAndSpan>> crate::common::parser::buffer::Buffer<'a> for
     }
 
     fn bump(&mut self) {
-        self.bump();
+        let _ = Buffer::bump(self);
     }
 
     fn expect_word_token_and_bump(&mut self) -> swc_atoms::Atom {
diff --git a/crates/swc_ecma_lexer/src/lexer/jsx.rs b/crates/swc_ecma_lexer/src/lexer/jsx.rs
index 82825c00459f..ad5e2c009932 100644
--- a/crates/swc_ecma_lexer/src/lexer/jsx.rs
+++ b/crates/swc_ecma_lexer/src/lexer/jsx.rs
@@ -35,7 +35,7 @@ impl Lexer<'_> {
                         if cur == '<' && self.state.is_expr_allowed {
                             unsafe {
                                 // Safety: cur() was Some('<')
-                                self.input.bump();
+                                self.input.bump_bytes(1);
                             }
                             return Ok(Token::JSXTagStart);
                         }
@@ -73,7 +73,7 @@ impl Lexer<'_> {
                     );
                     unsafe {
                         // Safety: cur() was Some('>')
-                        self.input.bump()
+                        self.input.bump_bytes(1)
                     }
                 }
                 '}' => {
@@ -85,7 +85,7 @@ impl Lexer<'_> {
                     );
                     unsafe {
                         // Safety: cur() was Some('}')
-                        self.input.bump()
+                        self.input.bump_bytes(1)
                     }
                 }
                 '&' => {
@@ -114,7 +114,7 @@ impl Lexer<'_> {
                     } else {
                         unsafe {
                             // Safety: cur() was Some(c)
-                            self.input.bump()
+                            self.input.bump_bytes(1)
                         }
                     }
                 }
diff --git a/crates/swc_ecma_lexer/src/lexer/mod.rs b/crates/swc_ecma_lexer/src/lexer/mod.rs
index 49e5be16bfc5..dfa91468b521 100644
--- a/crates/swc_ecma_lexer/src/lexer/mod.rs
+++ b/crates/swc_ecma_lexer/src/lexer/mod.rs
@@ -158,7 +158,9 @@ impl<'a> Lexer<'a> {
             Some(handler) => handler(self),
             None => {
                 let start = self.cur_pos();
-                self.input.bump_bytes(1);
+                unsafe {
+                    self.input.bump_bytes(1);
+                }
                 self.error_span(
                     pos_span(start),
                     SyntaxError::UnexpectedChar { c: byte as _ },
@@ -172,14 +174,14 @@ impl<'a> Lexer<'a> {
 
         unsafe {
             // Safety: cur() is Some(c), if this method is called.
-            self.input.bump();
+            self.input.bump_bytes(1);
         }
 
         // '++', '--'
         Ok(if self.input.cur() == Some(C) {
             unsafe {
                 // Safety: cur() is Some(c)
-                self.input.bump();
+                self.input.bump_bytes(1);
             }
 
             // Handle -->
@@ -216,7 +218,7 @@ impl<'a> Lexer<'a> {
 
         unsafe {
             // Safety: cur() is Some(c) if this method is called.
-            self.input.bump();
+            self.input.bump_bytes(1);
         }
 
         Ok(if self.input.eat_byte(b'=') {
diff --git a/crates/swc_ecma_lexer/src/lexer/state.rs b/crates/swc_ecma_lexer/src/lexer/state.rs
index 7731423f510b..f478e55f5cc8 100644
--- a/crates/swc_ecma_lexer/src/lexer/state.rs
+++ b/crates/swc_ecma_lexer/src/lexer/state.rs
@@ -812,7 +812,7 @@ impl Lexer<'_> {
                     if c == '>' {
                         unsafe {
                             // Safety: cur() is Some('>')
-                            self.input.bump();
+                            self.input.bump_bytes(1);
                         }
                         return Ok(Token::JSXTagEnd);
                     }
@@ -830,7 +830,7 @@ impl Lexer<'_> {
 
                     unsafe {
                         // Safety: cur() is Some('<')
-                        self.input.bump();
+                        self.input.bump_bytes(1);
                     }
 
                     if had_line_break_before_last && self.is_str("<<<<<< ") {
diff --git a/crates/swc_ecma_lexer/src/lexer/table.rs b/crates/swc_ecma_lexer/src/lexer/table.rs
index bd8b16aa6c26..812f126ebdae 100644
--- a/crates/swc_ecma_lexer/src/lexer/table.rs
+++ b/crates/swc_ecma_lexer/src/lexer/table.rs
@@ -41,7 +41,9 @@ pub(super) static BYTE_HANDLERS: [ByteHandler; 256] = [
 const ___: ByteHandler = None;
 
 const EOF: ByteHandler = Some(|lexer| {
-    lexer.input.bump_bytes(1);
+    unsafe {
+        lexer.input.bump_bytes(1);
+    }
 
     Ok(Token::Eof)
 });
@@ -56,7 +58,7 @@ const ERR: ByteHandler = Some(|lexer| {
     let start = lexer.cur_pos();
     unsafe {
         // Safety: Byte handler is only called for non-last characters
-        lexer.input.bump();
+        lexer.input.bump_bytes(c.len_utf8());
     }
     lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })?
 });
@@ -379,7 +381,7 @@ const UNI: ByteHandler = Some(|lexer| {
     let start = lexer.cur_pos();
     unsafe {
         // Safety: Byte handler is only called for non-last characters
-        lexer.input.bump();
+        lexer.input.bump_bytes(c.len_utf8());
     }
     lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })?
 });
@@ -405,7 +407,9 @@ const PIP: ByteHandler = Some(|lexer| lexer.read_token_logical::<b'|'>());
 macro_rules! single_char {
     ($name:ident, $c:literal, $token:ident) => {
         const $name: ByteHandler = Some(|lexer| {
-            lexer.input.bump_bytes(1);
+            unsafe {
+                lexer.input.bump_bytes(1);
+            }
             Ok(Token::$token)
         });
     };
@@ -429,9 +433,13 @@ single_char!(BEC, b'}', RBrace);
 /// `^`
 const CRT: ByteHandler = Some(|lexer| {
     // Bitwise xor
-    lexer.input.bump_bytes(1);
-    Ok(if lexer.input.cur_as_ascii() == Some(b'=') {
+    unsafe {
         lexer.input.bump_bytes(1);
+    }
+    Ok(if lexer.input.cur_as_ascii() == Some(b'=') {
+        unsafe {
+            lexer.input.bump_bytes(1);
+        }
         Token::AssignOp(AssignOp::BitXorAssign)
     } else {
         Token::BinOp(BinOpToken::BitXor)

From 91130b49078f3be6ec27c7ba52774c27cfd36fda Mon Sep 17 00:00:00 2001
From: DongYun Kang <kdy.1997.dev@gmail.com>
Date: Thu, 27 Nov 2025 08:12:44 +0900
Subject: [PATCH 18/20] fix(lexer): Fix bump() to handle multibyte UTF-8
 characters correctly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The bump() helper was only advancing by 1 byte, which broke multibyte Unicode characters. Now it properly gets the current character and bumps by its full UTF-8 byte length.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 crates/swc_ecma_lexer/src/common/lexer/mod.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/crates/swc_ecma_lexer/src/common/lexer/mod.rs b/crates/swc_ecma_lexer/src/common/lexer/mod.rs
index 9861e1f3662b..3a1b0c5da554 100644
--- a/crates/swc_ecma_lexer/src/common/lexer/mod.rs
+++ b/crates/swc_ecma_lexer/src/common/lexer/mod.rs
@@ -188,8 +188,9 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
 
     #[inline(always)]
     fn bump(&mut self) {
+        let c = self.cur_as_char().unwrap();
         unsafe {
-            self.input_mut().bump_bytes(1);
+            self.input_mut().bump_bytes(c.len_utf8());
         }
     }
 

From 73519b8ec28fa9b0d2302d975c36d44c0b184c1f Mon Sep 17 00:00:00 2001
From: DongYun Kang <kdy.1997.dev@gmail.com>
Date: Thu, 27 Nov 2025 08:33:15 +0900
Subject: [PATCH 19/20] fmt

---
 crates/swc_ecma_lexer/src/common/lexer/mod.rs | 32 +++++++++----------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/crates/swc_ecma_lexer/src/common/lexer/mod.rs b/crates/swc_ecma_lexer/src/common/lexer/mod.rs
index 3a1b0c5da554..af63c0cbec37 100644
--- a/crates/swc_ecma_lexer/src/common/lexer/mod.rs
+++ b/crates/swc_ecma_lexer/src/common/lexer/mod.rs
@@ -619,8 +619,8 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
                     }
 
                     // Ignore this _ character
-                        // Safety: cur() returns Some(c) where c is a valid char
-                        self.bump();
+                    // Safety: cur() returns Some(c) where c is a valid char
+                    self.bump();
 
                     continue;
                 }
@@ -1067,8 +1067,8 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
     fn read_jsx_str(&mut self, quote: char) -> LexResult<Self::Token> {
         debug_assert!(self.syntax().jsx());
         let start = self.input().cur_pos();
-            // Safety: cur() was Some(quote)
-            self.bump(); // `quote`
+        // Safety: cur() was Some(quote)
+        self.bump(); // `quote`
         let mut out = String::new();
         let mut chunk_start = self.input().cur_pos();
         loop {
@@ -1132,8 +1132,8 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
 
                 chunk_start = cur_pos + BytePos(ch.len_utf8() as _);
             } else {
-                    // Safety: cur() was Some(ch)
-                    self.bump();
+                // Safety: cur() was Some(ch)
+                self.bump();
             }
         }
         let cur_pos = self.input().cur_pos();
@@ -1569,8 +1569,8 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
             _ => c,
         };
 
-            // Safety: cur() is Some(c) if this method is called.
-            self.bump();
+        // Safety: cur() is Some(c) if this method is called.
+        self.bump();
 
         Ok(CodePoint::from_u32(c as u32))
     }
@@ -1931,8 +1931,8 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
         let had_line_break_before_last = self.had_line_break_before_last();
         let start = self.cur_pos();
 
-            // Safety: cur() is Some(c as char)
-            self.bump();
+        // Safety: cur() is Some(c as char)
+        self.bump();
         let token = if is_bit_and {
             Self::Token::BIT_AND
         } else {
@@ -1951,12 +1951,12 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
 
         // '||', '&&'
         if self.input().cur() == Some(C) {
-                // Safety: cur() is Some(c)
-                self.bump();
+            // Safety: cur() is Some(c)
+            self.bump();
 
             if self.input().cur() == Some(b'=') {
-                    // Safety: cur() is Some('=')
-                    self.bump();
+                // Safety: cur() is Some('=')
+                self.bump();
 
                 return Ok(if is_bit_and {
                     Self::Token::LOGICAL_AND_EQ
@@ -2105,8 +2105,8 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens<TokenAndSpan> + Sized {
                         self.wtf8_atom(Wtf8::from_str(s))
                     };
 
-                        // Safety: cur is quote
-                        self.bump();
+                    // Safety: cur is quote
+                    self.bump();
 
                     let end = self.cur_pos();
                     let raw = unsafe {

From 3b40a249c246e28967e8858ecb8d8219349ebf25 Mon Sep 17 00:00:00 2001
From: DongYun Kang <kdy.1997.dev@gmail.com>
Date: Thu, 27 Nov 2025 08:33:35 +0900
Subject: [PATCH 20/20] drop

---
 Cargo.lock                   |  1 -
 crates/swc_common/Cargo.toml | 41 ++++++++++++++++++------------------
 2 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 8705c92cdf85..a43b0927cc69 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5476,7 +5476,6 @@ dependencies = [
  "cbor4ii",
  "either",
  "from_variant",
- "new_debug_unreachable",
  "num-bigint",
  "once_cell",
  "par-iter",
diff --git a/crates/swc_common/Cargo.toml b/crates/swc_common/Cargo.toml
index da0b75aaa3ea..4ce639a24690 100644
--- a/crates/swc_common/Cargo.toml
+++ b/crates/swc_common/Cargo.toml
@@ -51,27 +51,26 @@ shrink-to-fit = ["dep:shrink-to-fit", "swc_atoms/shrink-to-fit"]
 
 
 [dependencies]
-anyhow                = { workspace = true }
-arbitrary             = { workspace = true, features = ["derive"], optional = true }
-bytecheck             = { workspace = true, optional = true }
-bytes-str             = { workspace = true, features = ["serde"] }
-cbor4ii               = { workspace = true, features = ["use_std"], optional = true }
-either                = { workspace = true }
-new_debug_unreachable = { workspace = true }
-num-bigint            = { workspace = true }
-once_cell             = { workspace = true }
-parking_lot           = { workspace = true, optional = true }
-rancor                = { workspace = true, optional = true }
-rkyv                  = { workspace = true, optional = true }
-rustc-hash            = { workspace = true }
-serde                 = { workspace = true, features = ["derive"] }
-shrink-to-fit         = { workspace = true, optional = true }
-siphasher             = { workspace = true }
-swc_sourcemap         = { workspace = true, optional = true }
-termcolor             = { workspace = true, optional = true }
-tracing               = { workspace = true }
-unicode-width         = { workspace = true }
-url                   = { workspace = true }
+anyhow        = { workspace = true }
+arbitrary     = { workspace = true, features = ["derive"], optional = true }
+bytecheck     = { workspace = true, optional = true }
+bytes-str     = { workspace = true, features = ["serde"] }
+cbor4ii       = { workspace = true, features = ["use_std"], optional = true }
+either        = { workspace = true }
+num-bigint    = { workspace = true }
+once_cell     = { workspace = true }
+parking_lot   = { workspace = true, optional = true }
+rancor        = { workspace = true, optional = true }
+rkyv          = { workspace = true, optional = true }
+rustc-hash    = { workspace = true }
+serde         = { workspace = true, features = ["derive"] }
+shrink-to-fit = { workspace = true, optional = true }
+siphasher     = { workspace = true }
+swc_sourcemap = { workspace = true, optional = true }
+termcolor     = { workspace = true, optional = true }
+tracing       = { workspace = true }
+unicode-width = { workspace = true }
+url           = { workspace = true }
 
 ast_node             = { version = "5.0.0", path = "../ast_node" }
 better_scoped_tls    = { version = "1.0.1", path = "../better_scoped_tls" }