diff --git a/src-tauri/Cargo.lock b/src-tauri/Cargo.lock
index cc9e8908..af74bf11 100644
--- a/src-tauri/Cargo.lock
+++ b/src-tauri/Cargo.lock
@@ -621,6 +621,17 @@ version = "0.2.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1a48563284b67c003ba0fb7243c87fab68885e1532c605704228a80238512e31"
 
+[[package]]
+name = "chardetng"
+version = "0.1.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea"
+dependencies = [
+ "cfg-if",
+ "encoding_rs",
+ "memchr",
+]
+
 [[package]]
 name = "charset-normalizer-rs"
 version = "1.0.6"
@@ -2659,11 +2670,12 @@ dependencies = [
  "actix-web",
  "byteorder",
  "chardet",
+ "chardetng",
  "charset-normalizer-rs",
  "dirs-next",
  "discord-rich-presence",
  "dll-syringe",
- "encoding",
+ "encoding_rs",
  "log",
  "md5",
  "regex",
diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml
index 374fc914..e8ebb4ff 100644
--- a/src-tauri/Cargo.toml
+++ b/src-tauri/Cargo.toml
@@ -19,7 +19,8 @@ serde_json = "1.0"
 tokio = { version = "1.35.1", features = ["full"] }
 byteorder = "1.5.0"
 chardet = "0.2.4"
-encoding = "0.2.33"
+chardetng = "0.1.17"
+encoding_rs = "0.8.33"
 dirs-next = "2.0.0"
 discord-rich-presence = "0.2.3"
 regex = "1.10.2"
diff --git a/src-tauri/src/helpers.rs b/src-tauri/src/helpers.rs
index 9e069135..7f6631cf 100644
--- a/src-tauri/src/helpers.rs
+++ b/src-tauri/src/helpers.rs
@@ -2,55 +2,54 @@ use std::fs;
 use std::path::Path;
 
 use chardet::{charset2encoding, detect};
+use chardetng::EncodingDetector;
 use charset_normalizer_rs::from_bytes;
-use encoding::label::encoding_from_whatwg_label;
-use encoding::DecoderTrap;
+use encoding_rs::{Encoding, UTF_8};
 use log::info;
 
-pub fn decode_buffer(buf: Vec<u8>) -> (String, String, String) {
-    let buff_output: String;
-    let first_encoding: String;
-    let second_encoding: String;
-    let mut str_encoding: String;
+/// Decodes a buffer of bytes into a string, detecting the encoding
+pub fn decode_buffer(buf: Vec<u8>) -> (String, String) {
+    // Using chardetng for encoding detection
+    let mut detector = EncodingDetector::new();
+    detector.feed(&buf, true);
+    let chardetng_encoding = detector.guess(None, true).name();
 
-    // chardet
-    first_encoding = charset2encoding(&detect(&buf).0).to_string();
+    // Using chardet for encoding detection
+    let chardet_encoding = charset2encoding(&detect(&buf).0).to_string();
 
-    // charset_normalizer_rs
-    second_encoding = match from_bytes(&buf, None).get_best() {
-        Some(cd) => cd.encoding().to_string(),
-        None => "not_found".to_string(),
-    };
-
-    str_encoding = first_encoding.clone();
-
-    if first_encoding == "KOI8-R"
-        || first_encoding == "MacCyrillic"
-        || first_encoding == "x-mac-cyrillic"
-    {
-        str_encoding = "cp1251".to_string();
-    }
-
-    if second_encoding == "koi8-r" || second_encoding == "macintosh" || second_encoding == "ibm866"
-    {
-        str_encoding = "cp1251".to_string();
-    }
-
-    // if str_encoding.len() < 1 {
-    //     str_encoding = "cp1251".to_string();
-    // }
-
-    let coder = encoding_from_whatwg_label(str_encoding.as_str());
-    if coder.is_some() {
-        buff_output = coder
-            .unwrap()
-            .decode(&buf, DecoderTrap::Ignore)
-            .expect("Error");
+    // Using charset_normalizer_rs for encoding detection
+    let charset_normalizer_encoding = from_bytes(&buf, None).get_best()
+        .map(|cd| cd.encoding().to_string())
+        .unwrap_or_else(|| "not_found".to_string());
+
+    // Determine the most likely actual encoding
+    let actual_encoding = if charset_normalizer_encoding == "macintosh" {
+        // Use windows-1251 if charset_normalizer detects macintosh
+        Encoding::for_label("windows-1251".as_bytes()).unwrap_or(UTF_8)
+    } else if chardetng_encoding == "GBK" || chardetng_encoding == "GB2312" ||
+        chardet_encoding == "GBK" || chardet_encoding == "GB2312" {
+        // Use GB18030 for Chinese characters if detected as GBK or GB2312
+        Encoding::for_label("GB18030".as_bytes()).unwrap_or(UTF_8)
+    } else if chardetng_encoding == "windows-1252" && chardet_encoding == "windows-1251" {
+        // Use windows-1251 if chardetng detects windows-1252 and chardet detects windows-1251
+        Encoding::for_label("windows-1251".as_bytes()).unwrap_or(UTF_8)
+    } else if chardet_encoding == "ISO-8859-1" && charset_normalizer_encoding == "ibm866" {
+        // Use windows-1252 if chardet detects ISO-8859-1 and charset normalizer detects ibm866
+        Encoding::for_label("windows-1252".as_bytes()).unwrap_or(UTF_8)
     } else {
-        buff_output = String::from_utf8_lossy(buf.as_slice()).to_string();
-    }
+        // Default to the encoding detected by chardetng
+        Encoding::for_label(chardetng_encoding.as_bytes()).unwrap_or(UTF_8)
+    };
+
+    // Decode the buffer using the determined encoding
+    // Note: Error handling for decoding errors is intentionally omitted.
+    // In cases where there are minor errors in the text (like a few corrupted characters),
+    // this approach ensures that the text is still usable, albeit with some minor imperfections.
+    let (decoded, _, _had_errors) = actual_encoding.decode(&buf);
+    let buff_output = decoded.into_owned();
 
-    (buff_output, first_encoding, second_encoding)
+    // Return the decoded string and the encoding name
+    (buff_output, actual_encoding.name().to_string())
 }
 
 pub fn copy_files(src: impl AsRef<Path>, dest: impl AsRef<Path>) -> Result<(), String> {