Skip to content

Commit

Permalink
Fix Encoding Detection in helpers.rs (#77)
Browse files (browse the repository at this point in the history)
* Update Cargo.lock

* Update Cargo.toml

* Update helpers.rs

* Update helpers.rs
  • Loading branch information
thttg authored Jan 17, 2024
1 parent 156bd4c commit 52a70de
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 44 deletions.
14 changes: 13 additions & 1 deletion src-tauri/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion src-tauri/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ serde_json = "1.0"
tokio = { version = "1.35.1", features = ["full"] }
byteorder = "1.5.0"
chardet = "0.2.4"
encoding = "0.2.33"
chardetng = "0.1.17"
encoding_rs = "0.8.33"
dirs-next = "2.0.0"
discord-rich-presence = "0.2.3"
regex = "1.10.2"
Expand Down
83 changes: 41 additions & 42 deletions src-tauri/src/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,55 +2,54 @@ use std::fs;
use std::path::Path;

use chardet::{charset2encoding, detect};
use chardetng::EncodingDetector;
use charset_normalizer_rs::from_bytes;
use encoding::label::encoding_from_whatwg_label;
use encoding::DecoderTrap;
use encoding_rs::{Encoding, UTF_8};
use log::info;

pub fn decode_buffer(buf: Vec<u8>) -> (String, String, String) {
let buff_output: String;
let first_encoding: String;
let second_encoding: String;
let mut str_encoding: String;
/// Decodes a buffer of bytes into a string, detecting the encoding
pub fn decode_buffer(buf: Vec<u8>) -> (String, String) {
// Using chardetng for encoding detection
let mut detector = EncodingDetector::new();
detector.feed(&buf, true);
let chardetng_encoding = detector.guess(None, true).name();

// chardet
first_encoding = charset2encoding(&detect(&buf).0).to_string();
// Using chardet for encoding detection
let chardet_encoding = charset2encoding(&detect(&buf).0).to_string();

// charset_normalizer_rs
second_encoding = match from_bytes(&buf, None).get_best() {
Some(cd) => cd.encoding().to_string(),
None => "not_found".to_string(),
};

str_encoding = first_encoding.clone();

if first_encoding == "KOI8-R"
|| first_encoding == "MacCyrillic"
|| first_encoding == "x-mac-cyrillic"
{
str_encoding = "cp1251".to_string();
}

if second_encoding == "koi8-r" || second_encoding == "macintosh" || second_encoding == "ibm866"
{
str_encoding = "cp1251".to_string();
}

// if str_encoding.len() < 1 {
// str_encoding = "cp1251".to_string();
// }

let coder = encoding_from_whatwg_label(str_encoding.as_str());
if coder.is_some() {
buff_output = coder
.unwrap()
.decode(&buf, DecoderTrap::Ignore)
.expect("Error");
// Using charset_normalizer_rs for encoding detection
let charset_normalizer_encoding = from_bytes(&buf, None).get_best()
.map(|cd| cd.encoding().to_string())
.unwrap_or_else(|| "not_found".to_string());

// Determine the most likely actual encoding
let actual_encoding = if charset_normalizer_encoding == "macintosh" {
// Use windows-1251 if charset_normalizer detects macintosh
Encoding::for_label("windows-1251".as_bytes()).unwrap_or(UTF_8)
} else if chardetng_encoding == "GBK" || chardetng_encoding == "GB2312" ||
chardet_encoding == "GBK" || chardet_encoding == "GB2312" {
// Use GB18030 for Chinese characters if detected as GBK or GB2312
Encoding::for_label("GB18030".as_bytes()).unwrap_or(UTF_8)
} else if chardetng_encoding == "windows-1252" && chardet_encoding == "windows-1251" {
// Use windows-1251 if chardetng detects windows-1252 and chardet detects windows-1251
Encoding::for_label("windows-1251".as_bytes()).unwrap_or(UTF_8)
} else if chardet_encoding == "ISO-8859-1" && charset_normalizer_encoding == "ibm866" {
// Use windows-1252 if chardet detects ISO-8859-1 and charset normalizer detects ibm866
Encoding::for_label("windows-1252".as_bytes()).unwrap_or(UTF_8)
} else {
buff_output = String::from_utf8_lossy(buf.as_slice()).to_string();
}
// Default to the encoding detected by chardetng
Encoding::for_label(chardetng_encoding.as_bytes()).unwrap_or(UTF_8)
};

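// Decode the buffer using the determined encoding.
// Note: decoding errors are intentionally not treated as failures. `Encoding::decode`
// replaces malformed byte sequences with U+FFFD (the replacement character), so text
// with a few corrupted characters is still usable, albeit with minor imperfections.
// NOTE(review): `Encoding::decode` also performs BOM sniffing and returns the encoding
// it actually used as the second tuple element (ignored below); for input starting with
// a UTF-8/UTF-16 BOM that may differ from `actual_encoding` — confirm which name
// callers should receive.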
let (decoded, _, _had_errors) = actual_encoding.decode(&buf);
let buff_output = decoded.into_owned();

(buff_output, first_encoding, second_encoding)
// Return the decoded string and the encoding name
(buff_output, actual_encoding.name().to_string())
}

pub fn copy_files(src: impl AsRef<Path>, dest: impl AsRef<Path>) -> Result<(), String> {
Expand Down

0 comments on commit 52a70de

Please sign in to comment.