diff --git a/README.md b/README.md index 8e30b7e..e3390fb 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,13 @@ implementation 'com.github.gzu-liyujiang:CJKCharsetDetector:latest.version' } ``` +```groovy + Charset charset = CJKCharsetDetector.detect(new FileInputStream(file)); + String str = new String(bytes, charset.name()); + if (CJKCharsetDetector.inWrongEncoding(str)) { + System.err.println("File was loaded using wrong encoding: " + charset.name()); + } +``` ## License ```text diff --git a/library/ASCII.txt b/library/ASCII.txt new file mode 100644 index 0000000..c3aeced --- /dev/null +++ b/library/ASCII.txt @@ -0,0 +1,4 @@ +ASCII (American Standard Code for information exchange) is a computer coding system based on Latin alphabet, which is mainly used to display modern English and other Western European languages. + +Powered by gzu-liyujiang +2020/8/7 diff --git a/library/UTF-8-BOM.txt b/library/UTF-8-BOM.txt new file mode 100644 index 0000000..d626311 --- /dev/null +++ b/library/UTF-8-BOM.txt @@ -0,0 +1,8 @@ +KOI8-R представляет собой кодирование 8 - битного текста на славянском языке серии KOI-8 для использования на русском и болгарском языках.  + + до того, как Unicode не стал популярным, KOI8-R был наиболее широко используемым русским кодом, который даже выше стандарта ISO-8859-5.  + +我爱中国 我愛中國 中国を愛しています Я люблю китай I love China. 
+ +Powered by 貴州穿青人李裕江 +2020年8月7日 diff --git a/library/src/main/java/com/github/gzuliyujiang/chardet/CJKCharsetDetector.java b/library/src/main/java/com/github/gzuliyujiang/chardet/CJKCharsetDetector.java index 8791de5..761dec7 100644 --- a/library/src/main/java/com/github/gzuliyujiang/chardet/CJKCharsetDetector.java +++ b/library/src/main/java/com/github/gzuliyujiang/chardet/CJKCharsetDetector.java @@ -119,7 +119,7 @@ private void guessCharset(InputStream inputStream, int language) throws Exceptio if (isAscii) { alreadyFound = true; if (DEBUG) { - System.out.println("Is ASCII"); + System.out.println("ASCII first: true"); } probableCharset = "ASCII"; return; @@ -129,18 +129,23 @@ private void guessCharset(InputStream inputStream, int language) throws Exceptio if (DEBUG) { System.out.println("Probable charsets: " + Arrays.toString(probableCharsets)); } + // 先取第一个可能的字符集,然后再筛选其他可能的字符集 probableCharset = probableCharsets[0]; for (String itCharset : probableCharsets) { - if (!itCharset.startsWith("UTF") && !itCharset.startsWith("GB18030")) { - // 可能有多个字符集的情况,范围比较大的UTF系列及GB18030优先级靠后 - // [UTF-16LE, Big5, GB18030, UTF-16BE] - // [GB18030, Shift_JIS, UTF-16BE] + // “UTF-16LE、UTF-16BE、GB18030”这几种范围比较大,目前并不常用,优先级靠后 + // [UTF-16BE, Big5, GB18030] + // [UTF-16LE, Big5, GB18030, UTF-16BE] + // [GB18030, Shift_JIS, UTF-16BE] + if (!(itCharset.startsWith("UTF-16") || itCharset.startsWith("GB18030"))) { probableCharset = itCharset; break; } } } if ("nomatch".equals(probableCharset)) { + if (DEBUG) { + System.out.println("Charset no match"); + } throw new Exception("no match"); } alreadyFound = false; diff --git a/library/src/test/java/com/github/gzuliyujiang/chardet/JUnitTest.java b/library/src/test/java/com/github/gzuliyujiang/chardet/JUnitTest.java index e745988..2ce60c5 100644 --- a/library/src/test/java/com/github/gzuliyujiang/chardet/JUnitTest.java +++ b/library/src/test/java/com/github/gzuliyujiang/chardet/JUnitTest.java @@ -20,6 +20,7 @@ import org.junit.Assert; import 
org.junit.Test; +import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; @@ -38,19 +39,24 @@ public class JUnitTest { CJKCharsetDetector.DEBUG = true; } + @Test + public final void detectASCII() { + Assert.assertTrue(guessCharset("ASCII")); + } + @Test public final void detectUTF8() { Assert.assertTrue(guessCharset("UTF-8")); } @Test - public final void detectGBK() { - Assert.assertTrue(guessCharset("GBK")); + public final void detectUTF8WithBOM() { + Assert.assertTrue(guessCharset("UTF-8-BOM", "UTF-8")); } @Test - public final void detectBig5() { - Assert.assertTrue(guessCharset("Big5")); + public final void detectGBK() { + Assert.assertTrue(guessCharset("GBK")); } @Test @@ -63,6 +69,11 @@ public final void detectGB18030() { Assert.assertTrue(guessCharset("GB18030")); } + @Test + public final void detectBig5() { + Assert.assertTrue(guessCharset("Big5")); + } + @Test public final void detectShiftJIS() { Assert.assertTrue(guessCharset("Shift_JIS")); @@ -75,22 +86,28 @@ public final void detectEUCKR() { @Test public final void detectKOI8R() { + // NOTE: KOI8-R 编码 被识别成 Shift_JIS 编码,不知道是不是样本不靠谱? 
Assert.assertTrue(guessCharset("KOI8-R")); } private static boolean guessCharset(String charsetName) { + return guessCharset(charsetName, charsetName); + } + + private static boolean guessCharset(String fileName, String charsetName) { System.out.println("---------------------------------"); try { System.out.println("Origin charset: " + charsetName); - File file = new File(System.getProperty("user.dir"), charsetName + ".txt"); + File file = new File(System.getProperty("user.dir"), fileName + ".txt"); System.out.println("Target file: " + file); - Charset charset = CJKCharsetDetector.detect(new FileInputStream(file)); - assert charset != null; - System.out.println("Detect charset: " + charset); byte[] bytes = readBytes(new FileInputStream(file)); System.out.println("Bytes length: " + bytes.length); + //Charset charset = CJKCharsetDetector.detect(new FileInputStream(file)); + Charset charset = CJKCharsetDetector.detect(new ByteArrayInputStream(bytes)); + assert charset != null; + System.out.println("Detect charset: " + charset); String str = new String(bytes, charset.name()); - System.out.println("Display text: " + str); + System.out.println("Display text: \n**********\n" + str.trim() + "\n**********"); if (CJKCharsetDetector.inWrongEncoding(str)) { System.err.println("File was loaded in the wrong encoding: " + charset.name()); return false;