diff --git a/src/freenet/client/filter/CSSReadFilter.java b/src/freenet/client/filter/CSSReadFilter.java index 6abd9c449c..7fc73a034a 100644 --- a/src/freenet/client/filter/CSSReadFilter.java +++ b/src/freenet/client/filter/CSSReadFilter.java @@ -127,6 +127,11 @@ static byte[] parse(String s) { @Override public BOMDetection getCharsetByBOM(byte[] input, int length) throws DataFilterException, IOException { + return detectCharsetFromBOM(input, length); + } + + public static BOMDetection detectCharsetFromBOM(byte[] input, int length) + throws UnsupportedCharsetInFilterException { if(ContentFilter.startsWith(input, ascii, length)) return new BOMDetection("UTF-8", true); if(ContentFilter.startsWith(input, utf16be, length)) diff --git a/src/freenet/client/filter/ContentFilter.java b/src/freenet/client/filter/ContentFilter.java index 3ae493ca55..5281b86ac8 100644 --- a/src/freenet/client/filter/ContentFilter.java +++ b/src/freenet/client/filter/ContentFilter.java @@ -32,8 +32,9 @@ public class ContentFilter { /** The HTML mime types are defined here, to allow other modules to identify it*/ public static final String[] HTML_MIME_TYPES=new String[]{"text/html", "application/xhtml+xml", "text/xml+xhtml", "text/xhtml", "application/xhtml"}; + private static final int CHARSET_DETECTION_FALLBACK_BUFFERSIZE = 64; - private static volatile boolean logMINOR; + private static volatile boolean logMINOR; static { Logger.registerLogThresholdCallback(new LogThresholdCallback(){ @Override @@ -54,7 +55,7 @@ public static void init() { register(new FilterMIMEType("text/plain", "txt", new String[0], new String[] { "text", "pot" }, true, true, null, false, false, false, false, false, false, l10n("textPlainReadAdvice"), - true, "US-ASCII", null, false)); + true, "utf-8", null, false)); // GIF - has a filter register(new FilterMIMEType("image/gif", "gif", new String[0], new String[0], @@ -343,16 +344,8 @@ public static FilterStatus filter(InputStream input, OutputStream output, String if(handler.readFilter != null) { if(handler.takesACharset && ((charset == null) || (charset.isEmpty()))) { int bufferSize = handler.charsetExtractor.getCharsetBufferSize(); - input.mark(bufferSize); byte[] charsetBuffer = new byte[bufferSize]; - int bytesRead = 0, offset = 0, toread=0; - while(true) { - toread = bufferSize - offset; - bytesRead = input.read(charsetBuffer, offset, toread); - if(bytesRead == -1 || toread == 0) break; - offset += bytesRead; - } - input.reset(); + int offset = readIntoBuffer(input, bufferSize, charsetBuffer); charset = detectCharset(charsetBuffer, offset, handler, maybeCharset); } try { @@ -374,6 +367,16 @@ public static FilterStatus filter(InputStream input, OutputStream output, String } if(handler.safeToRead) { + if(handler.takesACharset && ((charset == null) || (charset.isEmpty()))) { + byte[] charsetBuffer = new byte[CHARSET_DETECTION_FALLBACK_BUFFERSIZE]; + int offset = readIntoBuffer(input, CHARSET_DETECTION_FALLBACK_BUFFERSIZE, charsetBuffer); + BOMDetection bom = CSSReadFilter.detectCharsetFromBOM(charsetBuffer, CHARSET_DETECTION_FALLBACK_BUFFERSIZE); + if (bom != null) { + charset = bom.charset; + } else if (handler.defaultCharset != null){ + charset = handler.defaultCharset; + } + } FileUtil.copy(input, output, -1); output.flush(); return new FilterStatus(charset, typeName); @@ -384,6 +387,20 @@ public static FilterStatus filter(InputStream input, OutputStream output, String return null; } + private static int readIntoBuffer(InputStream input, int bufferSize, byte[] charsetBuffer) + throws IOException { + input.mark(bufferSize); + int bytesRead = 0, offset = 0, toread=0; + while(true) { + toread = bufferSize - offset; + bytesRead = input.read(charsetBuffer, offset, toread); + if(bytesRead == -1 || toread == 0) break; + offset += bytesRead; + } + input.reset(); + return offset; + } + public static String detectCharset(byte[] input, int length, FilterMIMEType handler, String maybeCharset) throws IOException { // Detect charset String charset = detectBOM(input, length); diff --git a/test/freenet/client/filter/ContentFilterTest.java b/test/freenet/client/filter/ContentFilterTest.java index a2a861920d..ceac60605d 100644 --- a/test/freenet/client/filter/ContentFilterTest.java +++ b/test/freenet/client/filter/ContentFilterTest.java @@ -380,6 +380,37 @@ public void testEvilCharset() throws IOException { } } + + @Test + public void byteOrderMarkForUtf8IsDetectedCorrectly() throws IOException { + byte[] buf = { (byte) 0xef, (byte) 0xbb, (byte) 0xbf, 0x40 }; + ArrayBucket out = new ArrayBucket(); + FilterStatus fo = ContentFilter.filter(new ArrayBucket(buf).getInputStream(), out.getOutputStream(), "text/plain", null, null, null); + assertTrue("utf-8".equals(fo.charset)); + } + + @Test + public void byteOrderMarkForUtf16BeIsDetectedCorrectly() throws IOException { + byte[] buf = { (byte) 0xfe, (byte) 0xff, 0x00, 0x40 }; + ArrayBucket out = new ArrayBucket(); + FilterStatus fo = ContentFilter.filter(new ArrayBucket(buf).getInputStream(), out.getOutputStream(), "text/plain", null, null, null); + assertTrue("UTF-16BE".equals(fo.charset)); + } + + @Test + public void byteOrderMarkForUtf16LeIsDetectedCorrectly() throws IOException { + byte[] buf = { (byte) 0xff, (byte) 0xfe, 0x40, 0x00 }; + ArrayBucket out = new ArrayBucket(); + FilterStatus fo = ContentFilter.filter( + new ArrayBucket(buf).getInputStream(), + out.getOutputStream(), + "text/plain", + null, + null, + null); + assertTrue("UTF-16LE".equals(fo.charset)); + } + public static String htmlFilter(String data) throws Exception { if (data.startsWith("