diff --git a/src/freenet/client/filter/CSSReadFilter.java b/src/freenet/client/filter/CSSReadFilter.java
index 6abd9c449c..7fc73a034a 100644
--- a/src/freenet/client/filter/CSSReadFilter.java
+++ b/src/freenet/client/filter/CSSReadFilter.java
@@ -127,6 +127,20 @@ static byte[] parse(String s) {
 
 	@Override
 	public BOMDetection getCharsetByBOM(byte[] input, int length) throws DataFilterException, IOException {
+		return detectCharsetFromBOM(input, length);
+	}
+
+	/**
+	 * Static form of the signature checks behind {@link #getCharsetByBOM}, so
+	 * callers without a filter instance can run them.
+	 *
+	 * @param input buffer holding the leading bytes of the data
+	 * @param length length passed to each {@code ContentFilter.startsWith} check;
+	 *        should be the number of valid bytes in {@code input}
+	 * @return the detection result; callers in ContentFilter null-check it
+	 */
+	public static BOMDetection detectCharsetFromBOM(byte[] input, int length)
+			throws UnsupportedCharsetInFilterException {
 		if(ContentFilter.startsWith(input, ascii, length))
 			return new BOMDetection("UTF-8", true);
 		if(ContentFilter.startsWith(input, utf16be, length))
diff --git a/src/freenet/client/filter/ContentFilter.java b/src/freenet/client/filter/ContentFilter.java
index df05e7e670..5281b86ac8 100644
--- a/src/freenet/client/filter/ContentFilter.java
+++ b/src/freenet/client/filter/ContentFilter.java
@@ -32,8 +32,10 @@ public class ContentFilter {
 
 	/** The HTML mime types are defined here, to allow other modules to identify it*/
 	public static final String[] HTML_MIME_TYPES=new String[]{"text/html", "application/xhtml+xml", "text/xml+xhtml", "text/xhtml", "application/xhtml"};
+	/** Number of leading bytes inspected for a charset signature on the safeToRead path. */
+	private static final int CHARSET_DETECTION_FALLBACK_BUFFERSIZE = 64;
 
-        private static volatile boolean logMINOR;
+	private static volatile boolean logMINOR;
 	static {
 		Logger.registerLogThresholdCallback(new LogThresholdCallback(){
 			@Override
@@ -343,16 +345,8 @@ public static FilterStatus filter(InputStream input, OutputStream output, String
 			if(handler.readFilter != null) {
 				if(handler.takesACharset && ((charset == null) || (charset.isEmpty()))) {
 					int bufferSize = handler.charsetExtractor.getCharsetBufferSize();
-					input.mark(bufferSize);
 					byte[] charsetBuffer = new byte[bufferSize];
-					int bytesRead = 0, offset = 0, toread=0;
-					while(true) {
-						toread = bufferSize - offset;
-						bytesRead = input.read(charsetBuffer, offset, toread);
-						if(bytesRead == -1 || toread == 0) break;
-						offset += bytesRead;
-					}
-					input.reset();
+					int offset = readIntoBuffer(input, bufferSize, charsetBuffer);
 					charset = detectCharset(charsetBuffer, offset, handler, maybeCharset);
 				}
 				try {
@@ -374,6 +368,18 @@ public static FilterStatus filter(InputStream input, OutputStream output, String
 			}
 
 			if(handler.safeToRead) {
+				// Charset still unknown: check for a byte signature, else use the handler default.
+				if(handler.takesACharset && ((charset == null) || (charset.isEmpty()))) {
+					byte[] charsetBuffer = new byte[CHARSET_DETECTION_FALLBACK_BUFFERSIZE];
+					int offset = readIntoBuffer(input, CHARSET_DETECTION_FALLBACK_BUFFERSIZE, charsetBuffer);
+					// Pass the byte count actually read, not the buffer capacity.
+					BOMDetection bom = CSSReadFilter.detectCharsetFromBOM(charsetBuffer, offset);
+					if (bom != null) {
+						charset = bom.charset;
+					} else if (handler.defaultCharset != null) {
+						charset = handler.defaultCharset;
+					}
+				}
 				FileUtil.copy(input, output, -1);
 				output.flush();
 				return new FilterStatus(charset, typeName);
@@ -384,6 +390,31 @@ public static FilterStatus filter(InputStream input, OutputStream output, String
 		return null;
 	}
 
+	/**
+	 * Copies up to {@code bufferSize} leading bytes of {@code input} into
+	 * {@code charsetBuffer}, then calls {@code reset()} so the stream is back at
+	 * the marked position for the caller.
+	 *
+	 * @param input stream to peek into; mark/reset are used on it
+	 * @param bufferSize maximum number of bytes to read, also the mark read limit
+	 * @param charsetBuffer destination array, filled from index 0
+	 * @return number of bytes actually read; smaller than {@code bufferSize} when
+	 *         the stream ends first
+	 * @throws IOException if reading from or resetting the stream fails
+	 */
+	private static int readIntoBuffer(InputStream input, int bufferSize, byte[] charsetBuffer)
+			throws IOException {
+		input.mark(bufferSize);
+		int offset = 0;
+		while(offset < bufferSize) {
+			int bytesRead = input.read(charsetBuffer, offset, bufferSize - offset);
+			if(bytesRead == -1) break;
+			offset += bytesRead;
+		}
+		input.reset();
+		return offset;
+	}
+
 	public static String detectCharset(byte[] input, int length, FilterMIMEType handler, String maybeCharset) throws IOException {
 		// Detect charset
 		String charset = detectBOM(input, length);
diff --git a/test/freenet/client/filter/ContentFilterTest.java b/test/freenet/client/filter/ContentFilterTest.java
index a2a861920d..0180a7c780 100644
--- a/test/freenet/client/filter/ContentFilterTest.java
+++ b/test/freenet/client/filter/ContentFilterTest.java
@@ -380,6 +380,24 @@ public void testEvilCharset() throws IOException {
 		}
 	}
 
+	@Test
+	public void charsetDetectionUsesUTF8DefaultForEmptyText() throws IOException {
+		String s = "";
+		byte[] buf = s.getBytes(StandardCharsets.UTF_8);
+		ArrayBucket out = new ArrayBucket();
+		FilterStatus fo = ContentFilter.filter(new ArrayBucket(buf).getInputStream(), out.getOutputStream(), "text/plain", null, null, null);
+		assertTrue("utf-8".equals(fo.charset));
+	}
+
+	@Test
+	public void charsetDetectionUsesBomForText() throws IOException {
+		// Feed the raw signature bytes: a String round trip via the platform charset would re-encode them.
+		byte[] buf = CSSReadFilter.utf16be.clone();
+		ArrayBucket out = new ArrayBucket();
+		FilterStatus fo = ContentFilter.filter(new ArrayBucket(buf).getInputStream(), out.getOutputStream(), "text/plain", null, null, null);
+		assertTrue("UTF-16BE".equals(fo.charset));
+	}
+
 	public static String htmlFilter(String data) throws Exception {
 		if (data.startsWith("