Skip to content

Commit

Permalink
Actually detect charset with plain text, and add test
Browse files Browse the repository at this point in the history
  • Loading branch information
ArneBab committed Nov 8, 2024
1 parent e7436a3 commit 85013fb
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 10 deletions.
5 changes: 5 additions & 0 deletions src/freenet/client/filter/CSSReadFilter.java
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,11 @@ static byte[] parse(String s) {

/**
 * Instance entry point for charset detection from the leading bytes of the
 * data. All of the work is done by the static
 * {@code detectCharsetFromBOM(byte[], int)}, so that other callers can run
 * the same detection without an instance of this filter.
 *
 * @param input  buffer holding the first bytes of the data
 * @param length length value handed through unchanged to the static detector
 * @return whatever the static detector returns for these arguments
 */
@Override
public BOMDetection getCharsetByBOM(byte[] input, int length) throws DataFilterException, IOException {
	final BOMDetection detection = detectCharsetFromBOM(input, length);
	return detection;
}

public static BOMDetection detectCharsetFromBOM(byte[] input, int length)
throws UnsupportedCharsetInFilterException {
if(ContentFilter.startsWith(input, ascii, length))
return new BOMDetection("UTF-8", true);
if(ContentFilter.startsWith(input, utf16be, length))
Expand Down
37 changes: 27 additions & 10 deletions src/freenet/client/filter/ContentFilter.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ public class ContentFilter {

/** The HTML mime types are defined here, to allow other modules to identify it*/
public static final String[] HTML_MIME_TYPES=new String[]{"text/html", "application/xhtml+xml", "text/xml+xhtml", "text/xhtml", "application/xhtml"};
private static final int CHARSET_DETECTION_FALLBACK_BUFFERSIZE = 64;

private static volatile boolean logMINOR;
private static volatile boolean logMINOR;
static {
Logger.registerLogThresholdCallback(new LogThresholdCallback(){
@Override
Expand Down Expand Up @@ -343,16 +344,8 @@ public static FilterStatus filter(InputStream input, OutputStream output, String
if(handler.readFilter != null) {
if(handler.takesACharset && ((charset == null) || (charset.isEmpty()))) {
int bufferSize = handler.charsetExtractor.getCharsetBufferSize();
input.mark(bufferSize);
byte[] charsetBuffer = new byte[bufferSize];
int bytesRead = 0, offset = 0, toread=0;
while(true) {
toread = bufferSize - offset;
bytesRead = input.read(charsetBuffer, offset, toread);
if(bytesRead == -1 || toread == 0) break;
offset += bytesRead;
}
input.reset();
int offset = readIntoBuffer(input, bufferSize, charsetBuffer);
charset = detectCharset(charsetBuffer, offset, handler, maybeCharset);
}
try {
Expand All @@ -374,6 +367,16 @@ public static FilterStatus filter(InputStream input, OutputStream output, String
}

if(handler.safeToRead) {
if(handler.takesACharset && ((charset == null) || (charset.isEmpty()))) {
byte[] charsetBuffer = new byte[CHARSET_DETECTION_FALLBACK_BUFFERSIZE];
int offset = readIntoBuffer(input, CHARSET_DETECTION_FALLBACK_BUFFERSIZE, charsetBuffer);
BOMDetection bom = CSSReadFilter.detectCharsetFromBOM(charsetBuffer, CHARSET_DETECTION_FALLBACK_BUFFERSIZE);
if (bom != null) {
charset = bom.charset;
} else if (handler.defaultCharset != null){
charset = handler.defaultCharset;
}
}
FileUtil.copy(input, output, -1);
output.flush();
return new FilterStatus(charset, typeName);
Expand All @@ -384,6 +387,20 @@ public static FilterStatus filter(InputStream input, OutputStream output, String
return null;
}

/**
 * Peeks at the start of a stream: marks the current position, fills
 * {@code charsetBuffer} with up to {@code bufferSize} bytes, then resets the
 * stream to the mark so the same bytes can be read again by the caller.
 * The stream is expected to support mark/reset.
 *
 * @param input         stream to peek into
 * @param bufferSize    maximum number of bytes to read; also used as the mark read limit
 * @param charsetBuffer destination for the bytes read, filled from index 0
 * @return the number of bytes actually stored in {@code charsetBuffer}
 * @throws IOException if reading from or resetting the stream fails
 */
private static int readIntoBuffer(InputStream input, int bufferSize, byte[] charsetBuffer)
		throws IOException {
	input.mark(bufferSize);
	int filled = 0;
	for (;;) {
		final int remaining = bufferSize - filled;
		final int count = input.read(charsetBuffer, filled, remaining);
		// Stop once the buffer is full or the stream has ended.
		if (remaining == 0 || count == -1) {
			break;
		}
		filled += count;
	}
	input.reset();
	return filled;
}

public static String detectCharset(byte[] input, int length, FilterMIMEType handler, String maybeCharset) throws IOException {
// Detect charset
String charset = detectBOM(input, length);
Expand Down
18 changes: 18 additions & 0 deletions test/freenet/client/filter/ContentFilterTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,24 @@ public void testEvilCharset() throws IOException {
}
}

/**
 * Filters an empty text/plain payload with no charset supplied and expects
 * the reported charset to be "utf-8".
 */
@Test
public void charsetDetectionUsesUTF8DefaultForEmptyText() throws IOException {
	byte[] emptyPayload = "".getBytes(StandardCharsets.UTF_8);
	ArrayBucket sink = new ArrayBucket();
	FilterStatus status = ContentFilter.filter(
			new ArrayBucket(emptyPayload).getInputStream(),
			sink.getOutputStream(),
			"text/plain",
			null,
			null,
			null);
	assertTrue("utf-8".equals(status.charset));
}

/**
 * Filters a text/plain payload that consists of the UTF-16BE detection
 * signature from {@link CSSReadFilter} and expects the reported charset to
 * be "UTF-16BE".
 */
@Test
public void charsetDetectionUsesBomForText() throws IOException {
	// Feed the signature bytes directly. Decoding them with the platform
	// default charset and re-encoding as UTF-8 could alter the bytes and make
	// the test depend on the JVM's default charset.
	byte[] buf = CSSReadFilter.utf16be.clone();
	ArrayBucket out = new ArrayBucket();
	FilterStatus fo = ContentFilter.filter(new ArrayBucket(buf).getInputStream(), out.getOutputStream(), "text/plain", null, null, null);
	assertTrue("expected charset UTF-16BE but was " + fo.charset, "UTF-16BE".equals(fo.charset));
}

public static String htmlFilter(String data) throws Exception {
if (data.startsWith("<html")) return htmlFilter(data, false);
if (data.startsWith("<?")) return htmlFilter(data, false);
Expand Down

0 comments on commit 85013fb

Please sign in to comment.