Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Provide text with UTF-8 MIME Type by default #970

Open
wants to merge 3 commits into
base: next
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/freenet/client/filter/CSSReadFilter.java
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,11 @@ static byte[] parse(String s) {

@Override
public BOMDetection getCharsetByBOM(byte[] input, int length) throws DataFilterException, IOException {
return detectCharsetFromBOM(input, length);
}

public static BOMDetection detectCharsetFromBOM(byte[] input, int length)
throws UnsupportedCharsetInFilterException {
if(ContentFilter.startsWith(input, ascii, length))
return new BOMDetection("UTF-8", true);
if(ContentFilter.startsWith(input, utf16be, length))
Expand Down
39 changes: 28 additions & 11 deletions src/freenet/client/filter/ContentFilter.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ public class ContentFilter {

/** The HTML mime types are defined here, to allow other modules to identify it*/
public static final String[] HTML_MIME_TYPES=new String[]{"text/html", "application/xhtml+xml", "text/xml+xhtml", "text/xhtml", "application/xhtml"};
private static final int CHARSET_DETECTION_FALLBACK_BUFFERSIZE = 64;

private static volatile boolean logMINOR;
private static volatile boolean logMINOR;
static {
Logger.registerLogThresholdCallback(new LogThresholdCallback(){
@Override
Expand All @@ -54,7 +55,7 @@ public static void init() {
register(new FilterMIMEType("text/plain", "txt", new String[0], new String[] { "text", "pot" },
true, true, null, false, false, false, false, false, false,
l10n("textPlainReadAdvice"),
true, "US-ASCII", null, false));
true, "utf-8", null, false));

// GIF - has a filter
register(new FilterMIMEType("image/gif", "gif", new String[0], new String[0],
Expand Down Expand Up @@ -343,16 +344,8 @@ public static FilterStatus filter(InputStream input, OutputStream output, String
if(handler.readFilter != null) {
if(handler.takesACharset && ((charset == null) || (charset.isEmpty()))) {
int bufferSize = handler.charsetExtractor.getCharsetBufferSize();
input.mark(bufferSize);
byte[] charsetBuffer = new byte[bufferSize];
int bytesRead = 0, offset = 0, toread=0;
while(true) {
toread = bufferSize - offset;
bytesRead = input.read(charsetBuffer, offset, toread);
if(bytesRead == -1 || toread == 0) break;
offset += bytesRead;
}
input.reset();
int offset = readIntoBuffer(input, bufferSize, charsetBuffer);
charset = detectCharset(charsetBuffer, offset, handler, maybeCharset);
}
try {
Expand All @@ -374,6 +367,16 @@ public static FilterStatus filter(InputStream input, OutputStream output, String
}

if(handler.safeToRead) {
if(handler.takesACharset && ((charset == null) || (charset.isEmpty()))) {
byte[] charsetBuffer = new byte[CHARSET_DETECTION_FALLBACK_BUFFERSIZE];
int offset = readIntoBuffer(input, CHARSET_DETECTION_FALLBACK_BUFFERSIZE, charsetBuffer);
BOMDetection bom = CSSReadFilter.detectCharsetFromBOM(charsetBuffer, CHARSET_DETECTION_FALLBACK_BUFFERSIZE);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I’m pretty sure this is 100% wrong. That method detects an encoding from the representation of the string @charset. It is also gloriously misnamed as it has nothing to do with a BOM. 😀

if (bom != null) {
charset = bom.charset;
} else if (handler.defaultCharset != null){
charset = handler.defaultCharset;
}
}
FileUtil.copy(input, output, -1);
output.flush();
return new FilterStatus(charset, typeName);
Expand All @@ -384,6 +387,20 @@ public static FilterStatus filter(InputStream input, OutputStream output, String
return null;
}

private static int readIntoBuffer(InputStream input, int bufferSize, byte[] charsetBuffer)
throws IOException {
input.mark(bufferSize);
int bytesRead = 0, offset = 0, toread=0;
while(true) {
toread = bufferSize - offset;
bytesRead = input.read(charsetBuffer, offset, toread);
if(bytesRead == -1 || toread == 0) break;
offset += bytesRead;
}
input.reset();
return offset;
}

public static String detectCharset(byte[] input, int length, FilterMIMEType handler, String maybeCharset) throws IOException {
// Detect charset
String charset = detectBOM(input, length);
Expand Down
31 changes: 31 additions & 0 deletions test/freenet/client/filter/ContentFilterTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,37 @@ public void testEvilCharset() throws IOException {
}
}


@Test
public void byteOrderMarkForUtf8IsDetectedCorrectly() throws IOException {
byte[] buf = { (byte) 0xef, (byte) 0xbb, (byte) 0xbf, 0x40 };
ArrayBucket out = new ArrayBucket();
FilterStatus fo = ContentFilter.filter(new ArrayBucket(buf).getInputStream(), out.getOutputStream(), "text/plain", null, null, null);
assertTrue("utf-8".equals(fo.charset));
}

@Test
public void byteOrderMarkForUtf16BeIsDetectedCorrectly() throws IOException {
byte[] buf = { (byte) 0xfe, (byte) 0xff, 0x00, 0x40 };
ArrayBucket out = new ArrayBucket();
FilterStatus fo = ContentFilter.filter(new ArrayBucket(buf).getInputStream(), out.getOutputStream(), "text/plain", null, null, null);
assertTrue("UTF-16BE".equals(fo.charset));
}

@Test
public void byteOrderMarkForUtf16LeIsDetectedCorrectly() throws IOException {
byte[] buf = { (byte) 0xff, (byte) 0xfe, 0x40, 0x00 };
ArrayBucket out = new ArrayBucket();
FilterStatus fo = ContentFilter.filter(
new ArrayBucket(buf).getInputStream(),
out.getOutputStream(),
"text/plain",
null,
null,
null);
assertTrue("UTF-16LE".equals(fo.charset));
}

public static String htmlFilter(String data) throws Exception {
if (data.startsWith("<html")) return htmlFilter(data, false);
if (data.startsWith("<?")) return htmlFilter(data, false);
Expand Down