Skip to content

Commit b4acc47

Browse files
Fix the segment logic so that surrogate pairs are not split in half and are encoded correctly (#1474)
1 parent 4975a1e commit b4acc47

File tree

2 files changed

+61
-9
lines changed

2 files changed

+61
-9
lines changed

src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1344,6 +1344,11 @@ private final void _writeStringSegments(String text, boolean addQuotes) throws I
13441344

13451345
while (left > 0) {
13461346
int len = Math.min(_outputMaxContiguous, left);
1347+
// [core#1473]: avoid splitting surrogates between two segments.
1348+
// Skip when len == 1: shrinking the segment to 0 would make no progress (infinite loop).
1349+
if (len > 1 && _isStartOfSurrogatePair(text.charAt(offset + len-1))) {
1350+
--len;
1351+
}
13471352
if ((_outputTail + len) > _outputEnd) { // caller must ensure enough space
13481353
_flushBuffer();
13491354
}
@@ -1370,6 +1375,11 @@ private final void _writeStringSegments(char[] cbuf, int offset, int totalLen) t
13701375
{
13711376
do {
13721377
int len = Math.min(_outputMaxContiguous, totalLen);
1378+
// [core#1473]: avoid splitting surrogates between two segments.
1379+
// Skip when len == 1: shrinking the segment to 0 would make no progress (infinite loop).
1380+
if (len > 1 && _isStartOfSurrogatePair(cbuf[offset + len-1])) {
1381+
--len;
1382+
}
13731383
if ((_outputTail + len) > _outputEnd) { // caller must ensure enough space
13741384
_flushBuffer();
13751385
}
@@ -1383,6 +1393,11 @@ private final void _writeStringSegments(String text, int offset, int totalLen) t
13831393
{
13841394
do {
13851395
int len = Math.min(_outputMaxContiguous, totalLen);
1396+
// [core#1473]: avoid splitting surrogates between two segments.
1397+
// Skip when len == 1: shrinking the segment to 0 would make no progress (infinite loop).
1398+
if (len > 1 && _isStartOfSurrogatePair(text.charAt(offset + len-1))) {
1399+
--len;
1400+
}
13861401
if ((_outputTail + len) > _outputEnd) { // caller must ensure enough space
13871402
_flushBuffer();
13881403
}
@@ -1869,6 +1884,11 @@ private final void _writeUTF8Segments(byte[] utf8, int offset, int totalLen)
18691884
{
18701885
do {
18711886
int len = Math.min(_outputMaxContiguous, totalLen);
1887+
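// [core#1473]: avoid splitting surrogates between two segments.
1888+
// Skip when len == 1: shrinking the segment to 0 would make no progress (infinite loop).
// NOTE(review): `utf8` is a byte[] of already-encoded content, not UTF-16 chars; confirm
// that _isStartOfSurrogatePair() can ever match a byte value here, else this check is a no-op.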
1889+
if (len > 1 && _isStartOfSurrogatePair(utf8[offset + len-1])) {
1890+
--len;
1891+
}
18721892
_writeUTF8Segment(utf8, offset, len);
18731893
offset += len;
18741894
totalLen -= len;

src/test/java/com/fasterxml/jackson/core/write/SurrogateWrite223Test.java

Lines changed: 41 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import java.io.ByteArrayOutputStream;
44
import java.io.StringWriter;
55
import java.io.Writer;
6+
import java.nio.charset.StandardCharsets;
67

78
import org.junit.jupiter.api.Test;
89

@@ -17,6 +18,10 @@ class SurrogateWrite223Test extends JUnit5TestBase
1718
{
1819
private final JsonFactory DEFAULT_JSON_F = newStreamFactory();
1920

21+
private final JsonFactory SURROGATE_COMBINING_JSON_F = JsonFactory.builder()
22+
.enable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
23+
.build();
24+
2025
// for [core#223]
2126
@Test
2227
void surrogatesDefaultSetting() throws Exception {
@@ -35,9 +40,7 @@ void surrogatesByteBacked() throws Exception
3540

3641
out = new ByteArrayOutputStream();
3742

38-
JsonFactory f = JsonFactory.builder()
39-
.enable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
40-
.build();
43+
JsonFactory f = SURROGATE_COMBINING_JSON_F;
4144
g = f.createGenerator(out);
4245
g.writeStartArray();
4346
g.writeString(toQuote);
@@ -96,9 +99,7 @@ void surrogatesCharBacked() throws Exception
9699
//https://github.com/FasterXML/jackson-core/issues/1359
97100
@Test
98101
void checkNonSurrogates() throws Exception {
99-
JsonFactory f = JsonFactory.builder()
100-
.enable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
101-
.build();
102+
JsonFactory f = SURROGATE_COMBINING_JSON_F;
102103
ByteArrayOutputStream out = new ByteArrayOutputStream();
103104
try (JsonGenerator gen = f.createGenerator(out)) {
104105
gen.writeStartObject();
@@ -126,9 +127,7 @@ void checkNonSurrogates() throws Exception {
126127

127128
@Test
128129
void checkSurrogateWithCharacterEscapes() throws Exception {
129-
JsonFactory f = JsonFactory.builder()
130-
.enable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
131-
.build();
130+
JsonFactory f = SURROGATE_COMBINING_JSON_F;
132131
f.setCharacterEscapes(JsonpCharacterEscapes.instance());
133132
ByteArrayOutputStream out = new ByteArrayOutputStream();
134133
try (JsonGenerator gen = f.createGenerator(out)) {
@@ -140,4 +139,37 @@ void checkSurrogateWithCharacterEscapes() throws Exception {
140139
String json = out.toString("UTF-8");
141140
assertEquals("{\"test_emoji\":\"\uD83D\uDE0A\"}", json);
142141
}
142+
143+
//https://github.com/FasterXML/jackson-core/issues/1473
144+
@Test
145+
void surrogateCharSplitInTwoSegments() throws Exception
146+
{
147+
// UTF8JsonGenerator must avoid splitting surrogate chars
148+
// into separate segments. We want to test the third segment
149+
// split to make sure indexes, offsets, etc are all correct.
150+
// By default, segments are split every 1000 chars.
151+
// Thus, we need a string of length 2001 where the surrogate pair is
152+
// at positions 2000 and 2001 (1-based), i.e. char indexes 1999 and 2000.
153+
int count = 1999;
154+
char[] chars = new char[count];
155+
java.util.Arrays.fill(chars, 'x');
156+
String base = new String(chars);
157+
158+
final String VALUE = base + "\uD83E\uDEE1";
159+
160+
ByteArrayOutputStream bb = new ByteArrayOutputStream();
161+
try (JsonGenerator g = SURROGATE_COMBINING_JSON_F.createGenerator(bb)) {
162+
g.enable(JsonGenerator.Feature.COMBINE_UNICODE_SURROGATES_IN_UTF8);
163+
164+
g.writeStartArray();
165+
g.writeString(VALUE);
166+
g.writeEndArray();
167+
}
168+
169+
String result = new String(bb.toByteArray(), StandardCharsets.UTF_8);
170+
171+
// +2 and -2 to remove array and quotes: result should contain ["xxxx....🫡"]
172+
// "\uD83E\uDEE1" is the combined surrogate form of the emoji
173+
assertEquals("\uD83E\uDEE1", result.substring(count+2, result.length()-2));
174+
}
143175
}

0 commit comments

Comments
 (0)