Fix the segment logic so that it no longer splits surrogate pairs in half and encodes them incorrectly (#1474)

vitorpamplona · web-flow · commit b4acc47d4c01 · 2025-09-19T11:05:15.000-07:00
diff --git a/src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java b/src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java
@@ -1344,6 +1344,11 @@ private final void _writeStringSegments(String text, boolean addQuotes) throws I
 
         while (left > 0) {
             int len = Math.min(_outputMaxContiguous, left);
+            // [core#1473]: avoid splitting surrogates between two segments.
+            // if len == 1 (edge case) don't apply to avoid infinite loop
+            if (len > 1 && _isStartOfSurrogatePair(text.charAt(offset + len-1))) {
+                --len;
+            }
             if ((_outputTail + len) > _outputEnd) { // caller must ensure enough space
                 _flushBuffer();
             }
@@ -1370,6 +1375,11 @@ private final void _writeStringSegments(char[] cbuf, int offset, int totalLen) t
     {
         do {
             int len = Math.min(_outputMaxContiguous, totalLen);
+            // [core#1473]: avoid splitting surrogates between two segments.
+            // if len == 1 (edge case) don't apply to avoid infinite loop
+            if (len > 1 && _isStartOfSurrogatePair(cbuf[offset + len-1])) {
+                --len;
+            }
             if ((_outputTail + len) > _outputEnd) { // caller must ensure enough space
                 _flushBuffer();
             }
@@ -1383,6 +1393,11 @@ private final void _writeStringSegments(String text, int offset, int totalLen) t
     {
         do {
             int len = Math.min(_outputMaxContiguous, totalLen);
+            // [core#1473]: avoid splitting surrogates between two segments.
+            // if len == 1 (edge case) don't apply to avoid infinite loop
+            if (len > 1 && _isStartOfSurrogatePair(text.charAt(offset + len-1))) {
+                --len;
+            }
             if ((_outputTail + len) > _outputEnd) { // caller must ensure enough space
                 _flushBuffer();
             }
@@ -1869,6 +1884,11 @@ private final void _writeUTF8Segments(byte[] utf8, int offset, int totalLen)
     {
         do {
             int len = Math.min(_outputMaxContiguous, totalLen);
+            // [core#1473]: avoid splitting surrogates between two segments.
+            // if len == 1 (edge case) don't apply to avoid infinite loop
+            if (len > 1 && _isStartOfSurrogatePair(utf8[offset + len-1])) {
+                --len;
+            }
             _writeUTF8Segment(utf8, offset, len);
             offset += len;
             totalLen -= len;
diff --git a/src/test/java/com/fasterxml/jackson/core/write/SurrogateWrite223Test.java b/src/test/java/com/fasterxml/jackson/core/write/SurrogateWrite223Test.java
@@ -3,6 +3,7 @@
 import java.io.ByteArrayOutputStream;
 import java.io.StringWriter;
 import java.io.Writer;
+import java.nio.charset.StandardCharsets;
 
 import org.junit.jupiter.api.Test;
 
@@ -17,6 +18,10 @@ class SurrogateWrite223Test extends JUnit5TestBase
 {
     private final JsonFactory DEFAULT_JSON_F = newStreamFactory();
 
+    private final JsonFactory SURROGATE_COMBINING_JSON_F = JsonFactory.builder()
+            .enable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
+            .build();
+
     // for [core#223]
     @Test
     void surrogatesDefaultSetting() throws Exception {
@@ -35,9 +40,7 @@ void surrogatesByteBacked() throws Exception
 
         out = new ByteArrayOutputStream();
 
-        JsonFactory f = JsonFactory.builder()
-                .enable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
-                .build();
+        JsonFactory f = SURROGATE_COMBINING_JSON_F;
         g = f.createGenerator(out);
         g.writeStartArray();
         g.writeString(toQuote);
@@ -96,9 +99,7 @@ void surrogatesCharBacked() throws Exception
     //https://github.com/FasterXML/jackson-core/issues/1359
     @Test
     void checkNonSurrogates() throws Exception {
-        JsonFactory f = JsonFactory.builder()
-                .enable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
-                .build();
+        JsonFactory f = SURROGATE_COMBINING_JSON_F;
         ByteArrayOutputStream out = new ByteArrayOutputStream();
         try (JsonGenerator gen = f.createGenerator(out)) {
             gen.writeStartObject();
@@ -126,9 +127,7 @@ void checkNonSurrogates() throws Exception {
 
     @Test
     void checkSurrogateWithCharacterEscapes() throws Exception {
-        JsonFactory f = JsonFactory.builder()
-                .enable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
-                .build();
+        JsonFactory f = SURROGATE_COMBINING_JSON_F;
         f.setCharacterEscapes(JsonpCharacterEscapes.instance());
         ByteArrayOutputStream out = new ByteArrayOutputStream();
         try (JsonGenerator gen = f.createGenerator(out)) {
@@ -140,4 +139,37 @@ void checkSurrogateWithCharacterEscapes() throws Exception {
         String json = out.toString("UTF-8");
         assertEquals("{\"test_emoji\":\"\uD83D\uDE0A\"}", json);
     }
+
+    // Regression test for https://github.com/FasterXML/jackson-core/issues/1473
+    @Test
+    void surrogateCharSplitInTwoSegments() throws Exception
+    {
+        // UTF8JsonGenerator must never place the two halves of a surrogate
+        // pair in different output segments. The split exercised here is the
+        // one between the second and third segment, so that offset and length
+        // bookkeeping across several segments is covered as well.
+        // By default a segment holds 1000 chars, so VALUE is 2001 chars long:
+        // 1999 fillers, high surrogate at index 1999, low surrogate at index 2000.
+        int count = 1999;
+        char[] chars = new char[count];
+        java.util.Arrays.fill(chars, 'x');
+        String base = new String(chars);
+
+        final String VALUE = base + "\uD83E\uDEE1";
+
+        ByteArrayOutputStream bb = new ByteArrayOutputStream();
+        try (JsonGenerator g = SURROGATE_COMBINING_JSON_F.createGenerator(bb)) {
+            g.enable(JsonGenerator.Feature.COMBINE_UNICODE_SURROGATES_IN_UTF8); // NOTE(review): looks redundant, the factory already enables this feature — confirm
+    
+            g.writeStartArray();
+            g.writeString(VALUE);
+            g.writeEndArray();
+        }
+
+        String result = new String(bb.toByteArray(), StandardCharsets.UTF_8);
+
+        // Skip the leading [" and the `count` filler chars, and drop the trailing "]:
+        // what remains must be the emoji 🫡, i.e. the surrogate pair "\uD83E\uDEE1".
+        assertEquals("\uD83E\uDEE1", result.substring(count+2, result.length()-2));
+    }
 }