Change RopeByteString's Utf8 validity behavior to avoid needing to have our Utf8 validation library handle validation in the face of straddling surrogate sequences (where it has to handle cases where a given byte sequence is invalid UTF8 but _could_ be valid when concatenated with another byte sequence).

protobuf-github-bot · copybara-github · commit 3159b1b84fea · 2025-11-07T10:31:34.000-08:00
PiperOrigin-RevId: 829490247
diff --git a/java/core/src/main/java/com/google/protobuf/ByteString.java b/java/core/src/main/java/com/google/protobuf/ByteString.java
@@ -22,8 +22,6 @@
 import java.io.Serializable;
 import java.io.UnsupportedEncodingException;
 import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-import java.nio.InvalidMarkException;
 import java.nio.charset.Charset;
 import java.nio.charset.UnsupportedCharsetException;
 import java.util.ArrayList;
@@ -957,23 +955,6 @@ public final String toStringUtf8() {
    */
   public abstract boolean isValidUtf8();
 
-  /**
-   * Tells whether the given byte sequence is a well-formed, malformed, or incomplete UTF-8 byte
-   * sequence. This method accepts and returns a partial state result, allowing the bytes for a
-   * complete UTF-8 byte sequence to be composed from multiple {@code ByteString} segments.
-   *
-   * @param state either {@code 0} (if this is the initial decoding operation) or the value returned
-   *     from a call to a partial decoding method for the previous bytes
-   * @param offset offset of the first byte to check
-   * @param length number of bytes to check
-   * @return {@code -1} if the partial byte sequence is definitely malformed, {@code 0} if it is
-   *     well-formed (no additional input needed), or, if the byte sequence is "incomplete", i.e.
-   *     apparently terminated in the middle of a character, an opaque integer "state" value
-   *     containing enough information to decode the character when passed to a subsequent
-   *     invocation of a partial decoding method.
-   */
-  protected abstract int partialIsValidUtf8(int state, int offset, int length);
-
   // =================================================================
   // equals() and hashCode()
 
@@ -1563,11 +1544,6 @@ public boolean isValidUtf8() {
       return Utf8.isValidUtf8(bytes);
     }
 
-    @Override
-    protected int partialIsValidUtf8(int state, int offset, int length) {
-      return Utf8.partialIsValidUtf8(state, bytes, offset, offset + length);
-    }
-
     // =================================================================
     // equals() and hashCode()
 
@@ -1754,12 +1730,6 @@ public boolean isValidUtf8() {
       return Utf8.isValidUtf8(bytes, offset, offset + length);
     }
 
-    @Override
-    protected int partialIsValidUtf8(int state, int offset, int length) {
-      int index = this.offset + offset;
-      return Utf8.partialIsValidUtf8(state, bytes, index, index + length);
-    }
-
     @Override
     protected boolean equalsInternal(ByteString other) {
       // If the other side is a LiteralByteString or BoundedByteString, implement equals by doing
diff --git a/java/core/src/main/java/com/google/protobuf/RopeByteString.java b/java/core/src/main/java/com/google/protobuf/RopeByteString.java
@@ -454,23 +454,41 @@ protected String toStringInternal(Charset charset) {
 
   @Override
   public boolean isValidUtf8() {
-    int leftPartial = left.partialIsValidUtf8(Utf8.COMPLETE, 0, leftLength);
-    int state = right.partialIsValidUtf8(leftPartial, 0, right.size());
-    return state == Utf8.COMPLETE;
+    // If every piece is valid UTF-8, then the concatenation of them is also valid UTF-8. Almost
+    // always when this method is called this will be the case.
+    if (allPiecesValidUtf8()) {
+      return true;
+    }
+
+    // There were some individual pieces that were not valid UTF-8. Almost always this will mean
+    // the total string is not valid UTF-8, but it is possible that some pieces were invalid only
+    // due to a multi-byte sequence split across piece boundaries, which concatenation makes valid.
+    // We fall back to building the complete byte[] and checking if it is valid UTF-8. This is
+    // expensive but will be executed extremely rarely, and in the rare scenario this is executed
+    // the check will nearly always return false, which will result in an exception being
+    // thrown up the stack, so the real performance implications of this slow check are small.
+    //
+    // There are a number of conditions that could be additionally checked here that could
+    // better optimize for detecting definitely-invalid cases, since the only way that concatenation
+    // helps is in cases of some contiguous span of N invalid pieces. In theory this could just
+    // concatenate and check only those spans, but since this is a very cold path, we do the
+    // simplest thing and check the entire byte array.
+    return Utf8.isValidUtf8(toByteArray());
   }
 
-  @Override
-  protected int partialIsValidUtf8(int state, int offset, int length) {
-    int toIndex = offset + length;
-    if (toIndex <= leftLength) {
-      return left.partialIsValidUtf8(state, offset, length);
-    } else if (offset >= leftLength) {
-      return right.partialIsValidUtf8(state, offset - leftLength, length);
-    } else {
-      int leftLength = this.leftLength - offset;
-      int leftPartial = left.partialIsValidUtf8(state, offset, leftLength);
-      return right.partialIsValidUtf8(leftPartial, 0, length - leftLength);
+  /**
+   * Returns true if all pieces in this rope individually are valid UTF-8. If this returns true,
+   * then the top level Rope is also valid UTF-8. If this returns false, it probably is invalid but
+   * may be valid when pieces are concatenated.
+   */
+  private boolean allPiecesValidUtf8() {
+    PieceIterator pieces = new PieceIterator(this);
+    while (pieces.hasNext()) {
+      if (!pieces.next().isValidUtf8()) {
+        return false;
+      }
     }
+    return true;
   }
 
   // =================================================================
diff --git a/java/core/src/main/java/com/google/protobuf/Utf8.java b/java/core/src/main/java/com/google/protobuf/Utf8.java
@@ -130,31 +130,11 @@ static boolean isValidUtf8(byte[] bytes) {
    * Returns {@code true} if the given byte array slice is a well-formed UTF-8 byte sequence. The
    * range of bytes to be checked extends from index {@code index}, inclusive, to {@code limit},
    * exclusive.
-   *
-   * <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(bytes, index, limit)
-   * == Utf8.COMPLETE}.
    */
   static boolean isValidUtf8(byte[] bytes, int index, int limit) {
     return processor.isValidUtf8(bytes, index, limit);
   }
 
-  /**
-   * Tells whether the given byte array slice is a well-formed, malformed, or incomplete UTF-8 byte
-   * sequence. The range of bytes to be checked extends from index {@code index}, inclusive, to
-   * {@code limit}, exclusive.
-   *
-   * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding operation) or the
-   *     value returned from a call to a partial decoding method for the previous bytes
-   * @return {@link #MALFORMED} if the partial byte sequence is definitely not well-formed, {@link
-   *     #COMPLETE} if it is well-formed (no additional input needed), or if the byte sequence is
-   *     "incomplete", i.e. apparently terminated in the middle of a character, an opaque integer
-   *     "state" value containing enough information to decode the character when passed to a
-   *     subsequent invocation of a partial decoding method.
-   */
-  static int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) {
-    return processor.partialIsValidUtf8(state, bytes, index, limit);
-  }
-
   private static int incompleteStateFor(int byte1) {
     return (byte1 > (byte) 0xF4) ? MALFORMED : byte1;
   }
@@ -292,19 +272,6 @@ static boolean isValidUtf8(ByteBuffer buffer) {
     return processor.isValidUtf8(buffer, buffer.position(), buffer.remaining());
   }
 
-  /**
-   * Determines if the given {@link ByteBuffer} is a partially valid UTF-8 string.
-   *
-   * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct)
-   * and the capabilities of the platform.
-   *
-   * @param buffer the buffer to check.
-   * @see Utf8#partialIsValidUtf8(int, byte[], int, int)
-   */
-  static int partialIsValidUtf8(int state, ByteBuffer buffer, int index, int limit) {
-    return processor.partialIsValidUtf8(state, buffer, index, limit);
-  }
-
   /**
    * Decodes the given UTF-8 portion of the {@link ByteBuffer} into a {@link String}.
    *
@@ -388,7 +355,7 @@ final boolean isValidUtf8(byte[] bytes, int index, int limit) {
      *     "state" value containing enough information to decode the character when passed to a
      *     subsequent invocation of a partial decoding method.
      */
-    abstract int partialIsValidUtf8(int state, byte[] bytes, int index, int limit);
+    protected abstract int partialIsValidUtf8(int state, byte[] bytes, int index, int limit);
 
     /**
      * Returns {@code true} if the given portion of the {@link ByteBuffer} is a well-formed UTF-8
@@ -420,15 +387,15 @@ final int partialIsValidUtf8(
     }
 
     /** Performs validation for direct {@link ByteBuffer} instances. */
-    abstract int partialIsValidUtf8Direct(
+    protected abstract int partialIsValidUtf8Direct(
         final int state, final ByteBuffer buffer, int index, final int limit);
 
     /**
      * Performs validation for {@link ByteBuffer} instances using the {@link ByteBuffer} API rather
      * than potentially faster approaches. This first completes validation for the current character
      * (provided by {@code state}) and then finishes validation for the sequence.
      */
-    final int partialIsValidUtf8Default(
+    protected final int partialIsValidUtf8Default(
         final int state, final ByteBuffer buffer, int index, final int limit) {
       if (state != COMPLETE) {
         // The previous decoding operation was incomplete (or malformed).
@@ -783,7 +750,7 @@ final void encodeUtf8(String in, ByteBuffer out) {
   /** {@link Processor} implementation that does not use any {@code sun.misc.Unsafe} methods. */
   static final class SafeProcessor extends Processor {
     @Override
-    int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) {
+    protected int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) {
       if (state != COMPLETE) {
         // The previous decoding operation was incomplete (or malformed).
         // We look for a well-formed sequence consisting of bytes from
@@ -871,7 +838,7 @@ int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) {
     }
 
     @Override
-    int partialIsValidUtf8Direct(int state, ByteBuffer buffer, int index, int limit) {
+    protected int partialIsValidUtf8Direct(int state, ByteBuffer buffer, int index, int limit) {
       // For safe processing, we have to use the ByteBuffer API.
       return partialIsValidUtf8Default(state, buffer, index, limit);
     }
@@ -1163,7 +1130,7 @@ static boolean isAvailable() {
     }
 
     @Override
-    int partialIsValidUtf8(int state, byte[] bytes, final int index, final int limit) {
+    protected int partialIsValidUtf8(int state, byte[] bytes, final int index, final int limit) {
       // Bitwise OR combines the sign bits so any negative value fails the check.
       if ((index | limit | bytes.length - limit) < 0) {
         throw new ArrayIndexOutOfBoundsException(
@@ -1258,7 +1225,7 @@ int partialIsValidUtf8(int state, byte[] bytes, final int index, final int limit
     }
 
     @Override
-    int partialIsValidUtf8Direct(
+    protected int partialIsValidUtf8Direct(
         final int state, ByteBuffer buffer, final int index, final int limit) {
       // Bitwise OR combines the sign bits so any negative value fails the check.
       if ((index | limit | buffer.limit() - limit) < 0) {