Skip to content

Commit 3159b1b

Browse files
Change RopeByteString's Utf8 validity behavior to avoid needing to have our Utf8 validation library handle validation in the face of straddling surrogate sequences (where it has to handle cases where a given byte sequence is invalid UTF8 but _could_ be valid when concatenated with another byte sequence).
PiperOrigin-RevId: 829490247
1 parent cba9ec3 commit 3159b1b

File tree

3 files changed

+39
-84
lines changed

3 files changed

+39
-84
lines changed

java/core/src/main/java/com/google/protobuf/ByteString.java

Lines changed: 0 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,6 @@
2222
import java.io.Serializable;
2323
import java.io.UnsupportedEncodingException;
2424
import java.nio.ByteBuffer;
25-
import java.nio.ByteOrder;
26-
import java.nio.InvalidMarkException;
2725
import java.nio.charset.Charset;
2826
import java.nio.charset.UnsupportedCharsetException;
2927
import java.util.ArrayList;
@@ -957,23 +955,6 @@ public final String toStringUtf8() {
957955
*/
958956
public abstract boolean isValidUtf8();
959957

960-
/**
961-
* Tells whether the given byte sequence is a well-formed, malformed, or incomplete UTF-8 byte
962-
* sequence. This method accepts and returns a partial state result, allowing the bytes for a
963-
* complete UTF-8 byte sequence to be composed from multiple {@code ByteString} segments.
964-
*
965-
* @param state either {@code 0} (if this is the initial decoding operation) or the value returned
966-
* from a call to a partial decoding method for the previous bytes
967-
* @param offset offset of the first byte to check
968-
* @param length number of bytes to check
969-
* @return {@code -1} if the partial byte sequence is definitely malformed, {@code 0} if it is
970-
* well-formed (no additional input needed), or, if the byte sequence is "incomplete", i.e.
971-
* apparently terminated in the middle of a character, an opaque integer "state" value
972-
* containing enough information to decode the character when passed to a subsequent
973-
* invocation of a partial decoding method.
974-
*/
975-
protected abstract int partialIsValidUtf8(int state, int offset, int length);
976-
977958
// =================================================================
978959
// equals() and hashCode()
979960

@@ -1563,11 +1544,6 @@ public boolean isValidUtf8() {
15631544
return Utf8.isValidUtf8(bytes);
15641545
}
15651546

1566-
@Override
1567-
protected int partialIsValidUtf8(int state, int offset, int length) {
1568-
return Utf8.partialIsValidUtf8(state, bytes, offset, offset + length);
1569-
}
1570-
15711547
// =================================================================
15721548
// equals() and hashCode()
15731549

@@ -1754,12 +1730,6 @@ public boolean isValidUtf8() {
17541730
return Utf8.isValidUtf8(bytes, offset, offset + length);
17551731
}
17561732

1757-
@Override
1758-
protected int partialIsValidUtf8(int state, int offset, int length) {
1759-
int index = this.offset + offset;
1760-
return Utf8.partialIsValidUtf8(state, bytes, index, index + length);
1761-
}
1762-
17631733
@Override
17641734
protected boolean equalsInternal(ByteString other) {
17651735
// If the other side is a LiteralByteString or BoundedByteString, implement equals by doing

java/core/src/main/java/com/google/protobuf/RopeByteString.java

Lines changed: 32 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -454,23 +454,41 @@ protected String toStringInternal(Charset charset) {
454454

455455
@Override
456456
public boolean isValidUtf8() {
457-
int leftPartial = left.partialIsValidUtf8(Utf8.COMPLETE, 0, leftLength);
458-
int state = right.partialIsValidUtf8(leftPartial, 0, right.size());
459-
return state == Utf8.COMPLETE;
457+
// If every piece is valid UTF-8, then the concatenation of them is also valid UTF-8. Almost
458+
// always when this method is called this will be the case.
459+
if (allPiecesValidUtf8()) {
460+
return true;
461+
}
462+
463+
// There were some individual pieces that were not valid UTF-8. Almost always this will mean
464+
// the total string is not valid UTF-8, but it is possible that some pieces were invalid only
465+
// due to leading-or-trailing surrogates and that concatenation will make them valid.
466+
// We fall back to building the complete byte[] and checking if it is valid UTF-8. This is
467+
// expensive but will be executed extremely rarely, and in the rare scenario this is executed
468+
// the check will nearly always return false, which will lead to an result in an exception
469+
// thrown up the stack, so the real performance implications of this slow check are small.
470+
//
471+
// There are a number of conditions that could be additionally checked here that could
472+
// better optimize for detecting definitely-invalid cases, since the only way that concatenation
473+
// helps is in cases of some contiguous span of N invalid pieces. In theory this could just
474+
// concatenate and check only those spans, but since this is a very cold path, we do the
475+
// simplest thing and check the entire byte array.
476+
return Utf8.isValidUtf8(toByteArray());
460477
}
461478

462-
@Override
463-
protected int partialIsValidUtf8(int state, int offset, int length) {
464-
int toIndex = offset + length;
465-
if (toIndex <= leftLength) {
466-
return left.partialIsValidUtf8(state, offset, length);
467-
} else if (offset >= leftLength) {
468-
return right.partialIsValidUtf8(state, offset - leftLength, length);
469-
} else {
470-
int leftLength = this.leftLength - offset;
471-
int leftPartial = left.partialIsValidUtf8(state, offset, leftLength);
472-
return right.partialIsValidUtf8(leftPartial, 0, length - leftLength);
479+
/**
480+
* Returns true if all pieces in this rope individually are valid UTF-8. If this returns true,
481+
* then the top level Rope is also valid UTF-8. If this returns false, it probably is invalid but
482+
* may be valid when pieces are concatenated.
483+
*/
484+
private boolean allPiecesValidUtf8() {
485+
PieceIterator pieces = new PieceIterator(this);
486+
while (pieces.hasNext()) {
487+
if (!pieces.next().isValidUtf8()) {
488+
return false;
489+
}
473490
}
491+
return true;
474492
}
475493

476494
// =================================================================

java/core/src/main/java/com/google/protobuf/Utf8.java

Lines changed: 7 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -130,31 +130,11 @@ static boolean isValidUtf8(byte[] bytes) {
130130
* Returns {@code true} if the given byte array slice is a well-formed UTF-8 byte sequence. The
131131
* range of bytes to be checked extends from index {@code index}, inclusive, to {@code limit},
132132
* exclusive.
133-
*
134-
* <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(bytes, index, limit)
135-
* == Utf8.COMPLETE}.
136133
*/
137134
static boolean isValidUtf8(byte[] bytes, int index, int limit) {
138135
return processor.isValidUtf8(bytes, index, limit);
139136
}
140137

141-
/**
142-
* Tells whether the given byte array slice is a well-formed, malformed, or incomplete UTF-8 byte
143-
* sequence. The range of bytes to be checked extends from index {@code index}, inclusive, to
144-
* {@code limit}, exclusive.
145-
*
146-
* @param state either {@link Utf8#COMPLETE} (if this is the initial decoding operation) or the
147-
* value returned from a call to a partial decoding method for the previous bytes
148-
* @return {@link #MALFORMED} if the partial byte sequence is definitely not well-formed, {@link
149-
* #COMPLETE} if it is well-formed (no additional input needed), or if the byte sequence is
150-
* "incomplete", i.e. apparently terminated in the middle of a character, an opaque integer
151-
* "state" value containing enough information to decode the character when passed to a
152-
* subsequent invocation of a partial decoding method.
153-
*/
154-
static int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) {
155-
return processor.partialIsValidUtf8(state, bytes, index, limit);
156-
}
157-
158138
private static int incompleteStateFor(int byte1) {
159139
return (byte1 > (byte) 0xF4) ? MALFORMED : byte1;
160140
}
@@ -292,19 +272,6 @@ static boolean isValidUtf8(ByteBuffer buffer) {
292272
return processor.isValidUtf8(buffer, buffer.position(), buffer.remaining());
293273
}
294274

295-
/**
296-
* Determines if the given {@link ByteBuffer} is a partially valid UTF-8 string.
297-
*
298-
* <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct)
299-
* and the capabilities of the platform.
300-
*
301-
* @param buffer the buffer to check.
302-
* @see Utf8#partialIsValidUtf8(int, byte[], int, int)
303-
*/
304-
static int partialIsValidUtf8(int state, ByteBuffer buffer, int index, int limit) {
305-
return processor.partialIsValidUtf8(state, buffer, index, limit);
306-
}
307-
308275
/**
309276
* Decodes the given UTF-8 portion of the {@link ByteBuffer} into a {@link String}.
310277
*
@@ -388,7 +355,7 @@ final boolean isValidUtf8(byte[] bytes, int index, int limit) {
388355
* "state" value containing enough information to decode the character when passed to a
389356
* subsequent invocation of a partial decoding method.
390357
*/
391-
abstract int partialIsValidUtf8(int state, byte[] bytes, int index, int limit);
358+
protected abstract int partialIsValidUtf8(int state, byte[] bytes, int index, int limit);
392359

393360
/**
394361
* Returns {@code true} if the given portion of the {@link ByteBuffer} is a well-formed UTF-8
@@ -420,15 +387,15 @@ final int partialIsValidUtf8(
420387
}
421388

422389
/** Performs validation for direct {@link ByteBuffer} instances. */
423-
abstract int partialIsValidUtf8Direct(
390+
protected abstract int partialIsValidUtf8Direct(
424391
final int state, final ByteBuffer buffer, int index, final int limit);
425392

426393
/**
427394
* Performs validation for {@link ByteBuffer} instances using the {@link ByteBuffer} API rather
428395
* than potentially faster approaches. This first completes validation for the current character
429396
* (provided by {@code state}) and then finishes validation for the sequence.
430397
*/
431-
final int partialIsValidUtf8Default(
398+
protected final int partialIsValidUtf8Default(
432399
final int state, final ByteBuffer buffer, int index, final int limit) {
433400
if (state != COMPLETE) {
434401
// The previous decoding operation was incomplete (or malformed).
@@ -783,7 +750,7 @@ final void encodeUtf8(String in, ByteBuffer out) {
783750
/** {@link Processor} implementation that does not use any {@code sun.misc.Unsafe} methods. */
784751
static final class SafeProcessor extends Processor {
785752
@Override
786-
int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) {
753+
protected int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) {
787754
if (state != COMPLETE) {
788755
// The previous decoding operation was incomplete (or malformed).
789756
// We look for a well-formed sequence consisting of bytes from
@@ -871,7 +838,7 @@ int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) {
871838
}
872839

873840
@Override
874-
int partialIsValidUtf8Direct(int state, ByteBuffer buffer, int index, int limit) {
841+
protected int partialIsValidUtf8Direct(int state, ByteBuffer buffer, int index, int limit) {
875842
// For safe processing, we have to use the ByteBuffer API.
876843
return partialIsValidUtf8Default(state, buffer, index, limit);
877844
}
@@ -1163,7 +1130,7 @@ static boolean isAvailable() {
11631130
}
11641131

11651132
@Override
1166-
int partialIsValidUtf8(int state, byte[] bytes, final int index, final int limit) {
1133+
protected int partialIsValidUtf8(int state, byte[] bytes, final int index, final int limit) {
11671134
// Bitwise OR combines the sign bits so any negative value fails the check.
11681135
if ((index | limit | bytes.length - limit) < 0) {
11691136
throw new ArrayIndexOutOfBoundsException(
@@ -1258,7 +1225,7 @@ int partialIsValidUtf8(int state, byte[] bytes, final int index, final int limit
12581225
}
12591226

12601227
@Override
1261-
int partialIsValidUtf8Direct(
1228+
protected int partialIsValidUtf8Direct(
12621229
final int state, ByteBuffer buffer, final int index, final int limit) {
12631230
// Bitwise OR combines the sign bits so any negative value fails the check.
12641231
if ((index | limit | buffer.limit() - limit) < 0) {

0 commit comments

Comments
 (0)