diff --git a/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java b/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java index 0bbe8fcdf..5bc01fda4 100644 --- a/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java +++ b/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java @@ -5,7 +5,9 @@ import java.math.BigInteger; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Stack; import com.fasterxml.jackson.core.*; import com.fasterxml.jackson.core.base.ParserMinimalBase; @@ -13,7 +15,9 @@ import com.fasterxml.jackson.core.io.NumberInput; import com.fasterxml.jackson.core.json.DupDetector; import com.fasterxml.jackson.core.sym.ByteQuadsCanonicalizer; -import com.fasterxml.jackson.core.util.*; +import com.fasterxml.jackson.core.util.ByteArrayBuilder; +import com.fasterxml.jackson.core.util.JacksonFeatureSet; +import com.fasterxml.jackson.core.util.TextBuffer; import static com.fasterxml.jackson.dataformat.cbor.CBORConstants.*; @@ -2289,10 +2293,9 @@ protected void _finishToken() throws IOException if ((available >= len) // if not, could we read? NOTE: we do not require it, just attempt to read - || ((_inputBuffer.length >= len) - && _tryToLoadToHaveAtLeast(len))) { - _finishShortText(len); - return; + || _tryToLoadToHaveAtLeast(len)) { + _finishShortText(len); + return; } // If not enough space, need handling similar to chunked _finishLongText(len); @@ -2331,11 +2334,9 @@ protected String _finishTextToken(int ch) throws IOException // due to inputBuffer never being even close to that big). final int available = _inputEnd - _inputPtr; - if ((available >= len) // if not, could we read? 
NOTE: we do not require it, just attempt to read - || ((_inputBuffer.length >= len) - && _tryToLoadToHaveAtLeast(len))) { + || _tryToLoadToHaveAtLeast(len)) { return _finishShortText(len); } // If not enough space, need handling similar to chunked @@ -2364,19 +2365,22 @@ private final String _finishShortText(int len) throws IOException // Let's actually do a tight loop for ASCII first: final int end = _inputPtr; - - int i; - while ((i = inputBuf[inPtr]) >= 0) { + int i = 0; + while (inPtr < end && i >= 0) { + i = inputBuf[inPtr++]; outBuf[outPtr++] = (char) i; - if (++inPtr == end) { - String str = _textBuffer.setCurrentAndReturn(outPtr); - if (stringRefs != null) { - stringRefs.stringRefs.add(str); - _sharedString = str; - } - return str; + } + if (inPtr == end && i >= 0) { + String str = _textBuffer.setCurrentAndReturn(outPtr); + if (stringRefs != null) { + stringRefs.stringRefs.add(str); + _sharedString = str; } + return str; } + // Correct extra increments + outPtr -= 1; + inPtr -= 1; final int[] codes = UTF8_UNIT_CODES; do { i = inputBuf[inPtr++] & 0xFF; @@ -2443,10 +2447,17 @@ private final String _finishShortText(int len) throws IOException private final String _finishLongText(int len) throws IOException { - char[] outBuf = _textBuffer.emptyAndGetCurrentSegment(); - int outPtr = 0; - final int[] codes = UTF8_UNIT_CODES; + StringRefList stringRefs = null; + if (!_stringRefs.empty() && + shouldReferenceString(_stringRefs.peek().stringRefs.size(), len)) { + stringRefs = _stringRefs.peek(); + } + // First a tight loop for ASCII. 
+ len = _finishLongTextAscii(len); + char[] outBuf = _textBuffer.getBufferWithoutReset(); + int outPtr = _textBuffer.getCurrentSegmentSize(); int outEnd = outBuf.length; + final int[] codes = UTF8_UNIT_CODES; while (--len >= 0) { int c = _nextByte() & 0xFF; @@ -2500,14 +2511,51 @@ private final String _finishLongText(int len) throws IOException outBuf[outPtr++] = (char) c; } String str = _textBuffer.setCurrentAndReturn(outPtr); - if (!_stringRefs.empty() && - shouldReferenceString(_stringRefs.peek().stringRefs.size(), len)) { - _stringRefs.peek().stringRefs.add(str); + if (stringRefs != null) { + stringRefs.stringRefs.add(str); _sharedString = str; } return str; } + /** + * Consumes as many ASCII chars as possible in a tight loop. Returns the number of bytes remaining. + */ + private final int _finishLongTextAscii(int len) throws IOException + { + char[] outBuf = _textBuffer.emptyAndGetCurrentSegment(); + final byte[] input = _inputBuffer; + while (len > 0) { + // load as much input as possible + int size = Math.min(len, Math.min(outBuf.length, input.length)); + if (!_tryToLoadToHaveAtLeast(size)) { + return len; + } + int outEnd = size; + int outPtr = 0; + int inPtr = _inputPtr; + int i = 0; + // Tight loop to copy into the output buffer, bail if a non-ascii char is found + while (outPtr < outEnd && i >= 0) { + i = input[inPtr++]; + outBuf[outPtr++] = (char) i; + } + // Found a non-ascii char, correct pointers and return to the caller. 
+ if (i < 0) { + --outPtr; + _inputPtr = inPtr - 1; + _textBuffer.setCurrentLength(outPtr); + return len - outPtr; + } + _inputPtr = inPtr; + if (outPtr >= outBuf.length) { + outBuf = _textBuffer.finishCurrentSegment(); + } + len -= size; + } + return len; + } + private final void _finishChunkedText() throws IOException { char[] outBuf = _textBuffer.emptyAndGetCurrentSegment(); @@ -2532,7 +2580,6 @@ private final void _finishChunkedText() throws IOException } break; } - _chunkLeft = len; int end = _inputPtr + len; if (end <= _inputEnd) { // all within buffer _chunkLeft = 0; @@ -2541,19 +2588,22 @@ private final void _finishChunkedText() throws IOException _chunkLeft = (end - _inputEnd); _chunkEnd = _inputEnd; } - } - // besides of which just need to ensure there's content - if (_inputPtr >= _inputEnd) { // end of buffer, but not necessarily chunk - loadMoreGuaranteed(); - int end = _inputPtr + _chunkLeft; - if (end <= _inputEnd) { // all within buffer - _chunkLeft = 0; - _chunkEnd = end; - } else { // stretches beyond - _chunkLeft = (end - _inputEnd); - _chunkEnd = _inputEnd; + // start of a new chunk + // First a tight loop for ASCII. 
+ _textBuffer.setCurrentLength(outPtr); + if (_finishChunkedTextAscii()) { + // chunk fully consumed, let's get the next one + outBuf = _textBuffer.getBufferWithoutReset(); + outPtr = _textBuffer.getCurrentSegmentSize(); + outEnd = outBuf.length; + continue; } + outBuf = _textBuffer.getBufferWithoutReset(); + outPtr = _textBuffer.getCurrentSegmentSize(); + outEnd = outBuf.length; } + // besides of which just need to ensure there's content + _loadMoreForChunkIfNeeded(); } int c = input[_inputPtr++] & 0xFF; int code = codes[c]; @@ -2563,9 +2613,9 @@ private final void _finishChunkedText() throws IOException } switch (code) { - case 0: - break; - case 1: // 2-byte UTF + case 0: + break; + case 1: // 2-byte UTF { int d = _nextChunkedByte(); if ((d & 0xC0) != 0x080) { @@ -2574,24 +2624,24 @@ private final void _finishChunkedText() throws IOException c = ((c & 0x1F) << 6) | (d & 0x3F); } break; - case 2: // 3-byte UTF - c = _decodeChunkedUTF8_3(c); - break; - case 3: // 4-byte UTF - c = _decodeChunkedUTF8_4(c); - // Let's add first part right away: - if (outPtr >= outBuf.length) { - outBuf = _textBuffer.finishCurrentSegment(); - outPtr = 0; - outEnd = outBuf.length; - } - outBuf[outPtr++] = (char) (0xD800 | (c >> 10)); - c = 0xDC00 | (c & 0x3FF); - // And let the other char output down below - break; - default: - // Is this good enough error message? - _reportInvalidInitial(c); + case 2: // 3-byte UTF + c = _decodeChunkedUTF8_3(c); + break; + case 3: // 4-byte UTF + c = _decodeChunkedUTF8_4(c); + // Let's add first part right away: + if (outPtr >= outBuf.length) { + outBuf = _textBuffer.finishCurrentSegment(); + outPtr = 0; + outEnd = outBuf.length; + } + outBuf[outPtr++] = (char) (0xD800 | (c >> 10)); + c = 0xDC00 | (c & 0x3FF); + // And let the other char output down below + break; + default: + // Is this good enough error message? + _reportInvalidInitial(c); } // Need more room? 
if (outPtr >= outEnd) { @@ -2602,9 +2652,75 @@ private final void _finishChunkedText() throws IOException // Ok, let's add char to output: outBuf[outPtr++] = (char) c; } + _textBuffer.setCurrentLength(outPtr); } + /** + * Reads ASCII text in a tight loop until a non-ASCII char is found. If one is found, it returns false to signal to the + * caller that the chunk wasn't finished; the caller then keeps appending to outBuf at the outPtr position to + * finish the current text buffer segment. Returns true once the chunk has been fully consumed. + */ + private final boolean _finishChunkedTextAscii() throws IOException + { + final byte[] input = _inputBuffer; + int outPtr = _textBuffer.getCurrentSegmentSize(); + char[] outBuf = _textBuffer.getBufferWithoutReset(); + int outEnd = outBuf.length; + while (true) { + // besides of which just need to ensure there's content + _loadMoreForChunkIfNeeded(); + + // Find the size of the loop + int inSize = _chunkEnd - _inputPtr; + int outSize = outEnd - outPtr; + int inputPtr = _inputPtr; + int inputPtrEnd = _inputPtr + Math.min(inSize, outSize); + int i = 0; + // loop with copying what we can. + while (inputPtr < inputPtrEnd && i >= 0) { + i = input[inputPtr++]; + char val = (char) i; + outBuf[outPtr++] = val; + } + _inputPtr = inputPtr; + + if (i < 0) { + // Found a non-ascii char, correct pointers and return to the caller. + _inputPtr -= 1; + _textBuffer.setCurrentLength(outPtr - 1); + // return false to signal this to the calling code to allow the multi-byte code-path to kick in. + return false; + } + // Need more room? 
+ if (outPtr >= outEnd) { + outBuf = _textBuffer.finishCurrentSegment(); + outPtr = 0; + outEnd = outBuf.length; + } + if (_inputPtr < _chunkEnd || _chunkLeft > 0) { + continue; + } + _textBuffer.setCurrentLength(outPtr); + return true; + } + } + + private final void _loadMoreForChunkIfNeeded() throws IOException + { + if (_inputPtr >= _inputEnd) { // end of buffer, but not necessarily chunk + loadMoreGuaranteed(); + int end = _inputPtr + _chunkLeft; + if (end <= _inputEnd) { // all within buffer + _chunkLeft = 0; + _chunkEnd = end; + } else { // stretches beyond + _chunkLeft = (end - _inputEnd); + _chunkEnd = _inputEnd; + } + } + } + private final int _nextByte() throws IOException { int inPtr = _inputPtr; if (inPtr < _inputEnd) { @@ -3716,6 +3832,10 @@ protected final boolean _tryToLoadToHaveAtLeast(int minAvailable) throws IOExcep if (_inputStream == null) { return false; } + // The code below assumes this is true, so we check it here. + if (_inputBuffer.length < minAvailable) { + return false; + } // Need to move remaining data in front? 
int amount = _inputEnd - _inputPtr; if (amount > 0 && _inputPtr > 0) { diff --git a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java index cd5cfb8c2..70ee0da61 100644 --- a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java +++ b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java @@ -216,6 +216,10 @@ protected static String generateUnicodeString(int length) { return generateUnicodeString(length, new Random(length)); } + protected static String generateUnicodeStringWithAsciiPrefix(int asciiPrefixLen, int length) { + return generateUnicodeStringWithAsciiPrefix(asciiPrefixLen, length, new Random(length)); + } + protected static String generateUnicodeString(int length, Random rnd) { StringBuilder sw = new StringBuilder(length+10); @@ -241,6 +245,31 @@ protected static String generateUnicodeString(int length, Random rnd) return sw.toString(); } + protected static String generateUnicodeStringWithAsciiPrefix(int asciiLength, int length, Random rnd) + { + StringBuilder sw = new StringBuilder(length+10); + // add a prefix of ascii chars + int num = asciiLength; + while (--num >= 0) { + sw.append((char) ('A' + (num % 32))); + } + do { + // Then a unicode char of 2, 3 or 4 bytes long + switch (rnd.nextInt() % 3) { + case 0: + sw.append((char) (256 + rnd.nextInt() & 511)); + break; + case 1: + sw.append((char) (2048 + rnd.nextInt() & 4095)); + break; + default: + sw.append((char) (65536 + rnd.nextInt() & 0x3FFF)); + break; + } + } while (sw.length() < length); + return sw.toString(); + } + protected static String generateLongAsciiString(int length) { return generateLongAsciiString(length, new Random(length)); } diff --git a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/parse/BasicParserTest.java b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/parse/BasicParserTest.java index dc287d8a1..4c02675a8 100644 --- 
a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/parse/BasicParserTest.java +++ b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/parse/BasicParserTest.java @@ -1,6 +1,7 @@ package com.fasterxml.jackson.dataformat.cbor.parse; import java.io.*; +import java.util.Arrays; import org.junit.jupiter.api.Test; @@ -62,12 +63,28 @@ public void testMediumText() throws Exception _testMedium(3900); } - private void _testMedium(int len) throws Exception + @Test + public void testMediumText2() throws Exception + { + for (int prefix : Arrays.asList(197, 198, 199, 200, 201, 497, 499, 500, 501)) { + _testMedium(prefix, 1300); + _testMedium(prefix, 1900); + _testMedium(prefix, 2300); + _testMedium(prefix, 3900); + } + } + + private void _testMedium(int len) throws Exception { + _testMedium(0, len); + } + + private void _testMedium(int asciiPrefixLen, int len) throws Exception { // First, use size that should fit in output buffer, but ByteArrayOutputStream out = new ByteArrayOutputStream(); CBORGenerator gen = cborGenerator(out); - final String MEDIUM = generateUnicodeString(len); + final String MEDIUM = asciiPrefixLen == 0 ? + generateUnicodeString(len) : generateUnicodeStringWithAsciiPrefix(asciiPrefixLen, len); gen.writeString(MEDIUM); gen.close(); @@ -165,6 +182,16 @@ public void testLongChunkedText() throws Exception _testLongChunkedText(generateUnicodeString(21000)); } + @Test + public void testLongChunkedText2() throws Exception + { + // The text buffer starting size is 200 bytes, let's cycle around that + // amount to verify the tight ascii loop. + for (int prefix = 194; prefix < 202; ++prefix) { + _testLongChunkedText(generateUnicodeStringWithAsciiPrefix(prefix, 21000)); + } + } + @SuppressWarnings("resource") public void _testLongChunkedText(String input) throws Exception {