-
Notifications
You must be signed in to change notification settings - Fork 3.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Backport PR #16968 to 8.16: Fix BufferedTokenizer to properly resume …
…after a buffer full condition respecting the encoding of the input string (#16968) (#17021) Backport PR #16968 to 8.16 branch, original message: ---- Permit to use the tokenizer effectively also in contexts where a line is bigger than a limit. Fixes an issue related to the token size limit error: when the offending token was bigger than the input fragment, it happened that the tokenizer was unable to recover the token stream from the first delimiter after the offending token, and instead mangled the stream, losing part of the tokens. ## How this solves the problem This is a second attempt to fix the processing of tokens from the tokenizer after a buffer full error. The first attempt, #16482, was rolled back due to the encoding error #16694: it failed to return the tokens in the same encoding as the input. This PR does a couple of things: - accumulates the tokens, so that after a full condition it can resume with the next tokens after the offending one. - respects the encoding of the input string. It uses the `concat` method instead of `addAll`, which avoids converting RubyString to String and back to RubyString. When returning the head `StringBuilder`, it enforces the encoding with the input charset. (cherry picked from commit 1c8cf54) Co-authored-by: Andrea Selva <[email protected]>
- Loading branch information
1 parent
32e6def
commit 002d489
Showing
4 changed files
with
426 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
161 changes: 161 additions & 0 deletions
161
logstash-core/src/test/java/org/logstash/common/BufferedTokenizerExtTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,161 @@ | ||
/* | ||
* Licensed to Elasticsearch B.V. under one or more contributor | ||
* license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright | ||
* ownership. Elasticsearch B.V. licenses this file to you under | ||
* the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.logstash.common; | ||
|
||
import org.jruby.RubyArray; | ||
import org.jruby.RubyEncoding; | ||
import org.jruby.RubyString; | ||
import org.jruby.runtime.ThreadContext; | ||
import org.jruby.runtime.builtin.IRubyObject; | ||
import org.junit.Before; | ||
import org.junit.Test; | ||
import org.logstash.RubyTestBase; | ||
import org.logstash.RubyUtil; | ||
|
||
import java.util.List; | ||
|
||
import static org.junit.Assert.assertEquals; | ||
import static org.junit.Assert.assertTrue; | ||
import static org.logstash.RubyUtil.RUBY; | ||
|
||
@SuppressWarnings("unchecked") | ||
public final class BufferedTokenizerExtTest extends RubyTestBase { | ||
|
||
private BufferedTokenizerExt sut; | ||
private ThreadContext context; | ||
|
||
@Before | ||
public void setUp() { | ||
sut = new BufferedTokenizerExt(RubyUtil.RUBY, RubyUtil.BUFFERED_TOKENIZER); | ||
context = RUBY.getCurrentContext(); | ||
IRubyObject[] args = {}; | ||
sut.init(context, args); | ||
} | ||
|
||
@Test | ||
public void shouldTokenizeASingleToken() { | ||
RubyArray<RubyString> tokens = (RubyArray<RubyString>) sut.extract(context, RubyUtil.RUBY.newString("foo\n")); | ||
|
||
assertEquals(List.of("foo"), tokens); | ||
} | ||
|
||
@Test | ||
public void shouldMergeMultipleToken() { | ||
RubyArray<RubyString> tokens = (RubyArray<RubyString>) sut.extract(context, RubyUtil.RUBY.newString("foo")); | ||
assertTrue(tokens.isEmpty()); | ||
|
||
tokens = (RubyArray<RubyString>) sut.extract(context, RubyUtil.RUBY.newString("bar\n")); | ||
assertEquals(List.of("foobar"), tokens); | ||
} | ||
|
||
@Test | ||
public void shouldTokenizeMultipleToken() { | ||
RubyArray<RubyString> tokens = (RubyArray<RubyString>) sut.extract(context, RubyUtil.RUBY.newString("foo\nbar\n")); | ||
|
||
assertEquals(List.of("foo", "bar"), tokens); | ||
} | ||
|
||
@Test | ||
public void shouldIgnoreEmptyPayload() { | ||
RubyArray<RubyString> tokens = (RubyArray<RubyString>) sut.extract(context, RubyUtil.RUBY.newString("")); | ||
assertTrue(tokens.isEmpty()); | ||
|
||
tokens = (RubyArray<RubyString>) sut.extract(context, RubyUtil.RUBY.newString("foo\nbar")); | ||
assertEquals(List.of("foo"), tokens); | ||
} | ||
|
||
@Test | ||
public void shouldTokenizeEmptyPayloadWithNewline() { | ||
RubyArray<RubyString> tokens = (RubyArray<RubyString>) sut.extract(context, RubyUtil.RUBY.newString("\n")); | ||
assertEquals(List.of(""), tokens); | ||
|
||
tokens = (RubyArray<RubyString>) sut.extract(context, RubyUtil.RUBY.newString("\n\n\n")); | ||
assertEquals(List.of("", "", ""), tokens); | ||
} | ||
|
||
@Test | ||
public void shouldNotChangeEncodingOfTokensAfterPartitioning() { | ||
RubyString rubyString = RubyString.newString(RUBY, new byte[]{(byte) 0xA3, 0x0A, 0x41}); // £ character, newline, A | ||
IRubyObject rubyInput = rubyString.force_encoding(context, RUBY.newString("ISO8859-1")); | ||
RubyArray<RubyString> tokens = (RubyArray<RubyString>)sut.extract(context, rubyInput); | ||
|
||
// read the first token, the £ string | ||
IRubyObject firstToken = tokens.shift(context); | ||
assertEquals("£", firstToken.toString()); | ||
|
||
// verify encoding "ISO8859-1" is preserved in the Java to Ruby String conversion | ||
RubyEncoding encoding = (RubyEncoding) firstToken.callMethod(context, "encoding"); | ||
assertEquals("ISO-8859-1", encoding.toString()); | ||
} | ||
|
||
@Test | ||
public void shouldNotChangeEncodingOfTokensAfterPartitioningInCaseMultipleExtractionInInvoked() { | ||
RubyString rubyString = RubyString.newString(RUBY, new byte[]{(byte) 0xA3}); // £ character | ||
IRubyObject rubyInput = rubyString.force_encoding(context, RUBY.newString("ISO8859-1")); | ||
sut.extract(context, rubyInput); | ||
IRubyObject capitalAInLatin1 = RubyString.newString(RUBY, new byte[]{(byte) 0x41}) | ||
.force_encoding(context, RUBY.newString("ISO8859-1")); | ||
RubyArray<RubyString> tokens = (RubyArray<RubyString>)sut.extract(context, capitalAInLatin1); | ||
assertTrue(tokens.isEmpty()); | ||
|
||
tokens = (RubyArray<RubyString>)sut.extract(context, RubyString.newString(RUBY, new byte[]{(byte) 0x0A})); | ||
|
||
// read the first token, the £ string | ||
IRubyObject firstToken = tokens.shift(context); | ||
assertEquals("£A", firstToken.toString()); | ||
|
||
// verify encoding "ISO8859-1" is preserved in the Java to Ruby String conversion | ||
RubyEncoding encoding = (RubyEncoding) firstToken.callMethod(context, "encoding"); | ||
assertEquals("ISO-8859-1", encoding.toString()); | ||
} | ||
|
||
@Test | ||
public void shouldNotChangeEncodingOfTokensAfterPartitioningWhenRetrieveLastFlushedToken() { | ||
RubyString rubyString = RubyString.newString(RUBY, new byte[]{(byte) 0xA3, 0x0A, 0x41}); // £ character, newline, A | ||
IRubyObject rubyInput = rubyString.force_encoding(context, RUBY.newString("ISO8859-1")); | ||
RubyArray<RubyString> tokens = (RubyArray<RubyString>)sut.extract(context, rubyInput); | ||
|
||
// read the first token, the £ string | ||
IRubyObject firstToken = tokens.shift(context); | ||
assertEquals("£", firstToken.toString()); | ||
|
||
// flush and check that the remaining A is still encoded in ISO8859-1 | ||
IRubyObject lastToken = sut.flush(context); | ||
assertEquals("A", lastToken.toString()); | ||
|
||
// verify encoding "ISO8859-1" is preserved in the Java to Ruby String conversion | ||
RubyEncoding encoding = (RubyEncoding) lastToken.callMethod(context, "encoding"); | ||
assertEquals("ISO-8859-1", encoding.toString()); | ||
} | ||
|
||
@Test | ||
public void givenDirectFlushInvocationUTF8EncodingIsApplied() { | ||
RubyString rubyString = RubyString.newString(RUBY, new byte[]{(byte) 0xA3, 0x41}); // £ character, A | ||
IRubyObject rubyInput = rubyString.force_encoding(context, RUBY.newString("ISO8859-1")); | ||
|
||
// flush and check that the remaining A is still encoded in ISO8859-1 | ||
IRubyObject lastToken = sut.flush(context); | ||
assertEquals("", lastToken.toString()); | ||
|
||
// verify encoding "ISO8859-1" is preserved in the Java to Ruby String conversion | ||
RubyEncoding encoding = (RubyEncoding) lastToken.callMethod(context, "encoding"); | ||
assertEquals("UTF-8", encoding.toString()); | ||
} | ||
} |
66 changes: 66 additions & 0 deletions
66
logstash-core/src/test/java/org/logstash/common/BufferedTokenizerExtWithDelimiterTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
/* | ||
* Licensed to Elasticsearch B.V. under one or more contributor | ||
* license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright | ||
* ownership. Elasticsearch B.V. licenses this file to you under | ||
* the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.logstash.common; | ||
|
||
import org.jruby.RubyArray; | ||
import org.jruby.RubyString; | ||
import org.jruby.runtime.ThreadContext; | ||
import org.jruby.runtime.builtin.IRubyObject; | ||
import org.junit.Before; | ||
import org.junit.Test; | ||
import org.logstash.RubyTestBase; | ||
import org.logstash.RubyUtil; | ||
|
||
import java.util.List; | ||
|
||
import static org.junit.Assert.assertEquals; | ||
import static org.junit.Assert.assertTrue; | ||
import static org.logstash.RubyUtil.RUBY; | ||
|
||
@SuppressWarnings("unchecked") | ||
public final class BufferedTokenizerExtWithDelimiterTest extends RubyTestBase { | ||
|
||
private BufferedTokenizerExt sut; | ||
private ThreadContext context; | ||
|
||
@Before | ||
public void setUp() { | ||
sut = new BufferedTokenizerExt(RubyUtil.RUBY, RubyUtil.BUFFERED_TOKENIZER); | ||
context = RUBY.getCurrentContext(); | ||
IRubyObject[] args = {RubyUtil.RUBY.newString("||")}; | ||
sut.init(context, args); | ||
} | ||
|
||
@Test | ||
public void shouldTokenizeMultipleToken() { | ||
RubyArray<RubyString> tokens = (RubyArray<RubyString>) sut.extract(context, RubyUtil.RUBY.newString("foo||b|r||")); | ||
|
||
assertEquals(List.of("foo", "b|r"), tokens); | ||
} | ||
|
||
@Test | ||
public void shouldIgnoreEmptyPayload() { | ||
RubyArray<RubyString> tokens = (RubyArray<RubyString>) sut.extract(context, RubyUtil.RUBY.newString("")); | ||
assertTrue(tokens.isEmpty()); | ||
|
||
tokens = (RubyArray<RubyString>) sut.extract(context, RubyUtil.RUBY.newString("foo||bar")); | ||
assertEquals(List.of("foo"), tokens); | ||
} | ||
} |
Oops, something went wrong.