Skip to content

Commit 497a7f0

Browse files
rzo1 and mawiesne authored
OPENNLP-421 - Replace String.intern() in StringList (#568)
* OPENNLP-421 - Adds simple JMH benchmark for "StringList" * OPENNLP-421 - Remove String interning from "StringList" * OPENNLP-421 - Make char array a constant * OPENNLP-421 - Add GC Profiler * OPENNLP-421 - adds StringDeduplicationBenchmark (jmh), adapted from A. Shipilëv for benchmarking on different operating systems / Java environments - fixes inconsistent path in build-helper-maven-plugin config for jmh test addition * OPENNLP-412 - Adjust Benchmark for StringList impl OPENNLP-412 - Add some interner implementations based on the examples of Aleksey Shipilëv. Introduces a mechanism similar to Hadoop for project-wide usage. OPENNLP-412 - Use our user provided StringInterner in StringList OPENNLP-412 - Update Interner Benchmarker with new OpenNLP classes. * OPENNLP-421 - Fix incomplete JavaDoc * OPENNLP-421 - Update exec plans for 2nd benchmark * OPENNLP-421 - Updates docs * OPENNLP-421 - Update JavaDoc --------- Co-authored-by: Martin Wiesner <[email protected]>
1 parent 186ecf9 commit 497a7f0

15 files changed

+637
-4
lines changed

opennlp-docs/src/docbkx/introduction.xml

+23
Original file line numberDiff line numberDiff line change
@@ -323,4 +323,27 @@ $ opennlp ToolNameEvaluator -model en-model-name.bin -lang en -data input.test -
323323
</section>
324324
</section>
325325

326+
<section id="intro.sysprops">
327+
<title>System Properties</title>
328+
<section id="intro.sysprops.interner">
329+
<title>String Interning</title>
330+
<para>
331+
OpenNLP provides different implementations for String interning to reduce
332+
memory footprint. By default, OpenNLP uses a custom String interner
333+
implementation.
334+
</para>
335+
<para>
336+
Users may override this default by setting the following system property:
337+
<screen>
338+
<![CDATA[
339+
-Dopennlp.interner.class=opennlp.tools.util.jvm.JvmStringInterner]]>
340+
</screen>
341+
</para>
342+
<para>
343+
In addition, users can provide custom String interner implementations by implementing
344+
the interface 'StringInterner' and specifying this class via 'opennlp.interner.class'.
345+
</para>
346+
</section>
347+
</section>
348+
326349
</chapter>

opennlp-tools/pom.xml

+43
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@
7777
<version>1.1.2</version>
7878
<scope>test</scope>
7979
</dependency>
80+
8081
</dependencies>
8182

8283
<build>
@@ -153,5 +154,47 @@
153154
</pluginManagement>
154155
</build>
155156
</profile>
157+
158+
<profile>
159+
<id>jmh</id>
160+
<dependencies>
161+
<dependency>
162+
<groupId>org.openjdk.jmh</groupId>
163+
<artifactId>jmh-core</artifactId>
164+
<version>${jmh.version}</version>
165+
<scope>test</scope>
166+
</dependency>
167+
168+
<dependency>
169+
<groupId>org.openjdk.jmh</groupId>
170+
<artifactId>jmh-generator-annprocess</artifactId>
171+
<version>${jmh.version}</version>
172+
<scope>test</scope>
173+
</dependency>
174+
</dependencies>
175+
<build>
176+
<plugins>
177+
<plugin>
178+
<groupId>org.codehaus.mojo</groupId>
179+
<artifactId>build-helper-maven-plugin</artifactId>
180+
<version>3.2.0</version>
181+
<executions>
182+
<execution>
183+
<id>add-test-source</id>
184+
<phase>generate-test-sources</phase>
185+
<goals>
186+
<goal>add-test-source</goal>
187+
</goals>
188+
<configuration>
189+
<sources>
190+
<source>src/jmh/java</source>
191+
</sources>
192+
</configuration>
193+
</execution>
194+
</executions>
195+
</plugin>
196+
</plugins>
197+
</build>
198+
</profile>
156199
</profiles>
157200
</project>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package opennlp.tools.util.jvm;
18+
19+
public class BenchmarkRunner {
20+
21+
public static void main(String[] args) throws Exception {
22+
org.openjdk.jmh.Main.main(args);
23+
}
24+
25+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package opennlp.tools.util.jvm;
18+
19+
import org.openjdk.jmh.annotations.Benchmark;
20+
import org.openjdk.jmh.annotations.Param;
21+
import org.openjdk.jmh.annotations.Scope;
22+
import org.openjdk.jmh.annotations.Setup;
23+
import org.openjdk.jmh.annotations.State;
24+
import org.openjdk.jmh.infra.Blackhole;
25+
26+
/**
27+
* A benchmark class / setup by Aleksey Shipilëv.
28+
* Resides here to investigate performance of String deduplication approaches
29+
* on different environments.
30+
* <p>
31+
* Origin:
32+
* <a href="https://shipilev.net/jvm/anatomy-quarks/10-string-intern/">
33+
* https://shipilev.net/jvm/anatomy-quarks/10-string-intern/</a>
34+
* <p>
35+
* His conclusion:<br>
36+
* "Do not use String.intern() without thinking very hard about it, okay?"
37+
*/
38+
@State(Scope.Benchmark)
39+
public class StringDeduplicationBenchmark {
40+
41+
@Param({"1", "100", "10000", "1000000"})
42+
private int size;
43+
44+
private JvmStringInterner str;
45+
private CHMStringInterner chm;
46+
private HMStringInterner hm;
47+
private CHMStringDeduplicator chmd05;
48+
private NoOpStringInterner noop;
49+
50+
@Setup
51+
public void setup() {
52+
str = new JvmStringInterner();
53+
chm = new CHMStringInterner();
54+
hm = new HMStringInterner();
55+
chmd05 = new CHMStringDeduplicator();
56+
noop = new NoOpStringInterner();
57+
}
58+
59+
@Benchmark
60+
public void intern(Blackhole bh) {
61+
for (int c = 0; c < size; c++) {
62+
bh.consume(str.intern("String" + c));
63+
}
64+
}
65+
66+
@Benchmark
67+
public void chm(Blackhole bh) {
68+
for (int c = 0; c < size; c++) {
69+
bh.consume(chm.intern("String" + c));
70+
}
71+
}
72+
73+
@Benchmark
74+
public void hm(Blackhole bh) {
75+
for (int c = 0; c < size; c++) {
76+
bh.consume(hm.intern("String" + c));
77+
}
78+
}
79+
80+
@Benchmark
81+
public void chmd05(Blackhole bh) {
82+
for (int c = 0; c < size; c++) {
83+
bh.consume(chmd05.intern("String" + c));
84+
}
85+
}
86+
87+
@Benchmark
88+
public void noop(Blackhole bh) {
89+
for (int c = 0; c < size; c++) {
90+
bh.consume(noop.intern("String" + c));
91+
}
92+
}
93+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package opennlp.tools.util.jvm;
18+
19+
import org.openjdk.jmh.annotations.Benchmark;
20+
import org.openjdk.jmh.infra.Blackhole;
21+
22+
import opennlp.tools.util.jvm.jmh.ExecutionPlan;
23+
import opennlp.tools.util.StringList;
24+
25+
/**
26+
* A benchmark class to test different implementation of {@link StringList} within OpenNLP
27+
*/
28+
public class StringListBenchmark {
29+
30+
@Benchmark
31+
public void newWithArrayConstructor(Blackhole blackhole, ExecutionPlan exec) {
32+
blackhole.consume(new StringList(exec.strings));
33+
}
34+
35+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package opennlp.tools.util.jvm.jmh;
18+
19+
import java.util.Random;
20+
21+
import org.openjdk.jmh.annotations.Level;
22+
import org.openjdk.jmh.annotations.Param;
23+
import org.openjdk.jmh.annotations.Scope;
24+
import org.openjdk.jmh.annotations.Setup;
25+
import org.openjdk.jmh.annotations.State;
26+
27+
@State(Scope.Benchmark)
28+
public class ExecutionPlan {
29+
30+
private static final String CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
31+
private static final Random RANDOM = new Random(42);
32+
33+
@Param({"1", "100", "10000", "1000000"})
34+
private int size;
35+
36+
@Param({"opennlp.tools.util.jvm.CHMStringDeduplicator",
37+
"opennlp.tools.util.jvm.CHMStringInterner",
38+
"opennlp.tools.util.jvm.HMStringInterner",
39+
"opennlp.tools.util.jvm.JvmStringInterner",
40+
"opennlp.tools.util.jvm.NoOpStringInterner"})
41+
private String internerClazz;
42+
43+
public String[] strings;
44+
45+
@Setup(Level.Iteration)
46+
public void setUp() {
47+
System.setProperty("opennlp.interner.class", internerClazz);
48+
49+
strings = new String[size];
50+
for (int i = 0; i < size; i++) {
51+
strings[i] = generateRandomString(15);
52+
}
53+
}
54+
55+
private static String generateRandomString(int length) {
56+
final StringBuilder randomString = new StringBuilder();
57+
58+
for (int i = 0; i < length; i++) {
59+
int index = RANDOM.nextInt(CHARS.length());
60+
randomString.append(CHARS.charAt(index));
61+
}
62+
63+
return randomString.toString();
64+
}
65+
}

opennlp-tools/src/main/java/opennlp/tools/util/StringList.java

+6-4
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
import java.util.NoSuchElementException;
2323
import java.util.Objects;
2424

25+
import opennlp.tools.util.jvm.StringInterners;
26+
2527
/**
2628
* A {@link StringList} is an immutable list of {@link String}s.
2729
*/
@@ -33,19 +35,19 @@ public class StringList implements Iterable<String> {
3335
* Initializes a {@link StringList} instance.
3436
* <p>
3537
* Note: <br>
36-
* Token String will be replaced by identical internal String object.
38+
* Token String will be interned via {@link StringInterners}.
3739
*
3840
* @param singleToken One single token
3941
*/
4042
public StringList(String singleToken) {
41-
tokens = new String[]{singleToken.intern()};
43+
tokens = new String[]{StringInterners.intern(singleToken)};
4244
}
4345

4446
/**
4547
* Initializes a {@link StringList} instance.
4648
* <p>
4749
* Note: <br>
48-
* Token Strings will be replaced by identical internal String object.
50+
* Token Strings will be interned via {@link StringInterners}.
4951
*
5052
* @param tokens The string parts of the new {@link StringList}.
5153
* Must not be an empty tokens array or {@code null}.
@@ -63,7 +65,7 @@ public StringList(String... tokens) {
6365
this.tokens = new String[tokens.length];
6466

6567
for (int i = 0; i < tokens.length; i++) {
66-
this.tokens[i] = tokens[i].intern();
68+
this.tokens[i] = StringInterners.intern(tokens[i]);
6769
}
6870
}
6971

0 commit comments

Comments
 (0)