OPENNLP-1541 Conduct cleanup in opennlp.tools.chunker package
mawiesne committed Jan 1, 2024
1 parent 08468cc commit e329113
Showing 7 changed files with 37 additions and 57 deletions.
@@ -19,7 +19,6 @@

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Objects;

@@ -65,9 +64,9 @@ public ChunkSample(List<String> sentence, List<String> tags, List<String> preds)

validateArguments(sentence.size(), tags.size(), preds.size());

this.sentence = Collections.unmodifiableList(new ArrayList<>(sentence));
this.tags = Collections.unmodifiableList(new ArrayList<>(tags));
this.preds = Collections.unmodifiableList(new ArrayList<>(preds));
this.sentence = List.copyOf(sentence);
this.tags = List.copyOf(tags);
this.preds = List.copyOf(preds);
}

/**
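The hunk above is the ChunkSample constructor switching from Collections.unmodifiableList over a defensive ArrayList copy to List.copyOf. A minimal sketch of the behavioral difference, assuming a plain Java 10+ runtime (the variable names and sample tokens are illustrative, not taken from the diff):

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class CopyOfSketch {

  public static void main(String[] args) {
    List<String> tokens = new ArrayList<>(List.of("He", "reckons", "the", "deficit"));

    // Pre-change idiom: copy manually, then wrap the copy in an unmodifiable view.
    List<String> legacy = Collections.unmodifiableList(new ArrayList<>(tokens));

    // Post-change idiom: one call that copies and returns an unmodifiable list.
    // It additionally rejects null elements and may reuse the source when the
    // source is already an unmodifiable list created by List.of/List.copyOf.
    List<String> modern = List.copyOf(tokens);

    tokens.set(0, "She");            // neither snapshot observes this mutation
    System.out.println(legacy);      // [He, reckons, the, deficit]
    System.out.println(modern);      // [He, reckons, the, deficit]
    // modern.add("!");              // would throw UnsupportedOperationException
  }
}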
17 changes: 11 additions & 6 deletions opennlp-tools/src/main/java/opennlp/tools/chunker/Chunker.java
@@ -36,31 +36,36 @@ public interface Chunker {
String[] chunk(String[] toks, String[] tags);

/**
* Generates tagged chunk spans for the given sequence returning the result in a span array.
* Generates tagged chunk {@link Span spans} for the given sequence returning
* the result in a {@link Span span} array.
*
* @param toks an array of the tokens or words of the sequence.
* @param tags an array of the pos tags of the sequence.
*
* @return an array of spans with chunk tags for each chunk in the sequence.
* @return an array of {@link Span spans} with chunk tags for each chunk in the sequence.
*/
Span[] chunkAsSpans(String[] toks, String[] tags);

/**
* Returns the top k chunk sequences for the specified sentence with the specified pos-tags
* Computes the top k chunk {@link Sequence sequences} for the specified sentence with
* the specified pos-tags.
*
* @param sentence The tokens of the sentence.
* @param tags The pos-tags for the specified sentence.
*
* @return the top k chunk sequences for the specified sentence.
* @return the top k chunk {@link Sequence sequences} for the specified sentence.
*/
Sequence[] topKSequences(String[] sentence, String[] tags);

/**
* Returns the top k chunk sequences for the specified sentence with the specified pos-tags
* Computes the top k chunk {@link Sequence sequences} for the specified sentence with
* the specified pos-tags.
*
* @param sentence The tokens of the sentence.
* @param tags The pos-tags for the specified sentence.
* @param minSequenceScore A lower bound on the score of a returned sequence.
*
* @return the top k chunk sequences for the specified sentence.
* @return the top k chunk {@link Sequence sequences} for the specified sentence.
*/
Sequence[] topKSequences(String[] sentence, String[] tags, double minSequenceScore);
}
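To ground the reworked Javadoc, here is a hedged usage sketch of this interface through its ChunkerME implementation (touched in the next file). The model path is a placeholder and the token/POS arrays are illustrative:

import java.io.FileInputStream;
import java.io.InputStream;

import opennlp.tools.chunker.Chunker;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.util.Sequence;
import opennlp.tools.util.Span;

public class ChunkerUsageSketch {

  public static void main(String[] args) throws Exception {
    // "en-chunker.bin" stands in for any trained chunker model on disk.
    try (InputStream modelIn = new FileInputStream("en-chunker.bin")) {
      Chunker chunker = new ChunkerME(new ChunkerModel(modelIn));

      String[] tokens = {"Rockwell", "said", "the", "agreement", "calls"};
      String[] posTags = {"NNP", "VBD", "DT", "NN", "VBZ"};

      // One chunk tag per token, e.g. B-NP, I-NP, B-VP ...
      String[] chunkTags = chunker.chunk(tokens, posTags);

      // The same analysis grouped into tagged spans over the token indices
      // (Span ends are exclusive).
      for (Span span : chunker.chunkAsSpans(tokens, posTags)) {
        System.out.println(span.getType() + " covers tokens "
            + span.getStart() + ".." + (span.getEnd() - 1));
      }

      // Alternative decodings, best first, as described for topKSequences.
      Sequence[] top = chunker.topKSequences(tokens, posTags);
      System.out.println(chunkTags.length + " tags, " + top.length + " candidate sequences");
    }
  }
}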
46 changes: 15 additions & 31 deletions opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
@@ -51,37 +51,13 @@ public class ChunkerME implements Chunker {
/**
* The model used to assign chunk tags to a sequence of tokens.
*/
protected SequenceClassificationModel<TokenTag> model;
private final SequenceClassificationModel<TokenTag> model;

private final ChunkerContextGenerator contextGenerator;
private final SequenceValidator<TokenTag> sequenceValidator;

/**
* Initializes the current instance with the specified model and
* the specified beam size.
*
* @param model The model for this {@link Chunker}.
* @param beamSize The size of the beam that should be used when decoding sequences.
*
* @deprecated {@code beamSize} is now stored inside the model
*/
@Deprecated
private ChunkerME(ChunkerModel model, int beamSize) {

contextGenerator = model.getFactory().getContextGenerator();
sequenceValidator = model.getFactory().getSequenceValidator();

if (model.getChunkerSequenceModel() != null) {
this.model = model.getChunkerSequenceModel();
}
else {
this.model = new opennlp.tools.ml.BeamSearch<>(beamSize,
model.getChunkerModel(), 0);
}
}

/**
* Initializes the {@link Chunker} by downloading a default model.
* Initializes a {@link Chunker} by downloading a default model.
*
* @param language The language of the model.
* @throws IOException Thrown if the model cannot be downloaded or saved.
@@ -91,13 +67,21 @@ public ChunkerME(String language) throws IOException {
}

/**
* Initializes the current instance with the specified {@link ChunkerModel}.
* Initializes a {@link Chunker} with the specified {@link ChunkerModel}.
* The {@link #DEFAULT_BEAM_SIZE} is used.
*
* @param model A valid {@link ChunkerModel model} instance.
*/
public ChunkerME(ChunkerModel model) {
this(model, DEFAULT_BEAM_SIZE);
contextGenerator = model.getFactory().getContextGenerator();
sequenceValidator = model.getFactory().getSequenceValidator();

if (model.getChunkerSequenceModel() != null) {
this.model = model.getChunkerSequenceModel();
}
else {
this.model = new BeamSearch<>(DEFAULT_BEAM_SIZE, model.getChunkerModel(), 0);
}
}

@Override
@@ -143,7 +127,7 @@ public void probs(double[] probs) {

/**
* Returns an array with the probabilities of the last decoded sequence. The
* sequence was determined based on the previous call to {@code chunk}.
* sequence was determined based on the previous call to {@link #chunk(String[], String[])}.
*
* @return An array with the same number of probabilities as tokens when
* {@link ChunkerME#chunk(String[], String[])} was last called.
@@ -162,6 +146,7 @@ public double[] probs() {
*
* @return A valid, trained {@link ChunkerModel} instance.
* @throws IOException Thrown if IO errors occurred.
* @throws IllegalArgumentException Thrown if the specified {@link TrainerType} is not supported.
*/
public static ChunkerModel train(String lang, ObjectStream<ChunkSample> in,
TrainingParameters mlParams, ChunkerFactory factory) throws IOException {
@@ -176,8 +161,7 @@ public static ChunkerModel train(String lang, ObjectStream<ChunkSample> in,

if (TrainerType.EVENT_MODEL_TRAINER.equals(trainerType)) {
ObjectStream<Event> es = new ChunkerEventStream(in, factory.getContextGenerator());
EventTrainer trainer = TrainerFactory.getEventTrainer(mlParams,
manifestInfoEntries);
EventTrainer trainer = TrainerFactory.getEventTrainer(mlParams, manifestInfoEntries);
chunkerModel = trainer.train(es);
}
else if (TrainerType.SEQUENCE_TRAINER.equals(trainerType)) {
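The train(...) entry point touched at the end of this file keeps its signature; only the Javadoc and line wrapping change. For orientation, a sketch of driving it with the default factory and parameters (the file name, data format and language code are assumptions, not part of the commit):

import java.io.File;
import java.nio.charset.StandardCharsets;

import opennlp.tools.chunker.ChunkSample;
import opennlp.tools.chunker.ChunkSampleStream;
import opennlp.tools.chunker.ChunkerFactory;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;

public class ChunkerTrainingSketch {

  public static void main(String[] args) throws Exception {
    // "train.txt" stands in for CoNLL-2000 style chunking data (token POS chunk per line).
    ObjectStream<String> lines = new PlainTextByLineStream(
        new MarkableFileInputStreamFactory(new File("train.txt")), StandardCharsets.UTF_8);

    try (ObjectStream<ChunkSample> samples = new ChunkSampleStream(lines)) {
      // An unsupported trainer type now surfaces as the documented IllegalArgumentException.
      ChunkerModel model = ChunkerME.train("en", samples,
          TrainingParameters.defaultParams(), new ChunkerFactory());
      System.out.println("Trained a chunker model for language: " + model.getLanguage());
    }
  }
}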
@@ -15,7 +15,6 @@
* limitations under the License.
*/


package opennlp.tools.chunker;

import java.io.File;
@@ -159,11 +158,10 @@ protected void validateArtifactMap() throws InvalidFormatException {

// Since 1.8.0 we changed the ChunkerFactory signature. This will check the if the model
// declares a not default factory, and if yes, check if it was created before 1.8
if ( (getManifestProperty(FACTORY_NAME) != null
&& !getManifestProperty(FACTORY_NAME).equals("opennlp.tools.chunker.ChunkerFactory") )
&& this.getVersion().getMajor() <= 1
&& this.getVersion().getMinor() < 8) {
throw new InvalidFormatException("The Chunker factory '" + getManifestProperty(FACTORY_NAME) +
final String factoryName = getManifestProperty(FACTORY_NAME);
if ( (factoryName != null && !factoryName.equals("opennlp.tools.chunker.ChunkerFactory") )
&& this.getVersion().getMajor() <= 1 && this.getVersion().getMinor() < 8) {
throw new InvalidFormatException("The Chunker factory '" + factoryName +
"' is no longer compatible. Please update it to match the latest ChunkerFactory.");
}

@@ -15,7 +15,6 @@
* limitations under the License.
*/


package opennlp.tools.chunker;

import opennlp.tools.util.TokenTag;
@@ -33,11 +32,6 @@ public class DefaultChunkerContextGenerator implements ChunkerContextGenerator {
public DefaultChunkerContextGenerator() {
}

public String[] getContext(int index, String[] tokens, String[] postags,
String[] priorDecisions, Object[] additionalContext) {
return getContext(index, tokens, postags, priorDecisions);
}

@Override
public String[] getContext(int i, String[] toks, String[] tags, String[] preds) {
// Words in a 5-word window
@@ -151,6 +145,6 @@ public String[] getContext(int index, TokenTag[] sequence, String[] priorDecisio
Object[] additionalContext) {
String[] token = TokenTag.extractTokens(sequence);
String[] tags = TokenTag.extractTags(sequence);
return getContext(index, token, tags, priorDecisions, additionalContext);
return getContext(index, token, tags, priorDecisions);
}
}
@@ -73,7 +73,7 @@ public void run(String[] args) {

// TODO: It should not just throw Exception.

EntityLinker entityLinker;
EntityLinker<? extends Span> entityLinker;
try {
entityLinker = EntityLinkerFactory.getLinker(entityType, properties);
}
@@ -124,7 +124,7 @@ public void run(String[] args) {
text.append("\n");
}

List<Span> linkedSpans =
List<? extends Span> linkedSpans =
entityLinker.find(text.toString(), sentences, tokensBySentence, namesBySentence);

for (Span linkedSpan : linkedSpans) {
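The two cmdline lines above only widen the declarations to wildcards so that a linker of an unknown Span subtype can still be consumed. The rule they rely on can be shown without any OpenNLP types; in this self-contained sketch, Shape and Circle are made-up stand-ins for Span and a subtype:

import java.util.List;

public class WildcardSketch {

  static class Shape { }
  static class Circle extends Shape { }

  interface Finder<T extends Shape> {
    List<T> find();
  }

  public static void main(String[] args) {
    // The concrete finder produces Circles ...
    Finder<Circle> circleFinder = () -> List.of(new Circle(), new Circle());

    // ... but callers that only know "some Finder of some Shape subtype"
    // must hold both the finder and its result with wildcards:
    Finder<? extends Shape> finder = circleFinder;
    List<? extends Shape> found = finder.find();

    for (Shape s : found) {          // reading as the upper bound is always safe
      System.out.println(s);
    }
    // found.add(new Shape());       // would not compile: exact element type unknown
  }
}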
@@ -29,7 +29,7 @@ public interface BeamSearchContextGenerator<T> {
* @param sequence The sequence of {@link T items} over which the beam search is performed.
* @param priorDecisions The sequence of decisions made prior to the context for
* which this decision is being made.
* @param additionalContext Any addition context specific to a class implementing this interface.
* @param additionalContext Any additional context specific to a class implementing this interface.
*
* @return The context for the specified {@code index} in the specified {@code sequence}.
*/
