From ed2682ccdb9553970acf683fe0f724d5c57e7c9d Mon Sep 17 00:00:00 2001 From: Martin Wiesner Date: Mon, 23 Dec 2024 07:15:23 +0100 Subject: [PATCH] OPENNLP-1447: Reenable Cmdline Tool execution tests (#720) - removes @Disabled from multiple cmdline execution tests - adjusts TokenizerTrainerTool to handle existing yet "empty" abb-dict files better --- .../tokenizer/TokenizerTrainerTool.java | 12 +- .../cmdline/TokenNameFinderToolTest.java | 105 +++++++++-------- .../tokenizer/TokenizerTrainerToolTest.java | 107 ++++++++++-------- .../src/test/resources/logback-test.xml | 6 +- 4 files changed, 131 insertions(+), 99 deletions(-) diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java index f51b8c67e..4f5389ab2 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java @@ -21,6 +21,7 @@ import java.io.File; import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStream; import opennlp.tools.cmdline.AbstractTrainerTool; import opennlp.tools.cmdline.CmdLineUtil; @@ -33,6 +34,7 @@ import opennlp.tools.tokenize.TokenSample; import opennlp.tools.tokenize.TokenizerFactory; import opennlp.tools.tokenize.TokenizerModel; +import opennlp.tools.util.InvalidFormatException; import opennlp.tools.util.TrainingParameters; import opennlp.tools.util.model.ModelUtil; @@ -53,9 +55,15 @@ public String getShortDescription() { static Dictionary loadDict(File f) throws IOException { Dictionary dict = null; - if (f != null) { + if (f != null && f.exists()) { CmdLineUtil.checkInputFile("abb dict", f); - dict = new Dictionary(new BufferedInputStream(new FileInputStream(f))); + try (InputStream in = new BufferedInputStream(new FileInputStream(f))) { + if (in.available() == 0) { + throw new 
InvalidFormatException("Encountered an empty dictionary file?!"); + } else { + dict = new Dictionary(in); + } + } } return dict; } diff --git a/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java b/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java index e8594c59e..190fa9d97 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java @@ -19,19 +19,22 @@ import java.io.BufferedOutputStream; import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; -import java.io.PrintStream; import java.nio.charset.StandardCharsets; import java.nio.file.Files; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Disabled; +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.Logger; +import ch.qos.logback.classic.LoggerContext; +import nl.altindag.log.LogCaptor; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; +import org.slf4j.LoggerFactory; import opennlp.tools.cmdline.namefind.TokenNameFinderTool; import opennlp.tools.namefind.NameFinderME; @@ -44,75 +47,80 @@ import opennlp.tools.util.PlainTextByLineStream; import opennlp.tools.util.TrainingParameters; -public class TokenNameFinderToolTest { - - @Test - //TODO OPENNLP-1447 - @Disabled(value = "OPENNLP-1447: These kind of tests won't work anymore. " + - "We need to find a way to redirect log output (i.e. 
implement " + - "a custom log adapter and plug it in, if we want to do such tests.") - void run() throws IOException { +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; - File model1 = trainModel(); +public class TokenNameFinderToolTest { - String[] args = new String[] {model1.getAbsolutePath()}; + /* + * Programmatically sets the log level to INFO to ensure that the log messages + * emitted by the tool can be captured and verified by the tests + */ + @BeforeAll + public static void prepare() { + LoggerContext context = (LoggerContext) LoggerFactory.getILoggerFactory(); + Logger logger = context.getLogger("opennlp.tools.cmdline.namefind"); + logger.setLevel(Level.INFO); + } - final String in = "It is Stefanie Schmidt.\n\nNothing in this sentence."; - InputStream stream = new ByteArrayInputStream(in.getBytes(StandardCharsets.UTF_8)); + /* + * Programmatically restores the default log level (= OFF) after the tests + */ + @AfterAll + public static void cleanup() { + LoggerContext context = (LoggerContext) LoggerFactory.getILoggerFactory(); + Logger logger = context.getLogger("opennlp.tools.cmdline.namefind"); + logger.setLevel(Level.OFF); + } - System.setIn(stream); + @Test + void run() throws IOException { + try (LogCaptor logCaptor = LogCaptor.forClass(TokenNameFinderTool.class)) { + File model1 = trainModel(); + String[] args = new String[] {model1.getAbsolutePath()}; - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - PrintStream ps = new PrintStream(baos); - System.setOut(ps); + final String in = "It is Stefanie Schmidt.\n"; + InputStream stream = new ByteArrayInputStream(in.getBytes(StandardCharsets.UTF_8)); - TokenNameFinderTool tool = new TokenNameFinderTool(); - tool.run(args); + System.setIn(stream); - final String content = baos.toString(StandardCharsets.UTF_8); - Assertions.assertTrue(content.contains("It is Stefanie Schmidt. 
")); + TokenNameFinderTool tool = new TokenNameFinderTool(); + tool.run(args); - Assertions.assertTrue(model1.delete()); + assertEquals(1, logCaptor.getInfoLogs().size()); + final String content = logCaptor.getInfoLogs().get(0); + logCaptor.clearLogs(); + assertEquals("It is Stefanie Schmidt. ", content); + assertTrue(model1.delete()); + } } @Test void invalidModel() { - - Assertions.assertThrows(TerminateToolException.class, () -> { - + assertThrows(TerminateToolException.class, () -> { String[] args = new String[] {"invalidmodel.bin"}; - TokenNameFinderTool tool = new TokenNameFinderTool(); tool.run(args); }); - - } @Test - //TODO OPENNLP-1447 - @Disabled(value = "OPENNLP-1447: These kind of tests won't work anymore. " + - "We need to find a way to redirect log output (i.e. implement " + - "a custom log adapter and plug it in, if we want to do such tests.") void usage() { + try (LogCaptor logCaptor = LogCaptor.forClass(TokenNameFinderTool.class)) { + String[] args = new String[] {}; - String[] args = new String[] {}; - - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - PrintStream ps = new PrintStream(baos); - System.setOut(ps); - - TokenNameFinderTool tool = new TokenNameFinderTool(); - tool.run(args); - - final String content = baos.toString(StandardCharsets.UTF_8); - Assertions.assertEquals(tool.getHelp(), content.trim()); + TokenNameFinderTool tool = new TokenNameFinderTool(); + tool.run(args); + assertEquals(1, logCaptor.getInfoLogs().size()); + final String content = logCaptor.getInfoLogs().get(0); + assertEquals(tool.getHelp(), content.trim()); + } } private File trainModel() throws IOException { - ObjectStream lineStream = new PlainTextByLineStream(new MockInputStreamFactory( new File("opennlp/tools/namefind/AnnotatedSentencesWithTypes.txt")), @@ -123,7 +131,6 @@ private File trainModel() throws IOException { params.put(TrainingParameters.CUTOFF_PARAM, 1); TokenNameFinderModel model; - TokenNameFinderFactory nameFinderFactory = new 
TokenNameFinderFactory(); try (ObjectStream sampleStream = new NameSampleDataStream(lineStream)) { @@ -132,12 +139,10 @@ private File trainModel() throws IOException { } File modelFile = Files.createTempFile("model", ".bin").toFile(); - try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelFile))) { model.serialize(modelOut); } - return modelFile; } diff --git a/opennlp-tools/src/test/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerToolTest.java b/opennlp-tools/src/test/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerToolTest.java index 65993fb09..6059a2b1b 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerToolTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerToolTest.java @@ -18,33 +18,40 @@ package opennlp.tools.cmdline.tokenizer; import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; -import java.io.PrintStream; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardOpenOption; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Disabled; +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.Logger; +import ch.qos.logback.classic.LoggerContext; +import nl.altindag.log.LogCaptor; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; +import org.slf4j.LoggerFactory; import opennlp.tools.AbstractTempDirTest; +import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.StreamFactoryRegistry; import opennlp.tools.cmdline.TerminateToolException; import opennlp.tools.dictionary.Dictionary; import opennlp.tools.util.InvalidFormatException; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static 
org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + /** * Tests for the {@link TokenizerTrainerTool} class. */ public class TokenizerTrainerToolTest extends AbstractTempDirTest { - private TokenizerTrainerTool tokenizerTrainerTool; - private final String sampleSuccessData = "Pierre Vinken, 61 years old, will join the board as a nonexecutive " + "director Nov. 29.\n" + @@ -54,10 +61,31 @@ public class TokenizerTrainerToolTest extends AbstractTempDirTest { private final String sampleFailureData = "It is Fail Test Case.\n\nNothing in this sentence."; + /* + * Programmatically sets the log level to INFO to ensure that the log messages + * emitted via CmdLineUtil can be captured and verified by the tests + */ + @BeforeAll + public static void prepare() { + LoggerContext context = (LoggerContext) LoggerFactory.getILoggerFactory(); + Logger logger = context.getLogger("opennlp.tools.cmdline.CmdLineUtil"); + logger.setLevel(Level.INFO); + } + + /* + * Programmatically restores the default log level (= OFF) after the tests + */ + @AfterAll + public static void cleanup() { + LoggerContext context = (LoggerContext) LoggerFactory.getILoggerFactory(); + Logger logger = context.getLogger("opennlp.tools.cmdline.CmdLineUtil"); + logger.setLevel(Level.OFF); + } + @Test public void testGetShortDescription() { - tokenizerTrainerTool = new TokenizerTrainerTool(); - Assertions.assertEquals("Trainer for the learnable tokenizer", + TokenizerTrainerTool tokenizerTrainerTool = new TokenizerTrainerTool(); + assertEquals("Trainer for the learnable tokenizer", tokenizerTrainerTool.getShortDescription()); } @@ -65,44 +93,38 @@ public void testGetShortDescription() { public void testLoadDictHappyCase() throws IOException { File dictFile = new File("lang/ga/abb_GA.xml"); Dictionary dict = TokenizerTrainerTool.loadDict(dictFile); - Assertions.assertNotNull(dict); + assertNotNull(dict); } @Test public void testLoadDictFailCase() { - 
Assertions.assertThrows(InvalidFormatException.class , () -> + assertThrows(InvalidFormatException.class , () -> TokenizerTrainerTool.loadDict(prepareDataFile(""))); } - //TODO OPENNLP-1447 - @Disabled(value = "OPENNLP-1447: These kind of tests won't work anymore. " + - "We need to find a way to redirect log output (i.e. implement " + - "a custom log adapter and plug it in, if we want to do such tests.") + @Test public void testTestRunHappyCase() throws IOException { - File model = tempDir.resolve("model-en.bin").toFile(); - - String[] args = - new String[] { "-model" , model.getAbsolutePath() , "-alphaNumOpt" , "false" , "-lang" , "en" , - "-data" , String.valueOf(prepareDataFile(sampleSuccessData)) , "-encoding" , "UTF-8" }; - - InputStream stream = new ByteArrayInputStream(sampleSuccessData.getBytes(StandardCharsets.UTF_8)); - System.setIn(stream); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - PrintStream ps = new PrintStream(baos); - System.setOut(ps); - - tokenizerTrainerTool = new TokenizerTrainerTool(); - tokenizerTrainerTool.run(StreamFactoryRegistry.DEFAULT_FORMAT , args); - - final String content = baos.toString(StandardCharsets.UTF_8); - Assertions.assertTrue(content.contains("Number of Event Tokens: 171")); - Assertions.assertTrue(model.delete()); + try (LogCaptor logCaptor = LogCaptor.forClass(CmdLineUtil.class)) { + File model = tempDir.resolve("model-en.bin").toFile(); + + String[] args = + new String[] { "-model" , model.getAbsolutePath() , "-alphaNumOpt" , "false" , "-lang" , "en" , + "-data" , String.valueOf(prepareDataFile(sampleSuccessData)) , "-encoding" , "UTF-8" }; + + InputStream stream = new ByteArrayInputStream(sampleSuccessData.getBytes(StandardCharsets.UTF_8)); + System.setIn(stream); + + TokenizerTrainerTool trainerTool = new TokenizerTrainerTool(); + trainerTool.run(StreamFactoryRegistry.DEFAULT_FORMAT , args); + + assertEquals(3, logCaptor.getInfoLogs().size()); + final String content = logCaptor.getInfoLogs().get(2); 
+ assertTrue(content.startsWith("Wrote tokenizer model to path:")); + assertTrue(model.delete()); + } } - //TODO OPENNLP-1447 - @Disabled(value = "OPENNLP-1447: These kind of tests won't work anymore. " + - "We need to find a way to redirect log output (i.e. implement " + - "a custom log adapter and plug it in, if we want to do such tests.") + @Test public void testTestRunExceptionCase() throws IOException { File model = tempDir.resolve("model-en.bin").toFile(); model.deleteOnExit(); @@ -111,17 +133,10 @@ public void testTestRunExceptionCase() throws IOException { new String[] { "-model" , model.getAbsolutePath() , "-alphaNumOpt" , "false" , "-lang" , "en" , "-data" , String.valueOf(prepareDataFile(sampleFailureData)) , "-encoding" , "UTF-8" }; - InputStream stream = new ByteArrayInputStream(sampleFailureData.getBytes(StandardCharsets.UTF_8)); - System.setIn(stream); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - PrintStream ps = new PrintStream(baos); - System.setOut(ps); - - Assertions.assertThrows(TerminateToolException.class , () -> { - tokenizerTrainerTool = new TokenizerTrainerTool(); - tokenizerTrainerTool.run(StreamFactoryRegistry.DEFAULT_FORMAT , args); + assertThrows(TerminateToolException.class , () -> { + TokenizerTrainerTool trainerTool = new TokenizerTrainerTool(); + trainerTool.run(StreamFactoryRegistry.DEFAULT_FORMAT , args); }); - } // This is guaranteed to be deleted after the test finishes. diff --git a/opennlp-tools/src/test/resources/logback-test.xml b/opennlp-tools/src/test/resources/logback-test.xml index b3cbcf203..1baae2912 100644 --- a/opennlp-tools/src/test/resources/logback-test.xml +++ b/opennlp-tools/src/test/resources/logback-test.xml @@ -23,12 +23,16 @@ - %date{HH:mm:ss.SSS} [%thread] %-5level %class{36}.%method:%line - %msg%n + %date{HH:mm:ss.SSS} [%thread] %-4level %class{36}.%method:%line - %msg%n + + + +