apache · mawiesne · Dec 23, 2024 · Dec 21, 2024
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java
@@ -21,6 +21,7 @@
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
+import java.io.InputStream;
 
 import opennlp.tools.cmdline.AbstractTrainerTool;
 import opennlp.tools.cmdline.CmdLineUtil;
@@ -33,6 +34,7 @@
 import opennlp.tools.tokenize.TokenSample;
 import opennlp.tools.tokenize.TokenizerFactory;
 import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.util.InvalidFormatException;
 import opennlp.tools.util.TrainingParameters;
 import opennlp.tools.util.model.ModelUtil;
 
@@ -53,9 +55,30 @@ public String getShortDescription() {
 
+  /**
+   * Loads a {@link Dictionary} from the given file.
+   *
+   * @param f The dictionary file. May be {@code null}.
+   * @return The loaded {@link Dictionary}, or {@code null} if {@code f} is
+   *         {@code null} or does not exist.
+   * @throws InvalidFormatException Thrown if the file has no content.
+   * @throws IOException Thrown if IO errors occurred while reading the file.
+   */
   static Dictionary loadDict(File f) throws IOException {
     Dictionary dict = null;
-    if (f != null) {
+    // NOTE(review): a missing file silently yields null here and never reaches
+    // checkInputFile - confirm that this is intended.
+    if (f != null && f.exists()) {
       CmdLineUtil.checkInputFile("abb dict", f);
-      dict = new Dictionary(new BufferedInputStream(new FileInputStream(f)));
+      try (InputStream in = new BufferedInputStream(new FileInputStream(f))) {
+        // Probe one byte instead of asking available(): that method only returns
+        // an estimate and may report 0 although the stream still holds data.
+        in.mark(1);
+        if (in.read() == -1) {
+          throw new InvalidFormatException("Encountered an empty dictionary file?!");
+        }
+        // Rewind, so that the dictionary is parsed from the very first byte.
+        in.reset();
+        dict = new Dictionary(in);
+      }
     }
     return dict;
   }

diff --git a/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java b/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java
@@ -19,19 +19,22 @@
 
 import java.io.BufferedOutputStream;
 import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
-import java.io.PrintStream;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.Disabled;
+import ch.qos.logback.classic.Level;
+import ch.qos.logback.classic.Logger;
+import ch.qos.logback.classic.LoggerContext;
+import nl.altindag.log.LogCaptor;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
+import org.slf4j.LoggerFactory;
 
 import opennlp.tools.cmdline.namefind.TokenNameFinderTool;
 import opennlp.tools.namefind.NameFinderME;
@@ -44,75 +47,80 @@
 import opennlp.tools.util.PlainTextByLineStream;
 import opennlp.tools.util.TrainingParameters;
 
-public class TokenNameFinderToolTest {
-
-  @Test
-  //TODO OPENNLP-1447
-  @Disabled(value = "OPENNLP-1447: These kind of tests won't work anymore. " +
-          "We need to find a way to redirect log output (i.e. implement " +
-          "a custom log adapter and plug it in, if we want to do such tests.")
-  void run() throws IOException {
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
 
-    File model1 = trainModel();
+public class TokenNameFinderToolTest {
 
-    String[] args = new String[] {model1.getAbsolutePath()};
+  /*
+   * Programmatically raises the log level to INFO, so that the messages logged
+   * by the name finder tool can be captured and verified by the tests.
+   */
+  @BeforeAll
+  public static void prepare() {
+    final LoggerContext loggerContext = (LoggerContext) LoggerFactory.getILoggerFactory();
+    final Logger toolLogger = loggerContext.getLogger("opennlp.tools.cmdline.namefind");
+    toolLogger.setLevel(Level.INFO);
+  }
 
-    final String in = "It is Stefanie Schmidt.\n\nNothing in this sentence.";
-    InputStream stream = new ByteArrayInputStream(in.getBytes(StandardCharsets.UTF_8));
+  /*
+   * Programmatically restores the default log level (= OFF) once all tests have run.
+   */
+  @AfterAll
+  public static void cleanup() {
+    final LoggerContext loggerContext = (LoggerContext) LoggerFactory.getILoggerFactory();
+    final Logger toolLogger = loggerContext.getLogger("opennlp.tools.cmdline.namefind");
+    toolLogger.setLevel(Level.OFF);
+  }
 
-    System.setIn(stream);
+  @Test
+  void run() throws IOException {
+    final InputStream systemIn = System.in; // kept to undo the stdin redirection below
+    try (LogCaptor logCaptor = LogCaptor.forClass(TokenNameFinderTool.class)) {
+      File model1 = trainModel();
 
-    ByteArrayOutputStream baos = new ByteArrayOutputStream();
-    PrintStream ps = new PrintStream(baos);
-    System.setOut(ps);
+      final String in = "It is Stefanie Schmidt.\n";
+      System.setIn(new ByteArrayInputStream(in.getBytes(StandardCharsets.UTF_8)));
 
-    TokenNameFinderTool tool = new TokenNameFinderTool();
-    tool.run(args);
+      TokenNameFinderTool tool = new TokenNameFinderTool();
+      tool.run(new String[] {model1.getAbsolutePath()});
 
-    final String content = baos.toString(StandardCharsets.UTF_8);
-    Assertions.assertTrue(content.contains("It is <START:person> Stefanie Schmidt. <END>"));
+      // the tool is expected to log exactly one INFO message: the tagged sentence
+      assertEquals(1, logCaptor.getInfoLogs().size());
+      assertEquals("It is <START:person> Stefanie Schmidt. <END>", logCaptor.getInfoLogs().get(0));
 
-    Assertions.assertTrue(model1.delete());
+      assertTrue(model1.delete());
+    } finally {
+      System.setIn(systemIn); // do not leak the replaced stdin into other tests
+    }
   }
 
   @Test
   void invalidModel() {
-
-    Assertions.assertThrows(TerminateToolException.class, () -> {
-
+    assertThrows(TerminateToolException.class, () -> {
       String[] args = new String[] {"invalidmodel.bin"};
-
       TokenNameFinderTool tool = new TokenNameFinderTool();
       tool.run(args);
 
     });
-
-
   }
 
   @Test
-  //TODO OPENNLP-1447
-  @Disabled(value = "OPENNLP-1447: These kind of tests won't work anymore. " +
-          "We need to find a way to redirect log output (i.e. implement " +
-          "a custom log adapter and plug it in, if we want to do such tests.")
   void usage() {
+    try (LogCaptor captor = LogCaptor.forClass(TokenNameFinderTool.class)) {
+      final TokenNameFinderTool finderTool = new TokenNameFinderTool();
 
-    String[] args = new String[] {};
-
-    ByteArrayOutputStream baos = new ByteArrayOutputStream();
-    PrintStream ps = new PrintStream(baos);
-    System.setOut(ps);
-
-    TokenNameFinderTool tool = new TokenNameFinderTool();
-    tool.run(args);
-
-    final String content = baos.toString(StandardCharsets.UTF_8);
-    Assertions.assertEquals(tool.getHelp(), content.trim());
+      // Invoked without any arguments, the tool should log nothing but its help text.
+      finderTool.run(new String[] {});
 
+      assertEquals(1, captor.getInfoLogs().size());
+      final String loggedHelp = captor.getInfoLogs().get(0).trim();
+      assertEquals(finderTool.getHelp(), loggedHelp);
+    }
   }
 
   private File trainModel() throws IOException {
-
     ObjectStream<String> lineStream =
         new PlainTextByLineStream(new MockInputStreamFactory(
             new File("opennlp/tools/namefind/AnnotatedSentencesWithTypes.txt")),
@@ -123,7 +131,6 @@ private File trainModel() throws IOException {
     params.put(TrainingParameters.CUTOFF_PARAM, 1);
 
     TokenNameFinderModel model;
-
     TokenNameFinderFactory nameFinderFactory = new TokenNameFinderFactory();
 
     try (ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream)) {
@@ -132,12 +139,10 @@ private File trainModel() throws IOException {
     }
 
     File modelFile = Files.createTempFile("model", ".bin").toFile();
-
     try (OutputStream modelOut =
              new BufferedOutputStream(new FileOutputStream(modelFile))) {
       model.serialize(modelOut);
     }
-
     return modelFile;
   }
 

diff --git a/opennlp-tools/src/test/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerToolTest.java b/opennlp-tools/src/test/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerToolTest.java
@@ -18,33 +18,40 @@
 package opennlp.tools.cmdline.tokenizer;
 
 import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.PrintStream;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
 
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.Disabled;
+import ch.qos.logback.classic.Level;
+import ch.qos.logback.classic.Logger;
+import ch.qos.logback.classic.LoggerContext;
+import nl.altindag.log.LogCaptor;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
+import org.slf4j.LoggerFactory;
 
 import opennlp.tools.AbstractTempDirTest;
+import opennlp.tools.cmdline.CmdLineUtil;
 import opennlp.tools.cmdline.StreamFactoryRegistry;
 import opennlp.tools.cmdline.TerminateToolException;
 import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.util.InvalidFormatException;
 
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
 /**
  * Tests for the {@link TokenizerTrainerTool} class.
  */
 public class TokenizerTrainerToolTest extends AbstractTempDirTest {
 
-  private TokenizerTrainerTool tokenizerTrainerTool;
-
   private final String sampleSuccessData =
       "Pierre Vinken<SPLIT>, 61 years old<SPLIT>, will join the board as a nonexecutive " +
           "director Nov. 29<SPLIT>.\n" +
@@ -54,55 +61,70 @@ public class TokenizerTrainerToolTest extends AbstractTempDirTest {
 
   private final String sampleFailureData = "It is Fail Test Case.\n\nNothing in this sentence.";
 
+  /*
+   * Programmatically raises the log level of CmdLineUtil to INFO, so that the
+   * messages it logs can be captured and verified by the tests.
+   */
+  @BeforeAll
+  public static void prepare() {
+    final LoggerContext loggerContext = (LoggerContext) LoggerFactory.getILoggerFactory();
+    final Logger cmdLineLogger = loggerContext.getLogger("opennlp.tools.cmdline.CmdLineUtil");
+    cmdLineLogger.setLevel(Level.INFO);
+  }
+
+  /*
+   * Programmatically restores the default log level (= OFF) once all tests have run.
+   */
+  @AfterAll
+  public static void cleanup() {
+    final LoggerContext loggerContext = (LoggerContext) LoggerFactory.getILoggerFactory();
+    final Logger cmdLineLogger = loggerContext.getLogger("opennlp.tools.cmdline.CmdLineUtil");
+    cmdLineLogger.setLevel(Level.OFF);
+  }
+
   @Test
   public void testGetShortDescription() {
-    tokenizerTrainerTool = new TokenizerTrainerTool();
-    Assertions.assertEquals("Trainer for the learnable tokenizer",
+    TokenizerTrainerTool tokenizerTrainerTool = new TokenizerTrainerTool();
+    assertEquals("Trainer for the learnable tokenizer",
             tokenizerTrainerTool.getShortDescription());
   }
 
   @Test
   public void testLoadDictHappyCase() throws IOException {
     File dictFile = new File("lang/ga/abb_GA.xml");
     Dictionary dict = TokenizerTrainerTool.loadDict(dictFile);
-    Assertions.assertNotNull(dict);
+    assertNotNull(dict);
   }
 
   @Test
   public void testLoadDictFailCase() {
-    Assertions.assertThrows(InvalidFormatException.class , () ->
+    assertThrows(InvalidFormatException.class , () ->
             TokenizerTrainerTool.loadDict(prepareDataFile("")));
   }
 
-  //TODO OPENNLP-1447
-  @Disabled(value = "OPENNLP-1447: These kind of tests won't work anymore. " +
-          "We need to find a way to redirect log output (i.e. implement " +
-          "a custom log adapter and plug it in, if we want to do such tests.")
+  @Test
   public void testTestRunHappyCase() throws IOException {
-    File model = tempDir.resolve("model-en.bin").toFile();
-
-    String[] args =
-        new String[] { "-model" , model.getAbsolutePath() , "-alphaNumOpt" , "false" , "-lang" , "en" ,
-            "-data" , String.valueOf(prepareDataFile(sampleSuccessData)) , "-encoding" , "UTF-8" };
-
-    InputStream stream = new ByteArrayInputStream(sampleSuccessData.getBytes(StandardCharsets.UTF_8));
-    System.setIn(stream);
-    ByteArrayOutputStream baos = new ByteArrayOutputStream();
-    PrintStream ps = new PrintStream(baos);
-    System.setOut(ps);
-
-    tokenizerTrainerTool = new TokenizerTrainerTool();
-    tokenizerTrainerTool.run(StreamFactoryRegistry.DEFAULT_FORMAT , args);
-
-    final String content = baos.toString(StandardCharsets.UTF_8);
-    Assertions.assertTrue(content.contains("Number of Event Tokens: 171"));
-    Assertions.assertTrue(model.delete());
+    final InputStream systemIn = System.in; // kept to undo the stdin redirection below
+    try (LogCaptor logCaptor = LogCaptor.forClass(CmdLineUtil.class)) {
+      File model = tempDir.resolve("model-en.bin").toFile();
+      String[] args =
+          new String[] { "-model" , model.getAbsolutePath() , "-alphaNumOpt" , "false" , "-lang" , "en" ,
+              "-data" , String.valueOf(prepareDataFile(sampleSuccessData)) , "-encoding" , "UTF-8" };
+
+      System.setIn(new ByteArrayInputStream(sampleSuccessData.getBytes(StandardCharsets.UTF_8)));
+      TokenizerTrainerTool trainerTool = new TokenizerTrainerTool();
+      trainerTool.run(StreamFactoryRegistry.DEFAULT_FORMAT , args);
+
+      // the third and last INFO message is expected to report the written model
+      assertEquals(3, logCaptor.getInfoLogs().size());
+      assertTrue(logCaptor.getInfoLogs().get(2).startsWith("Wrote tokenizer model to path:"));
+      assertTrue(model.delete());
+    } finally {
+      System.setIn(systemIn); // do not leak the replaced stdin into other tests
+    }
   }
 
-  //TODO OPENNLP-1447
-  @Disabled(value = "OPENNLP-1447: These kind of tests won't work anymore. " +
-          "We need to find a way to redirect log output (i.e. implement " +
-          "a custom log adapter and plug it in, if we want to do such tests.")
+  @Test
   public void testTestRunExceptionCase() throws IOException {
     File model = tempDir.resolve("model-en.bin").toFile();
     model.deleteOnExit();
@@ -111,17 +133,10 @@ public void testTestRunExceptionCase() throws IOException {
         new String[] { "-model" , model.getAbsolutePath() , "-alphaNumOpt" , "false" , "-lang" , "en" ,
             "-data" , String.valueOf(prepareDataFile(sampleFailureData)) , "-encoding" , "UTF-8" };
 
-    InputStream stream = new ByteArrayInputStream(sampleFailureData.getBytes(StandardCharsets.UTF_8));
-    System.setIn(stream);
-    ByteArrayOutputStream baos = new ByteArrayOutputStream();
-    PrintStream ps = new PrintStream(baos);
-    System.setOut(ps);
-
-    Assertions.assertThrows(TerminateToolException.class , () -> {
-      tokenizerTrainerTool = new TokenizerTrainerTool();
-      tokenizerTrainerTool.run(StreamFactoryRegistry.DEFAULT_FORMAT , args);
+    assertThrows(TerminateToolException.class , () -> {
+      TokenizerTrainerTool trainerTool = new TokenizerTrainerTool();
+      trainerTool.run(StreamFactoryRegistry.DEFAULT_FORMAT , args);
     });
-
   }
 
   // This is guaranteed to be deleted after the test finishes.

diff --git a/opennlp-tools/src/test/resources/logback-test.xml b/opennlp-tools/src/test/resources/logback-test.xml
@@ -23,12 +23,18 @@
 
     <appender name="consoleAppender" class="ch.qos.logback.core.ConsoleAppender">
         <encoder>
             <pattern>%date{HH:mm:ss.SSS} [%thread] %-5level %class{36}.%method:%line - %msg%n</pattern>
         </encoder>
     </appender>
 
     <logger name="opennlp" level="off"/>
 
+    <!-- Raised to INFO by TokenNameFinderToolTest while it runs; reset to OFF afterwards. -->
+    <logger name="opennlp.tools.cmdline.namefind" level="off"/>
+
+    <!-- Raised to INFO by TokenizerTrainerToolTest while it runs; reset to OFF afterwards. -->
+    <logger name="opennlp.tools.cmdline.CmdLineUtil" level="off"/>
+
     <root level="off">
         <appender-ref ref="consoleAppender" />
     </root>