OPENNLP-1660: Switch to pre-trained UD models in Dev Manual (#702)
mawiesne authored Dec 3, 2024
1 parent 3cf8b91 commit 1d72200
Showing 5 changed files with 88 additions and 92 deletions.
4 changes: 2 additions & 2 deletions opennlp-docs/src/docbkx/langdetect.xml
@@ -147,7 +147,7 @@ lav Egija Tri-Active procedūru īpaši iesaka izmantot siltākajos gadalaik
<section id="tools.langdetect.training.tool">
<title>Training Tool</title>
<para>
-The following command will train the language detector and write the model to langdetect.bin:
+The following command will train the language detector and write the model to langdetect-custom.bin:
<screen>
<![CDATA[
$ bin/opennlp LanguageDetectorTrainer[.leipzig] -model modelFile [-params paramsFile] \
@@ -214,7 +214,7 @@ params.put(TrainingParameters.CUTOFF_PARAM, 0);
LanguageDetectorFactory factory = new LanguageDetectorFactory();
LanguageDetectorModel model = LanguageDetectorME.train(sampleStream, params, factory);
model.serialize(new File("langdetect.bin"));]]>
model.serialize(new File("langdetect-custom.bin"));]]>
</programlisting>
</para>
</section>
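
For context, once serialized the model can be loaded back and queried through the LanguageDetector API. A minimal sketch, assuming the langdetect-custom.bin file name from the listings above; the input string is illustrative (classes are from opennlp.tools.langdetect):

LanguageDetectorModel model = new LanguageDetectorModel(new File("langdetect-custom.bin"));
LanguageDetector detector = new LanguageDetectorME(model);
// Predict the single most likely language of a text fragment.
Language best = detector.predictLanguage("Bonjour tout le monde !");
System.out.println(best.getLang() + " " + best.getConfidence());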
116 changes: 58 additions & 58 deletions opennlp-docs/src/docbkx/lemmatizer.xml
@@ -41,31 +41,31 @@
<para>
<screen>
<![CDATA[
-$ opennlp LemmatizerME en-lemmatizer.bin < sentences]]>
+$ opennlp LemmatizerME opennlp-en-ud-ewt-lemmas-1.2-2.5.0.bin < sentences]]>
</screen>
The Lemmatizer now reads one POS-tagged sentence per line from
standard input. For example, you can copy this sentence to the
console:
<screen>
<![CDATA[
-Rockwell_NNP International_NNP Corp._NNP 's_POS Tulsa_NNP unit_NN said_VBD it_PRP
-signed_VBD a_DT tentative_JJ agreement_NN extending_VBG its_PRP$ contract_NN with_IN
-Boeing_NNP Co._NNP to_TO provide_VB structural_JJ parts_NNS for_IN Boeing_NNP 's_POS
-747_CD jetliners_NNS ._.]]>
+Rockwell_PROPN International_ADJ Corp_NOUN 's_PUNCT Tulsa_PROPN unit_NOUN said_VERB it_PRON
+signed_VERB a_DET tentative_NOUN agreement_NOUN extending_VERB its_PRON contract_NOUN
+with_ADP Boeing_PROPN Co._NOUN to_PART provide_VERB structural_ADJ parts_NOUN for_ADP
+Boeing_PROPN 's_PUNCT 747_NUM jetliners_NOUN ._PUNCT]]>
</screen>
The Lemmatizer will now echo the lemma for each word/POS-tag pair to
the console:
<screen>
<![CDATA[
-Rockwell NNP rockwell
-International NNP international
-Corp. NNP corp.
-'s POS 's
-Tulsa NNP tulsa
-unit NN unit
-said VBD say
-it PRP it
-signed VBD sign
+Rockwell PROPN rockwell
+International ADJ international
+Corp NOUN corp
+'s PUNCT 's
+Tulsa PROPN tulsa
+unit NOUN unit
+said VERB say
+it PRON it
+signed VERB sign
...
]]>
</screen>
@@ -89,7 +89,7 @@ signed VBD sign
<programlisting language="java">
<![CDATA[
LemmatizerModel model = null;
try (InputStream modelIn = new FileInputStream("en-lemmatizer.bin"))) {
try (InputStream modelIn = new FileInputStream("opennlp-en-ud-ewt-lemmas-1.2-2.5.0.bin"))) {
model = new LemmatizerModel(modelIn);
}
]]>
@@ -116,10 +116,10 @@ String[] tokens = new String[] { "Rockwell", "International", "Corp.", "'s",
"provide", "structural", "parts", "for", "Boeing", "'s", "747",
"jetliners", "." };
String[] postags = new String[] { "NNP", "NNP", "NNP", "POS", "NNP", "NN",
"VBD", "PRP", "VBD", "DT", "JJ", "NN", "VBG", "PRP$", "NN", "IN",
"NNP", "NNP", "TO", "VB", "JJ", "NNS", "IN", "NNP", "POS", "CD", "NNS",
"." };
String[] postags = new String[] { "PROPN", "ADJ", "NOUN", "PUNCT", "PROPN", "NOUN",
"VERB", "PRON", "VERB", "DET", "NOUN", "NOUN", "VERB", "PRON", "NOUN", "ADP",
"PROPN", "NOUN", "PART", "VERB", "ADJ", "NOUN", "ADP", "PROPN", "PUNCT", "NUM", "NOUN",
"PUNCT" };
String[] lemmas = lemmatizer.lemmatize(tokens, postags);]]>
</programlisting>
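
A minimal sketch tying the listings above together, assuming the model, tokens and postags variables declared there:

LemmatizerME lemmatizer = new LemmatizerME(model);
String[] lemmas = lemmatizer.lemmatize(tokens, postags);
// Print one token/tag/lemma triple per line, mirroring the CLI output above.
for (int i = 0; i < tokens.length; i++) {
  System.out.println(tokens[i] + " " + postags[i] + " " + lemmas[i]);
}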
@@ -136,31 +136,31 @@ String[] lemmas = lemmatizer.lemmatize(tokens, postags);]]>
corresponding lemma, each column separated by a tab character.
<screen>
<![CDATA[
-show NN show
-showcase NN showcase
-showcases NNS showcase
-showdown NN showdown
-showdowns NNS showdown
-shower NN shower
-showers NNS shower
-showman NN showman
-showmanship NN showmanship
-showmen NNS showman
-showroom NN showroom
-showrooms NNS showroom
-shows NNS show
-shrapnel NN shrapnel
+show NOUN show
+showcase NOUN showcase
+showcases NOUN showcase
+showdown NOUN showdown
+showdowns NOUN showdown
+shower NOUN shower
+showers NOUN shower
+showman NOUN showman
+showmanship NOUN showmanship
+showmen NOUN showman
+showroom NOUN showroom
+showrooms NOUN showroom
+shows NOUN show
+shrapnel NOUN shrapnel
]]>
</screen>
Alternatively, if a (word,postag) pair can output multiple lemmas, the
lemmatizer dictionary would consist of a text file containing, for
each row, a word, its postag and the corresponding lemmas separated by "#":
<screen>
<![CDATA[
-muestras NN muestra
-cantaba V cantar
-fue V ir#ser
-entramos V entrar
+muestras NOUN muestra
+cantaba VERB cantar
+fue VERB ir#ser
+entramos VERB entrar
]]>
</screen>
First the dictionary must be loaded into memory from disk or another
@@ -170,7 +170,7 @@
<![CDATA[
InputStream dictLemmatizer = null;
try (dictLemmatizer = new FileInputStream("english-lemmatizer.txt")) {
try (dictLemmatizer = new FileInputStream("english-dict-lemmatizer.txt")) {
}
]]>
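
As printed, the snippet assigns to a pre-declared variable inside try-with-resources, which does not compile. A corrected sketch under the same file name, with tokens and postags as in the earlier listings:

try (InputStream in = new FileInputStream("english-dict-lemmatizer.txt")) {
  // DictionaryLemmatizer consumes the tab-separated word/postag/lemma rows shown above.
  DictionaryLemmatizer lemmatizer = new DictionaryLemmatizer(in);
  String[] lemmas = lemmatizer.lemmatize(tokens, postags);
}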
@@ -217,22 +217,22 @@ String[] lemmas = lemmatizer.lemmatize(tokens, postags);
Sample sentence of the training data:
<screen>
<![CDATA[
-He PRP he
-reckons VBZ reckon
-the DT the
-current JJ current
-accounts NNS account
-deficit NN deficit
-will MD will
-narrow VB narrow
-to TO to
-only RB only
+He PRON he
+reckons VERB reckon
+the DET the
+current ADJ current
+accounts NOUN account
+deficit NOUN deficit
+will AUX will
+narrow VERB narrow
+to PART to
+only ADV only
# # #
-1.8 CD 1.8
-millions CD million
-in IN in
-September NNP september
-. . O]]>
+1.8 NUM 1.8
+millions NOUN million
+in ADP in
+September PROPN september
+. PUNCT O]]>
</screen>
The Universal Dependencies Treebank and the CoNLL 2009 datasets
distribute training data for many languages.
@@ -267,11 +267,11 @@ Arguments description:
</screen>
It is now assumed that the English lemmatizer model should be trained
from a file called
-'en-lemmatizer.train' which is encoded as UTF-8. The following command will train the
-lemmatizer and write the model to en-lemmatizer.bin:
+'en-custom-lemmatizer.train' which is encoded as UTF-8. The following command will train the
+lemmatizer and write the model to en-custom-lemmatizer.bin:
<screen>
<![CDATA[
-$ opennlp LemmatizerTrainerME -model en-lemmatizer.bin -params PerceptronTrainerParams.txt -lang en -data en-lemmatizer.train -encoding UTF-8]]>
+$ opennlp LemmatizerTrainerME -model en-custom-lemmatizer.bin -params PerceptronTrainerParams.txt -lang en -data en-custom-lemmatizer.train -encoding UTF-8]]>
</screen>
</para>
</section>
@@ -294,7 +294,7 @@ $ opennlp LemmatizerTrainerME -model en-lemmatizer.bin -params PerceptronTrainer
InputStreamFactory inputStreamFactory = null;
try {
inputStreamFactory = new MarkableFileInputStreamFactory(
-new File(en-lemmatizer.train));
+new File("en-custom-lemmatizer.train"));
} catch (FileNotFoundException e) {
e.printStackTrace();
}
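
A sketch of how training could continue from the inputStreamFactory above; the trainer settings shown are illustrative defaults, not the manual's exact listing:

try (ObjectStream<String> lineStream = new PlainTextByLineStream(inputStreamFactory, StandardCharsets.UTF_8);
     ObjectStream<LemmaSample> sampleStream = new LemmaSampleStream(lineStream)) {
  // Train a maxent lemmatizer model and write it to disk, as the CLI command does.
  LemmatizerModel model = LemmatizerME.train("en", sampleStream,
      TrainingParameters.defaultParams(), new LemmatizerFactory());
  model.serialize(new File("en-custom-lemmatizer.bin"));
}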
@@ -345,7 +345,7 @@ InputStreamFactory inputStreamFactory = null;
The following command shows how the tool can be run:
<screen>
<![CDATA[
-$ opennlp LemmatizerEvaluator -model en-lemmatizer.bin -data en-lemmatizer.test -encoding utf-8]]>
+$ opennlp LemmatizerEvaluator -model en-custom-lemmatizer.bin -data en-custom-lemmatizer.test -encoding utf-8]]>
</screen>
This will display the resulting accuracy score, e.g.:
<screen>
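
Behind this CLI, the same evaluation is reachable from Java. A sketch, assuming a lemmatizer instance and a sampleStream wrapping en-custom-lemmatizer.test:

LemmatizerEvaluator evaluator = new LemmatizerEvaluator(lemmatizer);
evaluator.evaluate(sampleStream);
// Word accuracy is the score the CLI tool reports.
System.out.println("Accuracy: " + evaluator.getWordAccuracy());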
21 changes: 11 additions & 10 deletions opennlp-docs/src/docbkx/postagger.xml
@@ -41,7 +41,7 @@ under the License.
Download the English maxent pos model and start the POS Tagger Tool with this command:
<screen>
<![CDATA[
-$ opennlp POSTagger en-pos-maxent.bin]]>
+$ opennlp POSTagger opennlp-en-ud-ewt-pos-1.2-2.5.0.bin]]>
</screen>
The POS Tagger now reads a tokenized sentence per line from stdin.
Copy these two sentences to the console:
@@ -53,9 +53,9 @@ Mr. Vinken is chairman of Elsevier N.V. , the Dutch publishing group .]]>
The POS Tagger will now echo the sentences with pos tags to the console:
<screen>
<![CDATA[
-Pierre_NNP Vinken_NNP ,_, 61_CD years_NNS old_JJ ,_, will_MD join_VB the_DT board_NN as_IN
-a_DT nonexecutive_JJ director_NN Nov._NNP 29_CD ._.
-Mr._NNP Vinken_NNP is_VBZ chairman_NN of_IN Elsevier_NNP N.V._NNP ,_, the_DT Dutch_NNP publishing_VBG group_NN]]>
+Pierre_PROPN Vinken_PROPN ,_PUNCT 61_NUM years_NOUN old_ADJ ,_PUNCT will_AUX join_VERB the_DET board_NOUN as_ADP
+a_DET nonexecutive_ADJ director_NOUN Nov._PROPN 29_NUM ._PUNCT
+Mr._PROPN Vinken_PROPN is_AUX chairman_NOUN of_ADP Elsevier_ADJ N.V._PROPN ,_PUNCT the_DET Dutch_PROPN publishing_VERB group_NOUN .]]>
</screen>
The tag set used by the English UD pos model is the <ulink url="https://universaldependencies.org/u/pos/">Universal POS tag set</ulink>.
</para>
@@ -69,7 +69,7 @@ Mr._NNP Vinken_NNP is_VBZ chairman_NN of_IN Elsevier_NNP N.V._NNP ,_, the_DT Dut
In the sample below it is loaded from disk.
<programlisting language="java">
<![CDATA[
try (InputStream modelIn = new FileInputStream("en-pos-maxent.bin"){
try (InputStream modelIn = new FileInputStream("opennlp-en-ud-ewt-pos-1.2-2.5.0.bin"){
POSModel model = new POSModel(modelIn);
}]]>
</programlisting>
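
Once the model is loaded, tagging is one call. A minimal sketch, with model from the listing above; the sample tokens are illustrative:

POSTaggerME tagger = new POSTaggerME(model);
String[] sent = new String[] { "Most", "large", "cities", "in", "the", "US", "had",
    "morning", "and", "afternoon", "newspapers", "." };
// Returns one UD tag per input token.
String[] tags = tagger.tag(sent);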
@@ -125,8 +125,8 @@ Sequence[] topSequences = tagger.topKSequences(sent);]]>
The native POS Tagger training material looks like this:
<screen>
<![CDATA[
-About_IN 10_CD Euro_NNP ,_, I_PRP reckon_VBP ._.
-That_DT sounds_VBZ good_JJ ._.]]>
+About_ADV 10_NUM Euro_PROPN ,_PUNCT I_PRON reckon_VERB ._PUNCT
+That_PRON sounds_VERB good_ADJ ._PUNCT]]>
</screen>
Each sentence must be on one line. The token/tag pairs are combined with "_".
The token/tag pairs are whitespace separated. The data format does not
@@ -180,8 +180,8 @@ Arguments description:
The following command illustrates how an English part-of-speech model can be trained:
<screen>
<![CDATA[
-$ opennlp POSTaggerTrainer -type maxent -model en-pos-maxent.bin \
-    -lang en -data en-pos.train -encoding UTF-8]]>
+$ opennlp POSTaggerTrainer -type maxent -model en-custom-pos-maxent.bin \
+    -lang en -data en-custom-pos.train -encoding UTF-8]]>
</screen>
</para>
</section>
@@ -207,7 +207,8 @@ $ opennlp POSTaggerTrainer -type maxent -model en-pos-maxent.bin \
POSModel model = null;
try {
-ObjectStream<String> lineStream = new PlainTextByLineStream(new MarkableFileInputStreamFactory(new File("en-pos.train")), StandardCharsets.UTF_8);
+ObjectStream<String> lineStream = new PlainTextByLineStream(
+    new MarkableFileInputStreamFactory(new File("en-custom-pos.train")), StandardCharsets.UTF_8);
ObjectStream<POSSample> sampleStream = new WordTagSampleStream(lineStream);
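
A sketch of how the training could conclude from the sampleStream above; the trainer settings are illustrative defaults:

POSModel model = POSTaggerME.train("en", sampleStream,
    TrainingParameters.defaultParams(), new POSTaggerFactory());
// Persist the trained model, matching the CLI command earlier in this section.
model.serialize(new File("en-custom-pos-maxent.bin"));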
17 changes: 8 additions & 9 deletions opennlp-docs/src/docbkx/sentdetect.xml
@@ -63,13 +63,13 @@ Rudolph Agnew, 55 years old and former chairman of Consolidated Gold Fields PLC,
Download the English sentence detector model and start the Sentence Detector Tool with this command:
<screen>
<![CDATA[
-$ opennlp SentenceDetector en-sent.bin]]>
+$ opennlp SentenceDetector opennlp-en-ud-ewt-sentence-1.2-2.5.0.bin]]>
</screen>
Just copy the sample text from above to the console. The Sentence Detector will read it and echo one sentence per line.
Usually the input is read from a file and the output is redirected to another file. This can be achieved with the following command.
<screen>
<![CDATA[
-$ opennlp SentenceDetector en-sent.bin < input.txt > output.txt]]>
+$ opennlp SentenceDetector opennlp-en-ud-ewt-sentence-1.2-2.5.0.bin < input.txt > output.txt]]>
</screen>
For the English sentence model from the website, the input text should not be tokenized.
</para>
@@ -81,8 +81,7 @@ $ opennlp SentenceDetector en-sent.bin < input.txt > output.txt]]>
To instantiate the Sentence Detector the sentence model must be loaded first.
<programlisting language="java">
<![CDATA[
try (InputStream modelIn = new FileInputStream("en-sent.bin")) {
try (InputStream modelIn = new FileInputStream("opennlp-en-ud-ewt-sentence-1.2-2.5.0.bin")) {
SentenceModel model = new SentenceModel(modelIn);
}]]>
</programlisting>
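
With the model loaded, detection itself is one call. A minimal sketch; the input string is illustrative:

SentenceDetectorME sentenceDetector = new SentenceDetectorME(model);
// Returns one string per detected sentence.
String[] sentences = sentenceDetector.sentDetect(
    "  First we wrote the manual. Then we trained the models.  ");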
@@ -148,7 +147,7 @@ Arguments description:
To train an English sentence detector use the following command:
<screen>
<![CDATA[
-$ opennlp SentenceDetectorTrainer -model en-sent.bin -lang en -data en-sent.train -encoding UTF-8
+$ opennlp SentenceDetectorTrainer -model en-custom-sent.bin -lang en -data en-custom-sent.train -encoding UTF-8
]]>
</screen>
It should produce the following output:
@@ -183,7 +182,7 @@ Performing 100 iterations.
99: .. loglikelihood=-284.24296917223916 0.9834118369854598
100: .. loglikelihood=-283.2785335773966 0.9834118369854598
Wrote sentence detector model.
-Path: en-sent.bin
+Path: en-custom-sent.bin
]]>
</screen>
</para>
@@ -209,7 +208,7 @@ Path: en-sent.bin
<![CDATA[
ObjectStream<String> lineStream =
-new PlainTextByLineStream(new MarkableFileInputStreamFactory(new File("en-sent.train")), StandardCharsets.UTF_8);
+new PlainTextByLineStream(new MarkableFileInputStreamFactory(new File("en-custom-sent.train")), StandardCharsets.UTF_8);
SentenceModel model;
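
A sketch of the training call that could follow, continuing from lineStream above; the factory arguments shown are illustrative defaults:

try (ObjectStream<SentenceSample> sampleStream = new SentenceSampleStream(lineStream)) {
  // Train with default parameters, no abbreviation dictionary and default EOS characters.
  model = SentenceDetectorME.train("en", sampleStream,
      new SentenceDetectorFactory("en", true, null, null), TrainingParameters.defaultParams());
}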
@@ -235,7 +234,7 @@ try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(model
The command shows how the evaluator tool can be run:
<screen>
<![CDATA[
-$ opennlp SentenceDetectorEvaluator -model en-sent.bin -data en-sent.eval -encoding UTF-8
+$ opennlp SentenceDetectorEvaluator -model en-custom-sent.bin -data en-custom-sent.eval -encoding UTF-8
Loading model ... done
Evaluating ... done
@@ -244,7 +243,7 @@ Precision: 0.9465737514518002
Recall: 0.9095982142857143
F-Measure: 0.9277177006260672]]>
</screen>
-The en-sent.eval file has the same format as the training data.
+The en-custom-sent.eval file has the same format as the training data.
</para>
</section>
</section>
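
The evaluator is also reachable from Java. A sketch, assuming the sentenceDetector instance from earlier and a sampleStream wrapping en-custom-sent.eval:

SentenceDetectorEvaluator evaluator = new SentenceDetectorEvaluator(sentenceDetector);
evaluator.evaluate(sampleStream);
// FMeasure carries the precision, recall and F-measure values the CLI prints.
System.out.println(evaluator.getFMeasure());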