Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add WordNet lemmatizer #28

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions spec/cadmium/lemmatizer.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
require "./lemmatizer/*"
25 changes: 25 additions & 0 deletions spec/cadmium/lemmatizer/wordnet_spec.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
require "../../spec_helper"

describe Cadmium::Lemmatizer::WordNetLemmatizer do
  subject = Cadmium::Lemmatizer::WordNetLemmatizer

  # Fixed typo in the example description: "preform" -> "perform".
  it "should perform lemmatization" do
    subject.lemmatize("dogs").should eq("dog")
    subject.lemmatize("churches").should eq("church")
    subject.lemmatize("aardwolves").should eq("aardwolf")
    subject.lemmatize("abaci").should eq("abacus")
    # Unknown words fall back to the input token unchanged.
    subject.lemmatize("hardrock").should eq("hardrock")
  end

  it "should lemmatize with String#lemmatize" do
    "dogs".lemmatize.should eq("dog")
  end

  it "should tokenize and lemmatize with String#tokenize_and_lemmatize" do
    # Stop words ("my", "are", "to", ...) are dropped by default.
    "My dogs are very fun TO play with And another thing, he is A poodle.".tokenize_and_lemmatize.should eq(["dog", "fun", "play", "poodle"])
  end

  it "should tokenize and lemmatize including stopwords" do
    "My dog is very fun TO play with And another thing, he is A poodle.".tokenize_and_lemmatize(keep_stops: true).should eq(["my", "dog", "be", "very", "fun", "to", "play", "with", "and", "another", "thing", "he", "be", "a", "poodle"])
  end
end
13 changes: 13 additions & 0 deletions src/cadmium/lemmatizer.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
require "./lemmatizer/*"

module Cadmium
  # Convenience methods mixed into String so callers can lemmatize
  # text directly instead of going through the lemmatizer module.
  module StringExtension
    # Returns the lemma of this string, delegating to *lemmatizer*
    # (WordNet-backed by default).
    def lemmatize(lemmatizer = Cadmium::Lemmatizer::WordNetLemmatizer)
      lemmatizer.lemmatize(self)
    end

    # Tokenizes this string and lemmatizes every token; stop words are
    # dropped unless *keep_stops* is true.
    def tokenize_and_lemmatize(lemmatizer = Cadmium::Lemmatizer::WordNetLemmatizer, keep_stops = false)
      lemmatizer.tokenize_and_lemmatize(self, keep_stops)
    end
  end
end
35 changes: 35 additions & 0 deletions src/cadmium/lemmatizer/wordnet.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
require "../wordnet"
require "../i18n/stop_words"
require "../tokenizer/aggressive_tokenizer"

module Cadmium
  module Lemmatizer
    # Lemmatizer backed by the WordNet morphological analyzer (morphy).
    module WordNetLemmatizer
      # Provides @@stop_words used below — presumably a Set/Array of
      # lowercase stop words; confirm against Cadmium::I18n::StopWords.
      include Cadmium::I18n::StopWords

      # Returns the lemma for *token*.
      #
      # When *pos* (part of speech) is given, WordNet's morphy lookup is
      # restricted to that POS. If WordNet yields no lemmas, the token is
      # returned unchanged; when several lemmas are found, the shortest
      # one is chosen.
      def self.lemmatize(token, pos : Symbol | String? = nil)
        lemmas = pos ? WordNet.morphy(token, pos) : WordNet.morphy(token)
        lemmas.empty? ? token : lemmas.min_by(&.size)
      end

      # Downcases *text*, tokenizes it with the aggressive tokenizer, and
      # lemmatizes each token. Stop words are discarded unless
      # *keep_stops* is true. Returns an Array(String) of lemmas.
      def self.tokenize_and_lemmatize(text, keep_stops = false)
        tokens = Cadmium::AggressiveTokenizer.new.tokenize(text.downcase)
        # Filter stop words first, then lemmatize in a single map —
        # replaces the original duplicated each/push accumulation.
        tokens = tokens.reject { |token| @@stop_words.includes?(token) } unless keep_stops
        tokens.map { |token| lemmatize(token) }
      end
    end
  end
end