Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add WordNet lemmatizer #28

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions spec/cadmium/lemmatizer.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
require "./lemmatizer/*"
25 changes: 25 additions & 0 deletions spec/cadmium/lemmatizer/wordnet_spec.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
require "../../spec_helper"

describe Cadmium::Lemmatizer::WordNetLemmatizer do
  subject = Cadmium::Lemmatizer::WordNetLemmatizer

  # Fixed typo in the example description: "preform" -> "perform".
  it "should perform lemmatization" do
    subject.lemmatize("dogs").should eq("dog")
    subject.lemmatize("churches").should eq("church")
    subject.lemmatize("aardwolves").should eq("aardwolf")
    subject.lemmatize("abaci").should eq("abacus")
    # Unknown words fall back to the input token unchanged.
    subject.lemmatize("hardrock").should eq("hardrock")
  end

  it "should lemmatize with String#lemmatize" do
    "dogs".lemmatize.should eq("dog")
  end

  it "should tokenize and lemmatize with String#tokenize_and_lemmatize" do
    # Stop words ("my", "are", "to", ...) are dropped by default.
    "My dogs are very fun TO play with And another thing, he is A poodle.".tokenize_and_lemmatize.should eq(["dog", "fun", "play", "poodle"])
  end

  it "should tokenize and lemmatize including stopwords" do
    "My dog is very fun TO play with And another thing, he is A poodle.".tokenize_and_lemmatize(keep_stops: true).should eq(["my", "dog", "be", "very", "fun", "to", "play", "with", "and", "another", "thing", "he", "be", "a", "poodle"])
  end
end
13 changes: 13 additions & 0 deletions src/cadmium/lemmatizer.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
require "./lemmatizer/*"

module Cadmium
  # Convenience methods mixed into String so callers can lemmatize
  # text directly instead of going through the lemmatizer module.
  module StringExtension
    # Returns the lemma of this string, delegating to *lemmatizer*
    # (WordNet-backed by default).
    def lemmatize(lemmatizer = Cadmium::Lemmatizer::WordNetLemmatizer)
      lemmatizer.lemmatize(self)
    end

    # Tokenizes this string and lemmatizes every token; stop words are
    # dropped unless *keep_stops* is true.
    def tokenize_and_lemmatize(lemmatizer = Cadmium::Lemmatizer::WordNetLemmatizer, keep_stops = false)
      lemmatizer.tokenize_and_lemmatize(self, keep_stops)
    end
  end
end
35 changes: 35 additions & 0 deletions src/cadmium/lemmatizer/wordnet.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
require "../wordnet"
require "../i18n/stop_words"
require "../tokenizer/aggressive_tokenizer"

module Cadmium
  module Lemmatizer
    # Lemmatizer backed by the WordNet morphological analyzer (morphy).
    module WordNetLemmatizer
      # Provides @@stop_words used below — presumably a Set/Array of
      # lowercase stop words; confirm against Cadmium::I18n::StopWords.
      include Cadmium::I18n::StopWords

      # Returns the lemma for *token*.
      #
      # When *pos* (part of speech) is given, WordNet's morphy lookup is
      # restricted to that POS. If WordNet yields no lemmas, the token is
      # returned unchanged; when several lemmas are found, the shortest
      # one is chosen.
      def self.lemmatize(token, pos : Symbol | String? = nil)
        lemmas = pos ? WordNet.morphy(token, pos) : WordNet.morphy(token)
        lemmas.empty? ? token : lemmas.min_by(&.size)
      end

      # Downcases *text*, tokenizes it with the aggressive tokenizer, and
      # lemmatizes each token. Stop words are discarded unless
      # *keep_stops* is true. Returns an Array(String) of lemmas.
      def self.tokenize_and_lemmatize(text, keep_stops = false)
        tokens = Cadmium::AggressiveTokenizer.new.tokenize(text.downcase)
        # Filter stop words first, then lemmatize in a single map —
        # replaces the original duplicated each/push accumulation.
        tokens = tokens.reject { |token| @@stop_words.includes?(token) } unless keep_stops
        tokens.map { |token| lemmatize(token) }
      end
    end
  end
end