Skip to content

Commit

Permalink
Fixes bug reported in JuliaText/TextAnalysis.jl#149
Browse files Browse the repository at this point in the history
  • Loading branch information
zgornel committed May 7, 2019
1 parent b250d15 commit db5bed3
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 2 deletions.
2 changes: 1 addition & 1 deletion src/stemmer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ end

function stem!(stemmer::Stemmer, d::NGramDocument)
for token in keys(d.ngrams)
new_token = stem(stemmer, token)
new_token = join(stem(stemmer, split(token)), " ")
if new_token != token
if haskey(d.ngrams, new_token)
d.ngrams[new_token] = d.ngrams[new_token] + d.ngrams[token]
Expand Down
2 changes: 1 addition & 1 deletion test/preprocessing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
map(x->prepare!(x, strip_everything_stem), [sdoc, ndoc, tdoc, crps])
@test prepare(poem_no_1, strip_everything_stem) == "pin"
@test text(sdoc) == "pin"
@test ngrams(ndoc) == Dict("tag"=>2,"pin "=>1,"hold"=>1,"thrill "=>1)
@test ngrams(ndoc) == Dict("tag"=>2,"pin"=>1,"hold"=>1,"thrill"=>1)
@test tokens(tdoc) == ["pin ", "tag", "hold", "tag", "thrill "]
@test text(crps[1]) == "pin"
# Flag generation
Expand Down
11 changes: 11 additions & 0 deletions test/stemmer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -75,4 +75,15 @@
@test try stem!(fdoc); false
catch; true end

# Test 3-gram document: after stemming, every 1-, 2- and 3-gram key of
# "parts of language" is expected to appear in its word-by-word stemmed form.
doc = NGramDocument("parts of language", DocumentMetadata(), 3)
stem!(doc)
# One `@test` per expected key: a single `@test begin ... end` evaluates to
# the value of its last expression only, so the earlier membership checks
# would be computed and silently discarded.
@test "of" in keys(doc.ngrams)
@test "languag" in keys(doc.ngrams)
@test "part of languag" in keys(doc.ngrams)
@test "part of" in keys(doc.ngrams)
@test "part" in keys(doc.ngrams)
@test "of languag" in keys(doc.ngrams)
end

0 comments on commit db5bed3

Please sign in to comment.