bug fixes for DTM constructor and for remove_patterns (#94)

* Fixed the case where DocumentTermMatrix(crps, lex) would construct a dtm of the wrong dimensions if a term provided in lex does not occur in the crps.
* Added a DocumentTermMatrix constructor that takes a crps and a prespecified terms vector.
* Fixed remove_patterns to use nextind() to find the starting position of the next unstripped substring. Closes #92.
* Removed a leftover info() statement in the prepare! test.
JuliaText · Aug 30, 2018 · 9e425f5 · 9e425f5
1 parent fbb18a7
commit 9e425f5
Show file tree

Hide file tree

Showing 4 changed files with 34 additions and 14 deletions.
diff --git a/src/dtm.jl b/src/dtm.jl
@@ -26,14 +26,16 @@ function columnindices(terms::Vector{String})
     column_indices
 end
 
-function DocumentTermMatrix(crps::Corpus, lex)
-    terms = sort(collect(keys(lex)))
+function DocumentTermMatrix(crps::Corpus, terms::Vector{String})
     column_indices = columnindices(terms)
 
+    m = length(crps)
+    n = length(terms)
+
     rows = Array{Int}(0)
     columns = Array{Int}(0)
     values = Array{Int}(0)
-    for i in 1:length(crps)
+    for i in 1:m
         doc = crps.documents[i]
         ngs = ngrams(doc)
         for ngram in keys(ngs)
@@ -47,14 +49,16 @@ function DocumentTermMatrix(crps::Corpus, lex)
         end
     end
     if length(rows) > 0
-        dtm = sparse(rows, columns, values)
+        dtm = sparse(rows, columns, values, m, n)
     else
-        dtm = spzeros(Int, length(crps), 0)
+        dtm = spzeros(Int, m, n)
     end
     DocumentTermMatrix(dtm, terms, column_indices)
 end
 DocumentTermMatrix(crps::Corpus) = DocumentTermMatrix(crps, lexicon(crps))
 
+DocumentTermMatrix(crps::Corpus, lex::Associative) = DocumentTermMatrix(crps, sort(collect(keys(lex))))
+
 DocumentTermMatrix(dtm::SparseMatrixCSC{Int, Int},terms::Vector{String}) = DocumentTermMatrix(dtm, terms, columnindices(terms))
 
 ##############################################################################
@@ -99,7 +103,7 @@ function dtm_entries(d::AbstractDocument, lex::Dict{String, Int})
     values = Array{Int}(0)
     terms = sort(collect(keys(lex)))
     column_indices = columnindices(terms)
-    
+
     for ngram in keys(ngs)
         if haskey(column_indices, ngram)
             push!(indices, column_indices[ngram])

diff --git a/src/preprocessing.jl b/src/preprocessing.jl
@@ -258,7 +258,7 @@ function remove_patterns(s::AbstractString, rex::Regex)
             Base.write_sub(iob, v, ibegin, len)
             write(iob, ' ')
         end
-        ibegin = m.endof+m.offset+1
+        ibegin = nextind(s, m.endof+m.offset)
     end
     len = length(v) - ibegin + 1
     (len > 0) && Base.write_sub(iob, v, ibegin, len)
@@ -276,7 +276,7 @@ function remove_patterns{T <: String}(s::SubString{T}, rex::Regex)
             Base.write_sub(iob, data, ibegin+ioffset, len)
             write(iob, ' ')
         end
-        ibegin = m.endof+m.offset+1
+        ibegin = nextind(s, m.endof+m.offset)
     end
     len = s.endof - ibegin + 1
     (len > 0) && Base.write_sub(iob, data, ibegin+ioffset, len)

diff --git a/test/dtm.jl b/test/dtm.jl
@@ -31,6 +31,20 @@
     tdm(crps)
     hash_tdm(crps)
 
+    # construct a DocumentTermMatrix from a crps and a custom terms vector
+    terms = ["And", "notincrps"]
+    m = DocumentTermMatrix(crps,terms)
+    @test size(dtm(m),1) == length(terms)
+    @test terms == m.terms
+    @test size(dtm(m),2) == length(crps)
+
+    # construct a DocumentTermMatrix from a crps and a custom lexicon
+    lex = Dict("And"=>1, "notincrps"=>4)
+    m = DocumentTermMatrix(crps,lex)
+    @test size(dtm(m),1) == length(keys(lex))
+    @test size(dtm(m),1) == length(m.terms)
+    @test size(dtm(m),2) == length(crps)
+
     # construct a DocumentTermMatrix from a dtm and terms vector
     terms = m.terms
     m2 = DocumentTermMatrix(dtm1,terms)

diff --git a/test/preprocessing.jl b/test/preprocessing.jl
@@ -1,26 +1,28 @@
 
 @testset "Preprocessing" begin
 
-    sample_text1 = "This is 1 MESSED UP string!"
-    sample_text1_wo_punctuation = "This is 1 MESSED UP string"
-    sample_text1_wo_punctuation_numbers = "This is  MESSED UP string"
-    sample_text1_wo_punctuation_numbers_case = "this is  messed up string"
+    sample_text1 = "This is 1 MESSED υπ string!"
+    sample_text1_wo_punctuation = "This is 1 MESSED υπ string"
+    sample_text1_wo_punctuation_numbers = "This is  MESSED υπ string"
+    sample_text1_wo_punctuation_numbers_case = "this is  messed υπ string"
+    sample_text1_wo_punctuation_numbers_case_az = "this is  messed  string"
 
     sample_texts = [
         sample_text1,
         sample_text1_wo_punctuation,
         sample_text1_wo_punctuation_numbers,
         sample_text1_wo_punctuation_numbers_case,
+        sample_text1_wo_punctuation_numbers_case_az
     ]
 
     # This idiom is _really_ ugly since "OR" means "AND" here.
     for str in sample_texts
         sd = StringDocument(str)
         prepare!(
             sd,
-            strip_punctuation | strip_numbers | strip_case | strip_whitespace
+            strip_punctuation | strip_numbers | strip_case | strip_whitespace | strip_non_letters
         )
-        @test isequal(strip(sd.text), "this is messed up string")
+        @test isequal(strip(sd.text), "this is messed string")
     end
 
     # Need to only remove words at word boundaries