Skip to content

Commit

Permalink
bug fixes for DTM constructor and for remove_patterns (#94)
Browse files Browse the repository at this point in the history
* Fixed the case where DocumentTermMatrix(crps, lex) would construct a dtm with the wrong dimensions when a term provided in lex does not occur in the crps

* added DocumentTermMatrix constructor that takes a crps and a prespecified terms vector

* fixed remove_patterns to use nextind() to find the starting position of the next unstripped substring. Closes #92

* removed leftover info() statement in prepare! test
  • Loading branch information
AsafManela authored and aviks committed Aug 30, 2018
1 parent fbb18a7 commit 9e425f5
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 14 deletions.
16 changes: 10 additions & 6 deletions src/dtm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,16 @@ function columnindices(terms::Vector{String})
column_indices
end

function DocumentTermMatrix(crps::Corpus, lex)
terms = sort(collect(keys(lex)))
function DocumentTermMatrix(crps::Corpus, terms::Vector{String})
column_indices = columnindices(terms)

m = length(crps)
n = length(terms)

rows = Array{Int}(0)
columns = Array{Int}(0)
values = Array{Int}(0)
for i in 1:length(crps)
for i in 1:m
doc = crps.documents[i]
ngs = ngrams(doc)
for ngram in keys(ngs)
Expand All @@ -47,14 +49,16 @@ function DocumentTermMatrix(crps::Corpus, lex)
end
end
if length(rows) > 0
dtm = sparse(rows, columns, values)
dtm = sparse(rows, columns, values, m, n)
else
dtm = spzeros(Int, length(crps), 0)
dtm = spzeros(Int, m, n)
end
DocumentTermMatrix(dtm, terms, column_indices)
end
"""
    DocumentTermMatrix(crps::Corpus)

Build a `DocumentTermMatrix` for `crps`, using the lexicon returned by
`lexicon(crps)` as the source of terms.
"""
function DocumentTermMatrix(crps::Corpus)
    lex = lexicon(crps)
    return DocumentTermMatrix(crps, lex)
end

"""
    DocumentTermMatrix(crps::Corpus, lex::Associative)

Build a `DocumentTermMatrix` for `crps` whose terms are the sorted keys of
`lex`. Only the keys of `lex` are used; its values are ignored here.
"""
function DocumentTermMatrix(crps::Corpus, lex::Associative)
    sorted_terms = sort(collect(keys(lex)))
    return DocumentTermMatrix(crps, sorted_terms)
end

"""
    DocumentTermMatrix(dtm::SparseMatrixCSC{Int, Int}, terms::Vector{String})

Wrap an existing sparse matrix `dtm` and its `terms` in a `DocumentTermMatrix`,
deriving the column index lookup from `columnindices(terms)`.
"""
function DocumentTermMatrix(dtm::SparseMatrixCSC{Int, Int}, terms::Vector{String})
    term_columns = columnindices(terms)
    return DocumentTermMatrix(dtm, terms, term_columns)
end

##############################################################################
Expand Down Expand Up @@ -99,7 +103,7 @@ function dtm_entries(d::AbstractDocument, lex::Dict{String, Int})
values = Array{Int}(0)
terms = sort(collect(keys(lex)))
column_indices = columnindices(terms)

for ngram in keys(ngs)
if haskey(column_indices, ngram)
push!(indices, column_indices[ngram])
Expand Down
4 changes: 2 additions & 2 deletions src/preprocessing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ function remove_patterns(s::AbstractString, rex::Regex)
Base.write_sub(iob, v, ibegin, len)
write(iob, ' ')
end
ibegin = m.endof+m.offset+1
ibegin = nextind(s, m.endof+m.offset)
end
len = length(v) - ibegin + 1
(len > 0) && Base.write_sub(iob, v, ibegin, len)
Expand All @@ -276,7 +276,7 @@ function remove_patterns{T <: String}(s::SubString{T}, rex::Regex)
Base.write_sub(iob, data, ibegin+ioffset, len)
write(iob, ' ')
end
ibegin = m.endof+m.offset+1
ibegin = nextind(s, m.endof+m.offset)
end
len = s.endof - ibegin + 1
(len > 0) && Base.write_sub(iob, data, ibegin+ioffset, len)
Expand Down
14 changes: 14 additions & 0 deletions test/dtm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,20 @@
tdm(crps)
hash_tdm(crps)

# Construct a DocumentTermMatrix from a crps and a custom terms vector.
# The constructor sizes the sparse matrix as length(crps) x length(terms), so
# (assuming `dtm(m)` returns that matrix unchanged) rows are checked against the
# corpus and columns against the terms. "notincrps" does not occur in the
# corpus, so the column check guards against such terms being dropped.
terms = ["And", "notincrps"]
m = DocumentTermMatrix(crps,terms)
@test size(dtm(m),1) == length(crps)
@test terms == m.terms
@test size(dtm(m),2) == length(terms)

# Construct a DocumentTermMatrix from a crps and a custom lexicon.
# The constructor sizes the sparse matrix as length(crps) x length(terms), so
# (assuming `dtm(m)` returns that matrix unchanged) the term count is checked
# against the columns and the document count against the rows. The key
# "notincrps" does not occur in the corpus but must still get a column.
lex = Dict("And"=>1, "notincrps"=>4)
m = DocumentTermMatrix(crps,lex)
@test size(dtm(m),2) == length(keys(lex))
@test size(dtm(m),2) == length(m.terms)
@test size(dtm(m),1) == length(crps)

# construct a DocumentTermMatrix from a dtm and terms vector
terms = m.terms
m2 = DocumentTermMatrix(dtm1,terms)
Expand Down
14 changes: 8 additions & 6 deletions test/preprocessing.jl
Original file line number Diff line number Diff line change
@@ -1,26 +1,28 @@

@testset "Preprocessing" begin

sample_text1 = "This is 1 MESSED UP string!"
sample_text1_wo_punctuation = "This is 1 MESSED UP string"
sample_text1_wo_punctuation_numbers = "This is MESSED UP string"
sample_text1_wo_punctuation_numbers_case = "this is messed up string"
sample_text1 = "This is 1 MESSED υπ string!"
sample_text1_wo_punctuation = "This is 1 MESSED υπ string"
sample_text1_wo_punctuation_numbers = "This is MESSED υπ string"
sample_text1_wo_punctuation_numbers_case = "this is messed υπ string"
sample_text1_wo_punctuation_numbers_case_az = "this is messed string"

sample_texts = [
sample_text1,
sample_text1_wo_punctuation,
sample_text1_wo_punctuation_numbers,
sample_text1_wo_punctuation_numbers_case,
sample_text1_wo_punctuation_numbers_case_az
]

# This idiom is _really_ ugly since "OR" means "AND" here.
for str in sample_texts
sd = StringDocument(str)
prepare!(
sd,
strip_punctuation | strip_numbers | strip_case | strip_whitespace
strip_punctuation | strip_numbers | strip_case | strip_whitespace | strip_non_letters
)
@test isequal(strip(sd.text), "this is messed up string")
@test isequal(strip(sd.text), "this is messed string")
end

# Need to only remove words at word boundaries
Expand Down

0 comments on commit 9e425f5

Please sign in to comment.