Skip to content

Commit

Permalink
Merge pull request #91 from baggepinnen/tests
Browse files Browse the repository at this point in the history
Update test infrastructure
  • Loading branch information
aviks authored Aug 30, 2018
2 parents 037ae95 + 2608990 commit fbb18a7
Show file tree
Hide file tree
Showing 15 changed files with 95 additions and 114 deletions.
5 changes: 3 additions & 2 deletions src/lda.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@

module Lda

type TopicBasedDocument
mutable struct TopicBasedDocument
topic::Vector{Int}
text::Vector{Int}
topicidcount::Vector{Int}
end
TopicBasedDocument(ntopics) = TopicBasedDocument(Vector{Int}(), Vector{Int}(), zeros(Int, ntopics))

type Topic
mutable struct Topic
count::Int
wordcount::Dict{Int, Int}
end
Expand Down Expand Up @@ -95,6 +95,7 @@ function lda(dtm::DocumentTermMatrix, ntopics::Int, iteration::Int, alpha::Float
end
end
end

# ϕ
# topic x word sparse matrix.
ϕ = spzeros(ntopics, number_of_words)
Expand Down
2 changes: 1 addition & 1 deletion src/summarizer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ function summarize(d::AbstractDocument; ns=5)
update_lexicon!(c)
t = tf_idf(dtm(c))
T = t * t'
p=pagerank(t*t')
p=pagerank(T)
return sentences[sort(sortperm(vec(p), rev=true)[1:ns])]
end

Expand Down
8 changes: 3 additions & 5 deletions test/corpus.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
module TestCorpus
using Base.Test
using Languages
using TextAnalysis

@testset "Corpus" begin

sample_text1 = "This is a string"
sample_text2 = "This is also a string"
Expand All @@ -18,7 +16,7 @@ module TestCorpus
documents(crps)

for doc in crps
@assert isa(doc, AbstractDocument)
@test isa(doc, AbstractDocument)
end

lexicon(crps)
Expand Down
51 changes: 24 additions & 27 deletions test/document.jl
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
module TestDocument
using Base.Test
using Languages
using TextAnalysis
using Compat

@testset "Document" begin

sample_text1 = "This is a string"
sample_text2 = "This is also a string"
Expand All @@ -13,32 +10,32 @@ module TestDocument
td = TokenDocument(sample_text1)
ngd = NGramDocument(sample_text1)

@assert isequal(text(sd), sample_text1)
@test isequal(text(sd), sample_text1)
text!(sd, sample_text2)
@assert isequal(text(sd), sample_text2)
@test isequal(text(sd), sample_text2)
text!(sd, sample_text1)
@assert isequal(text(sd), sample_text1)
@test isequal(text(sd), sample_text1)

@assert all(tokens(sd) .== ["This", "is", "a", "string"])
@assert "This" in keys(ngrams(sd, 1))
@assert "is" in keys(ngrams(sd, 1))
@assert "a" in keys(ngrams(sd, 1))
@assert "string" in keys(ngrams(sd, 1))
@test all(tokens(sd) .== ["This", "is", "a", "string"])
@test "This" in keys(ngrams(sd, 1))
@test "is" in keys(ngrams(sd, 1))
@test "a" in keys(ngrams(sd, 1))
@test "string" in keys(ngrams(sd, 1))

@assert length(sd) == 16
@test length(sd) == 16

hamlet_text = "To be or not to be..."
sd = StringDocument(hamlet_text)
@assert isa(sd, StringDocument)
@assert isequal(text(sd), hamlet_text)
@test isa(sd, StringDocument)
@test isequal(text(sd), hamlet_text)

@assert isa(fd, FileDocument)
@assert length(text(fd)) > 0
@test isa(fd, FileDocument)
@test length(text(fd)) > 0

my_tokens = ["To", "be", "or", "not", "to", "be..."]
td = TokenDocument(my_tokens)
@assert isa(td, TokenDocument)
@assert all(tokens(td) .== my_tokens)
@test isa(td, TokenDocument)
@test all(tokens(td) .== my_tokens)

my_ngrams = Dict{String,Int}()
my_ngrams["To"] = 1
Expand All @@ -48,24 +45,24 @@ module TestDocument
my_ngrams["to"] = 1
my_ngrams["be..."] = 1
ngd = NGramDocument(my_ngrams)
@assert isa(ngd, NGramDocument)
@assert "To" in keys(ngrams(ngd))
@test isa(ngd, NGramDocument)
@test "To" in keys(ngrams(ngd))

sd = StringDocument(hamlet_text)
td = TokenDocument(hamlet_text)
ngd = NGramDocument(hamlet_text)

d = Document("To be or not to be...")
@assert isa(d, StringDocument)
@test isa(d, StringDocument)
d = Document(joinpath(dirname(@__FILE__), "data", "poem.txt"))
@assert isa(d, FileDocument)
@test isa(d, FileDocument)
d = Document(["To", "be", "or", "not", "to", "be..."])
@assert isa(d, TokenDocument)
@test isa(d, TokenDocument)
ng = Dict{String,Int}()
ng["a"] = 1
ng["b"] = 3
d = Document(ng)
@assert isa(d, NGramDocument)
@test isa(d, NGramDocument)

@assert isequal(length(Document("this is text")), 12)
@test isequal(length(Document("this is text")), 12)
end
5 changes: 1 addition & 4 deletions test/dtm.jl
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
module TestDTM
using Base.Test
using Languages
using TextAnalysis

@testset "DTM" begin
sample_file = joinpath(dirname(@__FILE__), "data", "poem.txt")

fd = FileDocument(sample_file)
Expand Down
6 changes: 2 additions & 4 deletions test/lda.jl
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
module TestLDA
using Base.Test
using Languages
using TextAnalysis

@testset "LDA" begin

doc1 = "a a a sample text text"
doc2 = "another example example text text"

Expand Down
16 changes: 7 additions & 9 deletions test/metadata.jl
Original file line number Diff line number Diff line change
@@ -1,19 +1,17 @@
module TestMetadata
using Base.Test
using Languages
using TextAnalysis

@testset "Metadata" begin

sample_text1 = "This is a string"
sample_text2 = "This is also a string"
sample_file = joinpath(dirname(@__FILE__), "data", "poem.txt")

sd = StringDocument(sample_text1)

@assert isequal(name(sd), "Unnamed Document")
@assert isequal(language(sd), Languages.English())
@assert isequal(author(sd), "Unknown Author")
@assert isequal(timestamp(sd), "Unknown Time")
@test isequal(name(sd), "Unnamed Document")
@test isequal(language(sd), Languages.English())
@test isequal(author(sd), "Unknown Author")
@test isequal(timestamp(sd), "Unknown Time")

language!(sd, Languages.German())
@assert isequal(language(sd), Languages.German())
@test isequal(language(sd), Languages.German())
end
11 changes: 4 additions & 7 deletions test/ngramizer.jl
Original file line number Diff line number Diff line change
@@ -1,19 +1,16 @@
module TestNGramizer
using Base.Test
using Languages
using TextAnalysis
using Compat

@testset "NGramizer" begin

sample_text = "this is some sample text"
tkns = TextAnalysis.tokenize(Languages.English(), sample_text)
ngs = TextAnalysis.ngramize(Languages.English(), tkns, 1)
@assert isequal(ngs, Dict{String,Int}("some" => 1,
@test isequal(ngs, Dict{String,Int}("some" => 1,
"this" => 1,
"is" => 1,
"sample" => 1,
"text" => 1))
ngs = TextAnalysis.ngramize(Languages.English(), tkns, 2)
@assert isequal(ngs, Dict{String,Int}("some" => 1,
@test isequal(ngs, Dict{String,Int}("some" => 1,
"this is" => 1,
"some sample" => 1,
"is some" => 1,
Expand Down
30 changes: 14 additions & 16 deletions test/preprocessing.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
module TestPreprocessing
using Base.Test
using Languages
using TextAnalysis

@testset "Preprocessing" begin

sample_text1 = "This is 1 MESSED UP string!"
sample_text1_wo_punctuation = "This is 1 MESSED UP string"
Expand All @@ -22,50 +20,50 @@ module TestPreprocessing
sd,
strip_punctuation | strip_numbers | strip_case | strip_whitespace
)
@assert isequal(strip(sd.text), "this is messed up string")
@test isequal(strip(sd.text), "this is messed up string")
end

# Need to only remove words at word boundaries
doc = Document("this is sample text")
remove_words!(doc, ["sample"])
@assert isequal(doc.text, "this is text")
@test isequal(doc.text, "this is text")

doc = Document("this is sample text")
prepare!(doc, strip_articles)
@assert isequal(doc.text, "this is sample text")
@test isequal(doc.text, "this is sample text")

doc = Document("this is sample text")
prepare!(doc, strip_definite_articles)
@assert isequal(doc.text, "this is sample text")
@test isequal(doc.text, "this is sample text")

doc = Document("this is sample text")
prepare!(doc, strip_indefinite_articles)
@assert isequal(doc.text, "this is sample text")
@test isequal(doc.text, "this is sample text")

doc = Document("this is sample text")
prepare!(doc, strip_prepositions)
@assert isequal(doc.text, "this is sample text")
@test isequal(doc.text, "this is sample text")

doc = Document("this is sample text")
prepare!(doc, strip_pronouns)
@assert isequal(doc.text, "this is sample text")
@test isequal(doc.text, "this is sample text")

doc = Document("this is sample text")
prepare!(doc, strip_stopwords)
@assert isequal(strip(doc.text), "sample text")
@test isequal(strip(doc.text), "sample text")

doc = Document("this is sample text")
prepare!(doc, strip_whitespace)
@assert isequal(doc.text, "this is sample text")
@test isequal(doc.text, "this is sample text")

# stem!(sd)
# tag_pos!(sd)

# Do preprocessing on TokenDocument, NGramDocument, Corpus
d = NGramDocument("this is sample text")
@assert haskey(d.ngrams, "sample")
@test haskey(d.ngrams, "sample")
remove_words!(d, ["sample"])
@assert !haskey(d.ngrams, "sample")
@test !haskey(d.ngrams, "sample")

d = StringDocument(
"""
Expand All @@ -80,7 +78,7 @@ module TestPreprocessing
"""
)
remove_html_tags!(d)
@assert "Hello world" == strip(d.text)
@test "Hello world" == strip(d.text)

#Test #62
remove_corrupt_utf8("abc") == "abc"
Expand Down
46 changes: 24 additions & 22 deletions test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,27 +1,29 @@
module TestTextAnalysis
using Base.Test
using Languages
using TextAnalysis
using Base.Test
using Languages
using TextAnalysis
using Compat

my_tests = [
"tokenizer.jl",
"ngramizer.jl",
"document.jl",
"metadata.jl",
"corpus.jl",
"preprocessing.jl",
"dtm.jl",
"stemmer.jl",
"tf_idf.jl",
"lda.jl",
"summarizer.jl",
"sentiment.jl"
]

println("Running tests:")
# @testset "TextAnalysis" begin

for my_test in my_tests
println(" * $(my_test)")
include(my_test)
end
println("Running tests:")
println(typeof(Compat.String))

include("tokenizer.jl")
include("ngramizer.jl")
include("document.jl")
include("metadata.jl")
include("corpus.jl")
include("preprocessing.jl")
include("dtm.jl")
include("stemmer.jl")
include("tf_idf.jl")
include("lda.jl")
include("summarizer.jl")
include("sentiment.jl")



# end
end
6 changes: 4 additions & 2 deletions test/sentiment.jl
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
@testset "Sentiment" begin
m = SentimentAnalyzer()

d=StringDocument("a very nice thing that everyone likes")
d = StringDocument("a very nice thing that everyone likes")

m(d) > 0.5

d=StringDocument("a horrible thing that everyone hates")
d = StringDocument("a horrible thing that everyone hates")

m(d) < 0.5
end
7 changes: 2 additions & 5 deletions test/stemmer.jl
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
module TestStemmer
using Base.Test
using Languages
using TextAnalysis
using Compat

@testset "Stemmer" begin

algs = stemmer_types()
@test !isempty(algs)
Expand Down
4 changes: 3 additions & 1 deletion test/summarizer.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
@testset "Summarizer" begin
d = StringDocument("""
Discount retailer Poundworld has appointed administrators, putting 5,100 jobs at risk.
The move came after talks with a potential buyer, R Capital, collapsed leaving Poundworld with no option other than administration.
Expand All @@ -16,5 +17,6 @@ s = summarize(d)
@test length(s) == 5
@test s[1] == "Discount retailer Poundworld has appointed administrators, putting 5,100 jobs at risk."

s=summarize(d; ns=2)
s = summarize(d; ns=2)
@test length(s) == 2
end
5 changes: 1 addition & 4 deletions test/tf_idf.jl
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
module TestTFIDF

using Base.Test
using Languages
using TextAnalysis
@testset "TFIDF" begin

doc1 = "a a a sample text text"
doc2 = "another example example text text"
Expand Down
Loading

0 comments on commit fbb18a7

Please sign in to comment.