Merge pull request #91 from baggepinnen/tests

Update test infrastructure
JuliaText · Aug 30, 2018 · fbb18a7
2 parents 037ae95 + 2608990
commit fbb18a7
Show file tree

Hide file tree

Showing 15 changed files with 95 additions and 114 deletions.
diff --git a/src/lda.jl b/src/lda.jl
@@ -6,14 +6,14 @@
 
 module Lda
 
-type TopicBasedDocument
+mutable struct TopicBasedDocument
     topic::Vector{Int}
     text::Vector{Int}
     topicidcount::Vector{Int}
 end
 TopicBasedDocument(ntopics) = TopicBasedDocument(Vector{Int}(), Vector{Int}(), zeros(Int, ntopics))
 
-type Topic
+mutable struct Topic
     count::Int
     wordcount::Dict{Int, Int}
 end
@@ -95,6 +95,7 @@ function lda(dtm::DocumentTermMatrix, ntopics::Int, iteration::Int, alpha::Float
             end
         end
     end
+
     # ϕ
     # topic x word sparse matrix.
     ϕ = spzeros(ntopics, number_of_words)

diff --git a/src/summarizer.jl b/src/summarizer.jl
@@ -7,7 +7,7 @@ function summarize(d::AbstractDocument; ns=5)
     update_lexicon!(c)
     t = tf_idf(dtm(c))
     T = t * t'
-    p=pagerank(t*t')
+    p=pagerank(T)
     return sentences[sort(sortperm(vec(p), rev=true)[1:ns])]
 end
 

diff --git a/test/corpus.jl b/test/corpus.jl
@@ -1,7 +1,5 @@
-module TestCorpus
-    using Base.Test
-    using Languages
-    using TextAnalysis
+
+@testset "Corpus" begin
 
     sample_text1 = "This is a string"
     sample_text2 = "This is also a string"
@@ -18,7 +16,7 @@ module TestCorpus
     documents(crps)
 
     for doc in crps
-    	@assert isa(doc, AbstractDocument)
+    	@test isa(doc, AbstractDocument)
     end
 
     lexicon(crps)

diff --git a/test/document.jl b/test/document.jl
@@ -1,8 +1,5 @@
-module TestDocument
-    using Base.Test
-    using Languages
-    using TextAnalysis
-    using Compat
+
+@testset "Document" begin
 
     sample_text1 = "This is a string"
     sample_text2 = "This is also a string"
@@ -13,32 +10,32 @@ module TestDocument
     td = TokenDocument(sample_text1)
     ngd = NGramDocument(sample_text1)
 
-    @assert isequal(text(sd), sample_text1)
+    @test isequal(text(sd), sample_text1)
     text!(sd, sample_text2)
-    @assert isequal(text(sd), sample_text2)
+    @test isequal(text(sd), sample_text2)
     text!(sd, sample_text1)
-    @assert isequal(text(sd), sample_text1)
+    @test isequal(text(sd), sample_text1)
 
-    @assert all(tokens(sd) .== ["This", "is", "a", "string"])
-    @assert "This" in keys(ngrams(sd, 1))
-    @assert "is" in keys(ngrams(sd, 1))
-    @assert "a" in keys(ngrams(sd, 1))
-    @assert "string" in keys(ngrams(sd, 1))
+    @test all(tokens(sd) .== ["This", "is", "a", "string"])
+    @test "This" in keys(ngrams(sd, 1))
+    @test "is" in keys(ngrams(sd, 1))
+    @test "a" in keys(ngrams(sd, 1))
+    @test "string" in keys(ngrams(sd, 1))
 
-    @assert length(sd) == 16
+    @test length(sd) == 16
 
     hamlet_text = "To be or not to be..."
     sd = StringDocument(hamlet_text)
-    @assert isa(sd, StringDocument)
-    @assert isequal(text(sd), hamlet_text)
+    @test isa(sd, StringDocument)
+    @test isequal(text(sd), hamlet_text)
 
-    @assert isa(fd, FileDocument)
-    @assert length(text(fd)) > 0
+    @test isa(fd, FileDocument)
+    @test length(text(fd)) > 0
 
     my_tokens = ["To", "be", "or", "not", "to", "be..."]
     td = TokenDocument(my_tokens)
-    @assert isa(td, TokenDocument)
-    @assert all(tokens(td) .== my_tokens)
+    @test isa(td, TokenDocument)
+    @test all(tokens(td) .== my_tokens)
 
     my_ngrams = Dict{String,Int}()
     my_ngrams["To"] = 1
@@ -48,24 +45,24 @@ module TestDocument
     my_ngrams["to"] = 1
     my_ngrams["be..."] = 1
     ngd = NGramDocument(my_ngrams)
-    @assert isa(ngd, NGramDocument)
-    @assert "To" in keys(ngrams(ngd))
+    @test isa(ngd, NGramDocument)
+    @test "To" in keys(ngrams(ngd))
 
     sd = StringDocument(hamlet_text)
     td = TokenDocument(hamlet_text)
     ngd = NGramDocument(hamlet_text)
 
     d = Document("To be or not to be...")
-    @assert isa(d, StringDocument)
+    @test isa(d, StringDocument)
     d = Document(joinpath(dirname(@__FILE__), "data", "poem.txt"))
-    @assert isa(d, FileDocument)
+    @test isa(d, FileDocument)
     d = Document(["To", "be", "or", "not", "to", "be..."])
-    @assert isa(d, TokenDocument)
+    @test isa(d, TokenDocument)
     ng = Dict{String,Int}()
     ng["a"] = 1
     ng["b"] = 3
     d = Document(ng)
-    @assert isa(d, NGramDocument)
+    @test isa(d, NGramDocument)
 
-    @assert isequal(length(Document("this is text")), 12)
+    @test isequal(length(Document("this is text")), 12)
 end
diff --git a/test/dtm.jl b/test/dtm.jl
@@ -1,8 +1,5 @@
-module TestDTM
-    using Base.Test
-    using Languages
-    using TextAnalysis
 
+@testset "DTM" begin
     sample_file = joinpath(dirname(@__FILE__), "data", "poem.txt")
 
     fd = FileDocument(sample_file)

diff --git a/test/lda.jl b/test/lda.jl
@@ -1,8 +1,6 @@
-module TestLDA
-    using Base.Test
-    using Languages
-    using TextAnalysis
 
+@testset "LDA" begin
+
     doc1 = "a a a sample text text"
     doc2 = "another example example text text"
 

diff --git a/test/metadata.jl b/test/metadata.jl
@@ -1,19 +1,17 @@
-module TestMetadata
-    using Base.Test
-    using Languages
-    using TextAnalysis
+
+@testset "Metadata" begin
 
     sample_text1 = "This is a string"
     sample_text2 = "This is also a string"
     sample_file = joinpath(dirname(@__FILE__), "data", "poem.txt")
 
     sd = StringDocument(sample_text1)
 
-    @assert isequal(name(sd), "Unnamed Document")
-    @assert isequal(language(sd), Languages.English())
-    @assert isequal(author(sd), "Unknown Author")
-    @assert isequal(timestamp(sd), "Unknown Time")
+    @test isequal(name(sd), "Unnamed Document")
+    @test isequal(language(sd), Languages.English())
+    @test isequal(author(sd), "Unknown Author")
+    @test isequal(timestamp(sd), "Unknown Time")
 
     language!(sd, Languages.German())
-    @assert isequal(language(sd), Languages.German())
+    @test isequal(language(sd), Languages.German())
 end
diff --git a/test/ngramizer.jl b/test/ngramizer.jl
@@ -1,19 +1,16 @@
-module TestNGramizer
-    using Base.Test
-    using Languages
-    using TextAnalysis
-    using Compat
+
+@testset "NGramizer" begin
 
     sample_text = "this is some sample text"
     tkns = TextAnalysis.tokenize(Languages.English(), sample_text)
     ngs = TextAnalysis.ngramize(Languages.English(), tkns, 1)
-    @assert isequal(ngs, Dict{String,Int}("some" => 1,
+    @test isequal(ngs, Dict{String,Int}("some" => 1,
     	                                     "this" => 1,
     	                                     "is" => 1,
     	                                     "sample" => 1,
     	                                     "text" => 1))
     ngs = TextAnalysis.ngramize(Languages.English(), tkns, 2)
-    @assert isequal(ngs, Dict{String,Int}("some" => 1,
+    @test isequal(ngs, Dict{String,Int}("some" => 1,
                                              "this is" => 1,
                                              "some sample" => 1,
                                              "is some" => 1,

diff --git a/test/preprocessing.jl b/test/preprocessing.jl
@@ -1,7 +1,5 @@
-module TestPreprocessing
-    using Base.Test
-    using Languages
-    using TextAnalysis
+
+@testset "Preprocessing" begin
 
     sample_text1 = "This is 1 MESSED UP string!"
     sample_text1_wo_punctuation = "This is 1 MESSED UP string"
@@ -22,50 +20,50 @@ module TestPreprocessing
             sd,
             strip_punctuation | strip_numbers | strip_case | strip_whitespace
         )
-        @assert isequal(strip(sd.text), "this is messed up string")
+        @test isequal(strip(sd.text), "this is messed up string")
     end
 
     # Need to only remove words at word boundaries
     doc = Document("this is sample text")
     remove_words!(doc, ["sample"])
-    @assert isequal(doc.text, "this is   text")
+    @test isequal(doc.text, "this is   text")
 
     doc = Document("this is sample text")
     prepare!(doc, strip_articles)
-    @assert isequal(doc.text, "this is sample text")
+    @test isequal(doc.text, "this is sample text")
 
     doc = Document("this is sample text")
     prepare!(doc, strip_definite_articles)
-    @assert isequal(doc.text, "this is sample text")
+    @test isequal(doc.text, "this is sample text")
 
     doc = Document("this is sample text")
     prepare!(doc, strip_indefinite_articles)
-    @assert isequal(doc.text, "this is sample text")
+    @test isequal(doc.text, "this is sample text")
 
     doc = Document("this is sample text")
     prepare!(doc, strip_prepositions)
-    @assert isequal(doc.text, "this is sample text")
+    @test isequal(doc.text, "this is sample text")
 
     doc = Document("this is sample text")
     prepare!(doc, strip_pronouns)
-    @assert isequal(doc.text, "this is sample text")
+    @test isequal(doc.text, "this is sample text")
 
     doc = Document("this is sample text")
     prepare!(doc, strip_stopwords)
-    @assert isequal(strip(doc.text), "sample text")
+    @test isequal(strip(doc.text), "sample text")
 
     doc = Document("this is sample text")
     prepare!(doc, strip_whitespace)
-    @assert isequal(doc.text, "this is sample text")
+    @test isequal(doc.text, "this is sample text")
 
     # stem!(sd)
     # tag_pos!(sd)
 
     # Do preprocessing on TokenDocument, NGramDocument, Corpus
     d = NGramDocument("this is sample text")
-    @assert haskey(d.ngrams, "sample")
+    @test haskey(d.ngrams, "sample")
     remove_words!(d, ["sample"])
-    @assert !haskey(d.ngrams, "sample")
+    @test !haskey(d.ngrams, "sample")
 
     d = StringDocument(
         """
@@ -80,7 +78,7 @@ module TestPreprocessing
         """
     )
     remove_html_tags!(d)
-    @assert "Hello world" == strip(d.text)
+    @test "Hello world" == strip(d.text)
 
     #Test #62
     remove_corrupt_utf8("abc") == "abc"

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1,27 +1,29 @@
 module TestTextAnalysis
-    using Base.Test
-    using Languages
-    using TextAnalysis
+using Base.Test
+using Languages
+using TextAnalysis
+using Compat
 
-    my_tests = [
-        "tokenizer.jl",
-        "ngramizer.jl",
-        "document.jl",
-        "metadata.jl",
-        "corpus.jl",
-        "preprocessing.jl",
-        "dtm.jl",
-        "stemmer.jl",
-        "tf_idf.jl",
-        "lda.jl",
-        "summarizer.jl",
-        "sentiment.jl"
-    ]
 
-    println("Running tests:")
+# @testset "TextAnalysis" begin
 
-    for my_test in my_tests
-        println(" * $(my_test)")
-        include(my_test)
-    end
+println("Running tests:")
+println(typeof(Compat.String))
+
+include("tokenizer.jl")
+include("ngramizer.jl")
+include("document.jl")
+include("metadata.jl")
+include("corpus.jl")
+include("preprocessing.jl")
+include("dtm.jl")
+include("stemmer.jl")
+include("tf_idf.jl")
+include("lda.jl")
+include("summarizer.jl")
+include("sentiment.jl")
+
+
+
+# end
 end
diff --git a/test/sentiment.jl b/test/sentiment.jl
@@ -1,9 +1,11 @@
+@testset "Sentiment" begin
 m = SentimentAnalyzer()
 
-d=StringDocument("a very nice thing that everyone likes")
+d = StringDocument("a very nice thing that everyone likes")
 
 m(d) > 0.5
 
-d=StringDocument("a horrible thing that everyone hates")
+d = StringDocument("a horrible thing that everyone hates")
 
 m(d) < 0.5
+end
diff --git a/test/stemmer.jl b/test/stemmer.jl
@@ -1,8 +1,5 @@
-module TestStemmer
-    using Base.Test
-    using Languages
-    using TextAnalysis
-    using Compat
+
+@testset "Stemmer" begin
 
     algs = stemmer_types()
     @test !isempty(algs)

diff --git a/test/summarizer.jl b/test/summarizer.jl
@@ -1,3 +1,4 @@
+@testset "Summarizer" begin
 d = StringDocument("""
     Discount retailer Poundworld has appointed administrators, putting 5,100 jobs at risk.
     The move came after talks with a potential buyer, R Capital, collapsed leaving Poundworld with no option other than administration.
@@ -16,5 +17,6 @@ s = summarize(d)
 @test length(s) == 5
 @test s[1] == "Discount retailer Poundworld has appointed administrators, putting 5,100 jobs at risk."
 
-s=summarize(d; ns=2)
+s = summarize(d; ns=2)
 @test length(s) == 2
+end
diff --git a/test/tf_idf.jl b/test/tf_idf.jl
@@ -1,8 +1,5 @@
-module TestTFIDF
 
-    using Base.Test
-    using Languages
-    using TextAnalysis
+@testset "TFIDF" begin
 
     doc1 = "a a a sample text text"
     doc2 = "another example example text text"