From c7e22dc1beacb61580a01907c5b8f0700c85a76e Mon Sep 17 00:00:00 2001 From: Roman S Samarev Date: Wed, 7 Feb 2024 12:55:25 -0800 Subject: [PATCH 1/2] minor style fix and fix of potential errors --- src/LM/langmodel.jl | 10 +++------- src/bayes.jl | 3 +++ src/dtm.jl | 8 ++++---- test/bayes.jl | 2 ++ 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/LM/langmodel.jl b/src/LM/langmodel.jl index 6450d49c..b2294929 100644 --- a/src/LM/langmodel.jl +++ b/src/LM/langmodel.jl @@ -84,13 +84,9 @@ Add-one smoothing to Lidstone or Laplace(gammamodel) models function score(m::gammamodel, temp_lm::DefaultDict, word, context) #score for gammamodel output probabl accum = temp_lm[context] #print(accum) - s = float(sum(accum)+(m.gamma)*length(m.vocab.vocab)) - for (text, count) in accum - if text == word - return(float(count+m.gamma)/s) - end - end - return(float(m.gamma)/s) + s = float(sum(accum) + (m.gamma) * length(m.vocab.vocab)) + idx = something(findfirst(isequal(word), accum), 0) + return float(idx + m.gamma) / s end """ diff --git a/src/bayes.jl b/src/bayes.jl index 5f2a0928..4ec96346 100644 --- a/src/bayes.jl +++ b/src/bayes.jl @@ -96,6 +96,9 @@ Fit the weights for the model on the input data. """ function fit!(c::NaiveBayesClassifier, x::Features, class) n = findfirst(==(class), c.classes) + + @assert !isnothing(n) "class \"$class\" is not present in the list $(c.classes)" + c.weights[:, n] .+= x return c end diff --git a/src/dtm.jl b/src/dtm.jl index fcbf4eda..df4e7091 100644 --- a/src/dtm.jl +++ b/src/dtm.jl @@ -330,7 +330,7 @@ function prune!(dtm::DocumentTermMatrix{T}, document_positions; compact::Bool=tr end if compact - termcols_to_delete = map(x->x==0, sum(dtm_matrix, dims=(1,))) + termcols_to_delete = map(iszero, sum(dtm_matrix, dims=(1,))) if retain_terms !== nothing for idx in 1:length(termcols_to_delete) (!termcols_to_delete[idx] || !(dtm.terms[idx] in retain_terms)) && continue @@ -395,9 +395,9 @@ function merge!(dtm1::DocumentTermMatrix{T}, dtm2::DocumentTermMatrix{T}) where SparseMatrixCSC(S.m, n, colptr, S.rowval, S.nzval) end function row_append(A, B) - @assert size(A,2) == size(B,2) - (length(A) == 0) && (return B) - (length(B) == 0) && (return A) + @assert size(A, 2) == size(B, 2) + isempty(A) && return B + isempty(B) && return A C_colptr = similar(A.colptr) C_rowvals = similar(A.rowval, length(A.rowval) + length(B.rowval)) diff --git a/test/bayes.jl b/test/bayes.jl index b4d71b35..11fd7f8c 100644 --- a/test/bayes.jl +++ b/test/bayes.jl @@ -5,6 +5,8 @@ r = TextAnalysis.predict(m, "is this spam?") @test r[:spam] > r[:ham] + @test_throws AssertionError TextAnalysis.fit!(m, "this is spam", :non_spam) + n = NaiveBayesClassifier([:spam, :ham]) TextAnalysis.fit!(n, StringDocument("this is ham"), :ham) TextAnalysis.fit!(n, StringDocument("this is spam"), :spam) From 0230573685395bb0f80db6854b7505f34ca75903 Mon Sep 17 00:00:00 2001 From: Roman S Samarev Date: Wed, 7 Feb 2024 13:55:56 -0800 Subject: [PATCH 2/2] style fix with Julia Formatter. 
No functional changes --- docs/make.jl | 8 +- src/LM/api.jl | 2 +- src/LM/langmodel.jl | 94 +++++++-------- src/TextAnalysis.jl | 226 ++++++++++++++++++------------------- src/bayes.jl | 8 +- src/corpus.jl | 22 ++-- src/deprecations.jl | 12 +- src/document.jl | 22 ++-- src/dtm.jl | 40 +++---- src/lda.jl | 14 ++- src/lsa.jl | 4 +- src/metadata.jl | 4 +- src/ngramizer.jl | 13 +-- src/preprocessing.jl | 50 ++++---- src/show.jl | 2 +- src/summarizer.jl | 26 ++--- src/tagging_schemes.jl | 6 +- src/tf_idf.jl | 90 +++++++-------- src/tokenizer.jl | 4 +- test/LM.jl | 64 +++++------ test/bayes.jl | 6 +- test/corpus.jl | 6 +- test/document.jl | 26 +++-- test/dtm.jl | 34 +++--- test/evaluation_metrics.jl | 16 +-- test/ngramizer.jl | 39 ++++--- test/preprocessing.jl | 42 +++---- test/stemmer.jl | 6 +- test/summarizer.jl | 40 +++---- test/taggingschemes.jl | 20 ++-- test/tf_idf.jl | 71 ++++++------ 31 files changed, 526 insertions(+), 491 deletions(-) diff --git a/docs/make.jl b/docs/make.jl index f27612f4..663f6cb9 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,11 +1,11 @@ using Documenter, TextAnalysis makedocs( - modules = [TextAnalysis], - sitename = "TextAnalysis", - format = Documenter.HTML( + modules=[TextAnalysis], + sitename="TextAnalysis", + format=Documenter.HTML( ), - pages = [ + pages=[ "Home" => "index.md", "Documents" => "documents.md", "Corpus" => "corpus.md", diff --git a/src/LM/api.jl b/src/LM/api.jl index 066a21d7..1498e4c2 100644 --- a/src/LM/api.jl +++ b/src/LM/api.jl @@ -6,7 +6,7 @@ It is used to evaluate score with masks out of vocabulary words The arguments are the same as for [`score`](@ref) """ function maskedscore(m::Langmodel, temp_lm::DefaultDict, word, context)::Float64 - score(m, temp_lm, lookup(m.vocab, [word])[begin], lookup(m.vocab, [context])[begin]) + score(m, temp_lm, lookup(m.vocab, [word])[begin], lookup(m.vocab, [context])[begin]) end """ diff --git a/src/LM/langmodel.jl b/src/LM/langmodel.jl index b2294929..bd28b8e2 100644 --- a/src/LM/langmodel.jl +++ b/src/LM/langmodel.jl @@ -2,9 +2,9 @@ abstract type Langmodel end abstract type gammamodel <: Langmodel end #BaseNgram with Add-one smoothing algo abstract type InterpolatedLanguageModel <: Langmodel end #Interpolated language model with smoothing -#DataType MLE -#Type for providing MLE ngram model scores. -#Implementation of Base Ngram Model. +# DataType MLE +# Type for providing MLE ngram model scores. +# Implementation of Base Ngram Model. struct MLE <: Langmodel vocab::Vocabulary @@ -18,13 +18,13 @@ Initiate Type for providing MLE ngram model scores. Implementation of Base Ngram Model. """ -function MLE(word::Vector{T}, unk_cutoff=1, unk_label="") where {T <: AbstractString} +function MLE(word::Vector{T}, unk_cutoff=1, unk_label="") where {T<:AbstractString} MLE(Vocabulary(word, unk_cutoff, unk_label)) end -function (lm::MLE)(text::Vector{T}, min::Integer, max::Integer) where {T <: AbstractString} +function (lm::MLE)(text::Vector{T}, min::Integer, max::Integer) where {T<:AbstractString} text = lookup(lm.vocab, text) - text=convert(Array{String}, text) + text = convert(Array{String}, text) return counter2(text, min, max) end @@ -41,18 +41,19 @@ Function to initiate Type(Lidstone) for providing Lidstone-smoothed scores. In addition to initialization arguments from BaseNgramModel also requires a number by which to increase the counts, gamma. 
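Reviewer note: as a point of reference for the Lidstone hunk above, a minimal usage sketch; it mirrors the calls exercised in `test/LM.jl` later in this patch, and the vocabulary/training tokens are purely illustrative.

```julia
using TextAnalysis

voc   = ["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"]
train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"]

model = Lidstone(voc, 1.0)        # gamma = 1.0, i.e. add-one style smoothing
fit   = model(train, 2, 2)        # counts of bigrams only
score(model, fit, "is", "alien")  # 0.1 in this patch's test suite (unseen context)
```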
""" -function Lidstone(word::Vector{T}, gamma = 1.0, unk_cutoff=1, unk_label="") where {T <: AbstractString} +function Lidstone(word::Vector{T}, gamma=1.0, unk_cutoff=1, unk_label="") where {T<:AbstractString} Lidstone(Vocabulary(word, unk_cutoff, unk_label), gamma) end -function (lm::Lidstone)(text::Vector{T}, min::Integer, max::Integer) where {T <: AbstractString} +function (lm::Lidstone)(text::Vector{T}, min::Integer, max::Integer) where {T<:AbstractString} text = lookup(lm.vocab, text) - text=convert(Array{String}, text) + text = convert(Array{String}, text) return counter2(text, min, max) end """ Laplace(word::Vector{T}, unk_cutoff=1, unk_label="") where {T <: AbstractString} + Function to initiate Type(Laplace) for providing Laplace-smoothed scores. In addition to initialization arguments from BaseNgramModel also requires @@ -63,11 +64,11 @@ struct Laplace <: gammamodel gamma::Float64 end -function Laplace(word::Vector{T}, unk_cutoff=1, unk_label="") where {T <: AbstractString} +function Laplace(word::Vector{T}, unk_cutoff=1, unk_label="") where {T<:AbstractString} Laplace(Vocabulary(word, unk_cutoff, unk_label), 1.0) end -function (lm::Laplace)(text, min::Integer, max::Integer) +function (lm::Laplace)(text, min::Integer, max::Integer) text = lookup(lm.vocab, text) text = convert(Array{String}, text) return counter2(text, min, max) @@ -90,25 +91,26 @@ function score(m::gammamodel, temp_lm::DefaultDict, word, context) #score for ga end """ +$(TYPEDSIGNATURES) + To get probability of word given that context In other words, for given context calculate frequency distribution of word - """ function prob(m::Langmodel, templ_lm::DefaultDict, word, context=nothing)::Float64 - (isnothing(context) || isempty(context)) && return 1.0/length(templ_lm) #provide distribution + (isnothing(context) || isempty(context)) && return 1.0 / length(templ_lm) #provide distribution accum = templ_lm[context] - s = float(sum(accum)) + s = float(sum(accum)) for (text, count) in accum if text == word - return(float(count) / s) + return (float(count) / s) end end if context in keys(m.vocab.vocab) return 0.0 end - return(Inf) + return (Inf) end """ @@ -121,8 +123,8 @@ function score(m::MLE, temp_lm::DefaultDict, word, context=nothing) prob(m, temp_lm, word, context) end -struct WittenBellInterpolated <: InterpolatedLanguageModel - vocab ::Vocabulary +struct WittenBellInterpolated <: InterpolatedLanguageModel + vocab::Vocabulary end """ @@ -133,13 +135,13 @@ Initiate Type for providing Interpolated version of Witten-Bell smoothing. The idea to abstract this comes from Chen & Goodman 1995. 
""" -function WittenBellInterpolated(word::Vector{T}, unk_cutoff=1, unk_label="") where {T <: AbstractString} +function WittenBellInterpolated(word::Vector{T}, unk_cutoff=1, unk_label="") where {T<:AbstractString} WittenBellInterpolated(Vocabulary(word, unk_cutoff, unk_label)) end -function (lm::WittenBellInterpolated)(text::Vector{T}, min::Integer, max::Integer) where {T <: AbstractString} +function (lm::WittenBellInterpolated)(text::Vector{T}, min::Integer, max::Integer) where {T<:AbstractString} text = lookup(lm.vocab, text) - text=convert(Array{String}, text) + text = convert(Array{String}, text) return counter2(text, min, max) end # alpha_gamma function for KneserNeyInterpolated @@ -147,27 +149,27 @@ function alpha_gammma(m::WittenBellInterpolated, templ_lm::DefaultDict, word, co local alpha local gam accum = templ_lm[context] - s = float(sum(accum)) - for (text,count) in accum + s = float(sum(accum)) + for (text, count) in accum if text == word - alpha=(float(count) / s) - break + alpha = (float(count) / s) + break else - alpha = 1/s + alpha = 1 / s end end - + gam = gamma(accum) - return alpha*(1- gam), gam + return alpha * (1 - gam), gam end function count_non_zero_vals(accum::Accumulator{}) - return(length(accum)) + return (length(accum)) end - + function gamma(accum) - nplus=count_non_zero_vals(accum) - return(nplus/(nplus+float(sum(accum)))) + nplus = count_non_zero_vals(accum) + return (nplus / (nplus + float(sum(accum)))) end """ @@ -183,20 +185,20 @@ function score(m::InterpolatedLanguageModel, temp_lm::DefaultDict, word, context (isnothing(context) || isempty(context)) && return prob(m, temp_lm, word) if context in keys(temp_lm) - alpha,gamma = alpha_gammma(m, temp_lm, word, context) - return (alpha + gamma*score(m, temp_lm, word, context_reduce(context))) + alpha, gamma = alpha_gammma(m, temp_lm, word, context) + return (alpha + gamma * score(m, temp_lm, word, context_reduce(context))) else return score(m, temp_lm, word, context_reduce(context)) end end - + function context_reduce(context) context = split(context) join(context[2:end], " ") end -struct KneserNeyInterpolated <: InterpolatedLanguageModel +struct KneserNeyInterpolated <: InterpolatedLanguageModel vocab::Vocabulary discount::Float64 end @@ -209,29 +211,29 @@ Initiate Type for providing KneserNey Interpolated language model. The idea to abstract this comes from Chen & Goodman 1995. 
""" -function KneserNeyInterpolated(word::Vector{T}, disc = 0.1, unk_cutoff=1, unk_label="") where {T <: AbstractString} - KneserNeyInterpolated(Vocabulary(word, unk_cutoff, unk_label) ,disc) +function KneserNeyInterpolated(word::Vector{T}, disc=0.1, unk_cutoff=1, unk_label="") where {T<:AbstractString} + KneserNeyInterpolated(Vocabulary(word, unk_cutoff, unk_label), disc) end -function (lm::KneserNeyInterpolated)(text::Vector{T}, min::Integer, max::Integer) where {T <: AbstractString} +function (lm::KneserNeyInterpolated)(text::Vector{T}, min::Integer, max::Integer) where {T<:AbstractString} text = lookup(lm.vocab, text) - text=convert(Array{String}, text) + text = convert(Array{String}, text) return counter2(text, min, max) end # alpha_gamma function for KneserNeyInterpolated function alpha_gammma(m::KneserNeyInterpolated, templ_lm::DefaultDict, word, context) local alpha - local gamma + local gamma accum = templ_lm[context] - s = float(sum(accum)) + s = float(sum(accum)) for (text, count) in accum if text == word - alpha=(max(float(count)-m.discount, 0.0) / s) - break + alpha = (max(float(count) - m.discount, 0.0) / s) + break else - alpha = 1/length(m.vocab.vocab) + alpha = 1 / length(m.vocab.vocab) end end - gamma = (m.discount * count_non_zero_vals(accum) /s) + gamma = (m.discount * count_non_zero_vals(accum) / s) return alpha, gamma end diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl index 53c1470d..2d763b6b 100644 --- a/src/TextAnalysis.jl +++ b/src/TextAnalysis.jl @@ -1,115 +1,115 @@ module TextAnalysis - using SparseArrays - using Printf - using LinearAlgebra - using StatsBase: countmap,addcounts! - using Languages - using WordTokenizers - using Snowball - - using Tables - using DataStructures - using Statistics - using Serialization - using ProgressMeter - using DocStringExtensions - - import Base: depwarn, merge! - import Serialization: serialize, deserialize - - export AbstractDocument, Document - export FileDocument, StringDocument, TokenDocument, NGramDocument - export GenericDocument - export Corpus, DirectoryCorpus - export stemmer_types, Stemmer - export DocumentTermMatrix - export text, tokens, ngrams - export text!, tokens!, ngrams! - export documents - export language, title, author, timestamp - export languages, titles, authors, timestamps - export language!, title!, author!, timestamp! - export languages!, titles!, authors!, timestamps! - export ngram_complexity - export lexicon, update_lexicon!, lexical_frequency, lexicon_size - export inverse_index, update_inverse_index!, index_size - export remove_corrupt_utf8 - export remove_corrupt_utf8! - export remove_case - export remove_case! - export remove_words, remove_stop_words - export remove_words!, remove_stop_words! - export stem, tag_pos - export stem!, tag_pos! - export remove_html_tags, remove_html_tags! - export prepare! - export frequent_terms, sparse_terms - export remove_frequent_terms!, remove_sparse_terms! - export dtv, each_dtv, dtm, tdm - export TextHashFunction, index_hash, cardinality, hash_function, hash_function! - export hash_dtv, each_hash_dtv, hash_dtm, hash_tdm - export CooMatrix, coom - export standardize! - export tf, tf_idf, bm_25, lsa, lda, summarize, cos_similarity - export tf!, tf_idf!, bm_25!, lda! - export remove_patterns!, remove_patterns - export prune! 
- - export strip_patterns, strip_corrupt_utf8, strip_case, stem_words, tag_part_of_speech, strip_whitespace, strip_punctuation - export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles - export strip_prepositions, strip_pronouns, strip_stopwords, strip_sparse_terms, strip_frequent_terms, strip_html_tags - - export NaiveBayesClassifier - export tag_scheme! - - export rouge_l_summary, rouge_l_sentence, rouge_n, Score, average, argmax - export bleu_score - - export PerceptronTagger, fit!, predict - - export Vocabulary, lookup, update - export everygram, padding_ngram - export maskedscore, logscore, entropy, perplexity - export MLE, Lidstone, Laplace, WittenBellInterpolated, KneserNeyInterpolated, score - - export tokenize #imported from WordTokenizers - - include("tokenizer.jl") - include("ngramizer.jl") - include("document.jl") - include("hash.jl") - include("corpus.jl") - include("metadata.jl") - include("preprocessing.jl") - - include("stemmer.jl") - include("dtm.jl") - include("tf_idf.jl") - include("lsa.jl") - include("lda.jl") - include("summarizer.jl") - include("show.jl") - include("bayes.jl") - include("deprecations.jl") - include("tagging_schemes.jl") - include("utils.jl") - - include("evaluation_metrics.jl") - include("translate_evaluation/bleu_score.jl") - include("coom.jl") - - - - # Lang_model - include("LM/vocab.jl") - include("LM/langmodel.jl") - include("LM/api.jl") - include("LM/counter.jl") - include("LM/preprocessing.jl") - - - - function __init__() - - end +using SparseArrays +using Printf +using LinearAlgebra +using StatsBase: countmap, addcounts! +using Languages +using WordTokenizers +using Snowball + +using Tables +using DataStructures +using Statistics +using Serialization +using ProgressMeter +using DocStringExtensions + +import Base: depwarn, merge! +import Serialization: serialize, deserialize + +export AbstractDocument, Document +export FileDocument, StringDocument, TokenDocument, NGramDocument +export GenericDocument +export Corpus, DirectoryCorpus +export stemmer_types, Stemmer +export DocumentTermMatrix +export text, tokens, ngrams +export text!, tokens!, ngrams! +export documents +export language, title, author, timestamp +export languages, titles, authors, timestamps +export language!, title!, author!, timestamp! +export languages!, titles!, authors!, timestamps! +export ngram_complexity +export lexicon, update_lexicon!, lexical_frequency, lexicon_size +export inverse_index, update_inverse_index!, index_size +export remove_corrupt_utf8 +export remove_corrupt_utf8! +export remove_case +export remove_case! +export remove_words, remove_stop_words +export remove_words!, remove_stop_words! +export stem, tag_pos +export stem!, tag_pos! +export remove_html_tags, remove_html_tags! +export prepare! +export frequent_terms, sparse_terms +export remove_frequent_terms!, remove_sparse_terms! +export dtv, each_dtv, dtm, tdm +export TextHashFunction, index_hash, cardinality, hash_function, hash_function! +export hash_dtv, each_hash_dtv, hash_dtm, hash_tdm +export CooMatrix, coom +export standardize! +export tf, tf_idf, bm_25, lsa, lda, summarize, cos_similarity +export tf!, tf_idf!, bm_25!, lda! +export remove_patterns!, remove_patterns +export prune! 
+ +export strip_patterns, strip_corrupt_utf8, strip_case, stem_words, tag_part_of_speech, strip_whitespace, strip_punctuation +export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles +export strip_prepositions, strip_pronouns, strip_stopwords, strip_sparse_terms, strip_frequent_terms, strip_html_tags + +export NaiveBayesClassifier +export tag_scheme! + +export rouge_l_summary, rouge_l_sentence, rouge_n, Score, average, argmax +export bleu_score + +export PerceptronTagger, fit!, predict + +export Vocabulary, lookup, update +export everygram, padding_ngram +export maskedscore, logscore, entropy, perplexity +export MLE, Lidstone, Laplace, WittenBellInterpolated, KneserNeyInterpolated, score + +export tokenize #imported from WordTokenizers + +include("tokenizer.jl") +include("ngramizer.jl") +include("document.jl") +include("hash.jl") +include("corpus.jl") +include("metadata.jl") +include("preprocessing.jl") + +include("stemmer.jl") +include("dtm.jl") +include("tf_idf.jl") +include("lsa.jl") +include("lda.jl") +include("summarizer.jl") +include("show.jl") +include("bayes.jl") +include("deprecations.jl") +include("tagging_schemes.jl") +include("utils.jl") + +include("evaluation_metrics.jl") +include("translate_evaluation/bleu_score.jl") +include("coom.jl") + + + +# Lang_model +include("LM/vocab.jl") +include("LM/langmodel.jl") +include("LM/api.jl") +include("LM/counter.jl") +include("LM/preprocessing.jl") + + + +function __init__() + +end end diff --git a/src/bayes.jl b/src/bayes.jl index 4ec96346..3a2f58c1 100644 --- a/src/bayes.jl +++ b/src/bayes.jl @@ -2,7 +2,7 @@ using WordTokenizers export NaiveBayesClassifier -simpleTokenise(s) = WordTokenizers.tokenize(lowercase(replace(s, "."=>""))) +simpleTokenise(s) = WordTokenizers.tokenize(lowercase(replace(s, "." => ""))) """ $(TYPEDSIGNATURES) @@ -70,11 +70,11 @@ Dict{Symbol, Float64} with 2 entries: """ NaiveBayesClassifier(dict, classes) = NaiveBayesClassifier(dict, classes, - ones(Int, length(dict), length(classes))) + ones(Int, length(dict), length(classes))) NaiveBayesClassifier(classes) = NaiveBayesClassifier(String[], classes) -probabilities(c::NaiveBayesClassifier) = c.weights ./ sum(c.weights, dims = 1) +probabilities(c::NaiveBayesClassifier) = c.weights ./ sum(c.weights, dims=1) """ extend!(model::NaiveBayesClassifier, dictElement) @@ -121,7 +121,7 @@ fit!(c::NaiveBayesClassifier, s::String, class) = fit!(c, StringDocument(s), cla Predict probabilities for each class on the input Features or String. 
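Reviewer note: to make the classifier flow in this file easier to follow, a small end-to-end sketch consistent with `test/bayes.jl` in this patch; the training strings are illustrative.

```julia
using TextAnalysis

m = NaiveBayesClassifier([:spam, :ham])
fit!(m, "this is ham", :ham)
fit!(m, "this is spam", :spam)

r = predict(m, "is this spam?")  # Dict of class => probability
r[:spam] > r[:ham]               # true

# With the assertion added in the first commit, an unknown class now fails loudly:
# fit!(m, "this is spam", :non_spam)   # throws AssertionError
```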
""" function predict(c::NaiveBayesClassifier, x::Features) - ps = prod(probabilities(c) .^ x, dims = 1) + ps = prod(probabilities(c) .^ x, dims=1) ps ./= sum(ps) Dict(c.classes[i] => ps[i] for i = 1:length(c.classes)) end diff --git a/src/corpus.jl b/src/corpus.jl index 926bac36..3190b553 100644 --- a/src/corpus.jl +++ b/src/corpus.jl @@ -1,8 +1,8 @@ -mutable struct Corpus{T <: AbstractDocument} +mutable struct Corpus{T<:AbstractDocument} documents::Vector{T} total_terms::Int - lexicon::Dict{String, Int} - inverse_index::Dict{String, Vector{Int}} + lexicon::Dict{String,Int} + inverse_index::Dict{String,Vector{Int}} h::TextHashFunction end @@ -25,12 +25,12 @@ Corpus's lexicon contains 0 tokens Corpus's index contains 0 tokens ``` """ -function Corpus(docs::Vector{T}) where {T <: AbstractDocument} +function Corpus(docs::Vector{T}) where {T<:AbstractDocument} Corpus( docs, 0, - Dict{String, Int}(), - Dict{String, Vector{Int}}(), + Dict{String,Int}(), + Dict{String,Vector{Int}}(), TextHashFunction() ) end @@ -93,7 +93,7 @@ Tables.getcolumn(d::AbstractDocument, i::Int) = Tables.getcolumn(d, Tables.colum Tables.isrowtable(x::Corpus) = true Tables.rows(x::Corpus) = x -Tables.schema(x::Corpus) = Tables.Schema((:Language, :Title, :Author, :Timestamp, :Length, :Text), (Union{String, Missing}, Union{String, Missing}, Union{String, Missing}, Union{String, Missing}, Union{Int, Missing}, Union{String, Missing})) +Tables.schema(x::Corpus) = Tables.Schema((:Language, :Title, :Author, :Timestamp, :Length, :Text), (Union{String,Missing}, Union{String,Missing}, Union{String,Missing}, Union{String,Missing}, Union{Int,Missing}, Union{String,Missing})) ############################################################################## # @@ -103,7 +103,7 @@ Tables.schema(x::Corpus) = Tables.Schema((:Language, :Title, :Author, :Timestamp function Base.iterate(crps::Corpus, ind=1) ind > length(crps.documents) && return nothing - crps.documents[ind], ind+1 + crps.documents[ind], ind + 1 end ############################################################################## @@ -133,7 +133,7 @@ Base.delete!(crps::Corpus, index::Integer) = delete!(crps.documents, index) ############################################################################## Base.getindex(crps::Corpus, ind::Real) = crps.documents[ind] -Base.getindex(crps::Corpus, inds::Vector{T}) where {T <: Real} = crps.documents[inds] +Base.getindex(crps::Corpus, inds::Vector{T}) where {T<:Real} = crps.documents[inds] Base.getindex(crps::Corpus, r::AbstractRange) = crps.documents[r] Base.getindex(crps::Corpus, term::AbstractString) = get(crps.inverse_index, term, Int[]) @@ -230,7 +230,7 @@ contain that term. The inverse index tells us this and therefore provides a simp inverse_index(crps::Corpus) = crps.inverse_index function update_inverse_index!(crps::Corpus) - idx = Dict{String, Array{Int, 1}}() + idx = Dict{String,Array{Int,1}}() for i in 1:length(crps) doc = crps.documents[i] ngram_arr = isa(doc, NGramDocument) ? 
collect(keys(ngrams(doc))) : tokens(doc) @@ -293,7 +293,7 @@ Corpus's lexicon contains 0 tokens Corpus's index contains 0 tokens ``` """ -function standardize!(crps::Corpus, ::Type{T}) where T <: AbstractDocument +function standardize!(crps::Corpus, ::Type{T}) where {T<:AbstractDocument} for i in 1:length(crps) crps.documents[i] = convert(T, crps.documents[i]) end diff --git a/src/deprecations.jl b/src/deprecations.jl index 483f0661..8bc09b26 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -1,18 +1,18 @@ ## Deprecations for Languages -function tokenize(::Type{S}, s::T) where {S <: Language, T <: AbstractString} - depwarn("Use of Languages as types is deprecated. Use instances.", Symbol(S)) +function tokenize(::Type{S}, s::T) where {S<:Language,T<:AbstractString} + depwarn("Use of Languages as types is deprecated. Use instances.", Symbol(S)) tokenize(S(), s) end -function ngramize(::Type{S}, words::Vector{T}, n::Int) where {S <: Language, T <: AbstractString} - depwarn("Use of Languages as types is deprecated. Use instances.", Symbol(S)) +function ngramize(::Type{S}, words::Vector{T}, n::Int) where {S<:Language,T<:AbstractString} + depwarn("Use of Languages as types is deprecated. Use instances.", Symbol(S)) ngramize(S(), words, n) end -function onegramize(::Type{S}, words::Vector{T}) where {S <: Language, T <: AbstractString} - depwarn("Use of Languages as types is deprecated. Use instances.", Symbol(S)) +function onegramize(::Type{S}, words::Vector{T}) where {S<:Language,T<:AbstractString} + depwarn("Use of Languages as types is deprecated. Use instances.", Symbol(S)) onegramize(S(), words) end diff --git a/src/document.jl b/src/document.jl index 0d05c19a..de27920a 100644 --- a/src/document.jl +++ b/src/document.jl @@ -46,7 +46,7 @@ end # ############################################################################## -abstract type AbstractDocument; end +abstract type AbstractDocument end mutable struct FileDocument <: AbstractDocument @@ -142,7 +142,7 @@ A TokenDocument{String} function TokenDocument(txt::AbstractString, dm::DocumentMetadata) TokenDocument(tokenize(dm.language, String(txt)), dm) end -function TokenDocument(tkns::Vector{T}) where T <: AbstractString +function TokenDocument(tkns::Vector{T}) where {T<:AbstractString} TokenDocument(tkns, DocumentMetadata()) end TokenDocument(txt::AbstractString) = TokenDocument(String(txt), DocumentMetadata()) @@ -189,7 +189,7 @@ end function NGramDocument(txt::AbstractString, n::Integer...=1) NGramDocument(txt, DocumentMetadata(), n...) end -function NGramDocument(ng::Dict{T, Int}, n::Integer...=1) where T <: AbstractString +function NGramDocument(ng::Dict{T,Int}, n::Integer...=1) where {T<:AbstractString} NGramDocument(merge(Dict{AbstractString,Int}(), ng), (length(n) == 1) ? Int(first(n)) : Int[n...], DocumentMetadata()) end @@ -270,14 +270,14 @@ julia> tokens(sd) "." 
``` """ -tokens(d::(Union{FileDocument, StringDocument})) = tokenize(language(d), text(d)) +tokens(d::(Union{FileDocument,StringDocument})) = tokenize(language(d), text(d)) tokens(d::TokenDocument) = d.tokens function tokens(d::NGramDocument) error("The tokens of an NGramDocument cannot be reconstructed") end -tokens!(d::TokenDocument, new_tokens::Vector{T}) where {T <: AbstractString} = (d.tokens = new_tokens) -function tokens!(d::AbstractDocument, new_tokens::Vector{T}) where T <: AbstractString +tokens!(d::TokenDocument, new_tokens::Vector{T}) where {T<:AbstractString} = (d.tokens = new_tokens) +function tokens!(d::AbstractDocument, new_tokens::Vector{T}) where {T<:AbstractString} error("The tokens of a $(typeof(d)) cannot be directly edited") end @@ -322,7 +322,7 @@ ngrams(d::AbstractDocument, n::Integer...) = ngramize(language(d), tokens(d), n. ngrams(d::NGramDocument) = d.ngrams ngrams(d::AbstractDocument) = ngrams(d, 1) -ngrams!(d::NGramDocument, new_ngrams::Dict{AbstractString, Int}) = (d.ngrams = new_ngrams) +ngrams!(d::NGramDocument, new_ngrams::Dict{AbstractString,Int}) = (d.ngrams = new_ngrams) function ngrams!(d::AbstractDocument, new_ngrams::Dict) error("The n-grams of $(typeof(d)) cannot be directly edited") end @@ -371,8 +371,8 @@ const GenericDocument = Union{ ############################################################################## Document(str::AbstractString) = isfile(str) ? FileDocument(str) : StringDocument(str) -Document(tkns::Vector{T}) where {T <: AbstractString} = TokenDocument(tkns) -Document(ng::Dict{String, Int}) = NGramDocument(ng) +Document(tkns::Vector{T}) where {T<:AbstractString} = TokenDocument(tkns) +Document(ng::Dict{String,Int}) = NGramDocument(ng) ############################################################################## # @@ -383,11 +383,11 @@ Document(ng::Dict{String, Int}) = NGramDocument(ng) function Base.convert(::Type{StringDocument}, d::FileDocument) StringDocument(text(d), d.metadata) end -function Base.convert(::Type{TokenDocument}, d::(Union{FileDocument, StringDocument})) +function Base.convert(::Type{TokenDocument}, d::(Union{FileDocument,StringDocument})) TokenDocument(tokens(d), d.metadata) end function Base.convert(::Type{NGramDocument}, - d::(Union{FileDocument, StringDocument, TokenDocument})) + d::(Union{FileDocument,StringDocument,TokenDocument})) NGramDocument(ngrams(d), 1, d.metadata) end Base.convert(::Type{TokenDocument}, d::TokenDocument) = d diff --git a/src/dtm.jl b/src/dtm.jl index df4e7091..c60a3561 100644 --- a/src/dtm.jl +++ b/src/dtm.jl @@ -1,7 +1,7 @@ mutable struct DocumentTermMatrix{T} - dtm::SparseMatrixCSC{Int, Int} + dtm::SparseMatrixCSC{Int,Int} terms::Vector{T} - column_indices::Dict{T, Int} + column_indices::Dict{T,Int} end function serialize(io::AbstractSerializer, dtm::DocumentTermMatrix{T}) where {T} @@ -15,7 +15,7 @@ end function deserialize(io::AbstractSerializer, ::Type{DocumentTermMatrix{T}}) where {T} dtm = deserialize(io) terms = deserialize(io) - column_indices = Dict{T,Int}(term => idx for (idx,term) in enumerate(terms)) + column_indices = Dict{T,Int}(term => idx for (idx, term) in enumerate(terms)) DocumentTermMatrix{T}(dtm, terms, column_indices) end @@ -24,8 +24,8 @@ end Creates a column index lookup dictionary from a vector of terms. 
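Reviewer note: a tiny illustration of what this helper returns (it is internal, so it is qualified with the module name; the terms are illustrative).

```julia
using TextAnalysis

TextAnalysis.columnindices(["a", "b", "c"])
# a Dict mapping each term to its column: "a" => 1, "b" => 2, "c" => 3
```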
""" -function columnindices(terms::Vector{T}) where T - column_indices = Dict{T, Int}() +function columnindices(terms::Vector{T}) where {T} + column_indices = Dict{T,Int}() for i in 1:length(terms) term = terms[i] column_indices[term] = i @@ -68,7 +68,7 @@ julia> m.dtm [2, 6] = 1 ``` """ -function DocumentTermMatrix(crps::Corpus, terms::Vector{T}) where T +function DocumentTermMatrix(crps::Corpus, terms::Vector{T}) where {T} column_indices = columnindices(terms) m = length(crps) @@ -101,7 +101,7 @@ DocumentTermMatrix(crps::Corpus) = DocumentTermMatrix(crps, lexicon(crps)) DocumentTermMatrix(crps::Corpus, lex::AbstractDict) = DocumentTermMatrix(crps, sort(collect(keys(lex)))) -DocumentTermMatrix(dtm::SparseMatrixCSC{Int, Int},terms::Vector{T}) where T = DocumentTermMatrix{T}(dtm, terms, columnindices(terms)) +DocumentTermMatrix(dtm::SparseMatrixCSC{Int,Int}, terms::Vector{T}) where {T} = DocumentTermMatrix{T}(dtm, terms, columnindices(terms)) """ dtm(crps::Corpus) @@ -166,7 +166,7 @@ tdm(crps::Corpus) = dtm(crps)' #' # ############################################################################## -function dtm_entries(d::AbstractDocument, lex::Dict{T, Int}) where T +function dtm_entries(d::AbstractDocument, lex::Dict{T,Int}) where {T} ngs = ngrams(d) indices = Int[] values = Int[] @@ -198,7 +198,7 @@ julia> dtv(crps[1], lexicon(crps)) 1 2 0 1 1 1 ``` """ -function dtv(d::AbstractDocument, lex::Dict{T, Int}) where T +function dtv(d::AbstractDocument, lex::Dict{T,Int}) where {T} p = length(keys(lex)) row = zeros(Int, 1, p) indices, values = dtm_entries(d, lex) @@ -342,9 +342,9 @@ function prune!(dtm::DocumentTermMatrix{T}, document_positions; compact::Bool=tr end if any(termcols_to_delete) - dtm.dtm = dtm_matrix[:,[!termcols_to_delete[idx] for idx in 1:length(termcols_to_delete)]] + dtm.dtm = dtm_matrix[:, [!termcols_to_delete[idx] for idx in 1:length(termcols_to_delete)]] dtm.terms = [dtm.terms[idx] for idx in 1:length(dtm.terms) if !termcols_to_delete[idx]] - dtm.column_indices = Dict{T,Int}(term => idx for (idx,term) in enumerate(dtm.terms)) + dtm.column_indices = Dict{T,Int}(term => idx for (idx, term) in enumerate(dtm.terms)) else dtm.dtm = dtm_matrix end @@ -361,7 +361,7 @@ For efficiency, this may result in modifications to dtm2 as well. function merge!(dtm1::DocumentTermMatrix{T}, dtm2::DocumentTermMatrix{T}) where {T} (length(dtm2.dtm) == 0) && (return dtm1) - ncombined_docs = size(dtm1.dtm,1) + size(dtm2.dtm,1) + ncombined_docs = size(dtm1.dtm, 1) + size(dtm2.dtm, 1) termset1 = Set(dtm1.terms) termset2 = Set(dtm2.terms) termset = union(termset1, termset2) @@ -377,24 +377,24 @@ function merge!(dtm1::DocumentTermMatrix{T}, dtm2::DocumentTermMatrix{T}) where function permute_terms!(dtm_to_permute, terms) (length(dtm_to_permute) == 0) && (return dtm_to_permute) - terms_perm = map(x->(x===nothing) ? 0 : x, indexin(combined_terms, terms)) + terms_perm = map(x -> (x === nothing) ? 
0 : x, indexin(combined_terms, terms)) remaining_cols = setdiff(1:ncombined_terms, terms_perm) for idx in 1:length(terms_perm) if terms_perm[idx] == 0 terms_perm[idx] = popfirst!(remaining_cols) end end - permute!(dtm_to_permute, 1:size(dtm_to_permute,1), terms_perm) + permute!(dtm_to_permute, 1:size(dtm_to_permute, 1), terms_perm) end function expand_columns(S, n) (S.n == n) && (return S) @assert (n > S.n) colptr = S.colptr - resize!(colptr, n+1) + resize!(colptr, n + 1) colptr[(S.n+2):(n+1)] .= colptr[S.n+1] SparseMatrixCSC(S.m, n, colptr, S.rowval, S.nzval) end - function row_append(A, B) + function row_append(A::AbstractMatrix, B::AbstractMatrix) @assert size(A, 2) == size(B, 2) isempty(A) && return B isempty(B) && return A @@ -418,14 +418,14 @@ function merge!(dtm1::DocumentTermMatrix{T}, dtm2::DocumentTermMatrix{T}) where # then copy from B nvalsB = B.colptr[col+1] - B.colptr[col] if nvalsB > 0 - C_rowvals[colptr_pos:(colptr_pos+nvalsB-1)] .= (B.rowval[B.colptr[col]:(B.colptr[col+1]-1)] .+ size(A,1)) + C_rowvals[colptr_pos:(colptr_pos+nvalsB-1)] .= (B.rowval[B.colptr[col]:(B.colptr[col+1]-1)] .+ size(A, 1)) C_nzval[colptr_pos:(colptr_pos+nvalsB-1)] .= B.nzval[B.colptr[col]:(B.colptr[col+1]-1)] offset += nvalsB colptr_pos += nvalsB end end - C_colptr[end] = length(C_rowvals)+1 - SparseMatrixCSC(size(A,1) + size(B,1), size(A,2), C_colptr, C_rowvals, C_nzval) + C_colptr[end] = length(C_rowvals) + 1 + SparseMatrixCSC(size(A, 1) + size(B, 1), size(A, 2), C_colptr, C_rowvals, C_nzval) end dtm1_matrix = (combined_terms === dtm1.terms) ? dtm1.dtm : permute_terms!(expand_columns(dtm1.dtm, ncombined_terms), dtm1.terms) @@ -435,7 +435,7 @@ function merge!(dtm1::DocumentTermMatrix{T}, dtm2::DocumentTermMatrix{T}) where # set new terms and recompute column_indices dtm1.dtm = combined_matrix dtm1.terms = combined_terms - dtm1.column_indices = Dict{T,Int}(term => idx for (idx,term) in enumerate(combined_terms)) + dtm1.column_indices = Dict{T,Int}(term => idx for (idx, term) in enumerate(combined_terms)) dtm1 end diff --git a/src/lda.jl b/src/lda.jl index 97c80253..2497b9c0 100644 --- a/src/lda.jl +++ b/src/lda.jl @@ -15,9 +15,9 @@ TopicBasedDocument(ntopics) = TopicBasedDocument(Vector{Int}(), Vector{Int}(), z mutable struct Topic count::Int - wordcount::Dict{Int, Int} + wordcount::Dict{Int,Int} end -Topic() = Topic(0, Dict{Int, Int}()) +Topic() = Topic(0, Dict{Int,Int}()) end @@ -37,8 +37,10 @@ Perform [Latent Dirichlet allocation](https://en.wikipedia.org/wiki/Latent_Diric - `ϕ`: `ntopics × nwords` Sparse matrix of probabilities s.t. `sum(ϕ, 1) == 1` - `θ`: `ntopics × ndocs` Dense matrix of probabilities s.t. `sum(θ, 1) == 1` """ -function lda(dtm::DocumentTermMatrix, ntopics::Int, iteration::Int, - alpha::Float64, beta::Float64; showprogress::Bool = true) +function lda( + dtm::DocumentTermMatrix, ntopics::Int, iteration::Int, + alpha::Float64, beta::Float64; showprogress::Bool=true +) number_of_documents, number_of_words = size(dtm.dtm) docs = [Lda.TopicBasedDocument(ntopics) for _ in 1:number_of_documents] @@ -69,7 +71,7 @@ function lda(dtm::DocumentTermMatrix, ntopics::Int, iteration::Int, wait_time = showprogress ? 
1.0 : Inf # Gibbs sampling - @showprogress dt=wait_time for _ in 1:iteration + @showprogress dt = wait_time for _ in 1:iteration for doc in docs for (i, word) in enumerate(doc.text) topicid_current = doc.topic[i] @@ -81,7 +83,7 @@ function lda(dtm::DocumentTermMatrix, ntopics::Int, iteration::Int, for target_topicid in 1:ntopics topicprob = (doc.topicidcount[target_topicid] + beta) / (document_lenth + beta * ntopics) topic = topics[target_topicid] - wordprob = (get(topic.wordcount, word, 0)+ alpha) / (topic.count + alpha * number_of_words) + wordprob = (get(topic.wordcount, word, 0) + alpha) / (topic.count + alpha * number_of_words) probs[target_topicid] = topicprob * wordprob end normalize_probs = sum(probs) diff --git a/src/lsa.jl b/src/lsa.jl index f552f7eb..21b24b7a 100644 --- a/src/lsa.jl +++ b/src/lsa.jl @@ -7,6 +7,6 @@ Performs Latent Semantic Analysis or LSA on a corpus. """ lsa(dtm::DocumentTermMatrix) = svd(Matrix(tf_idf(dtm))) function lsa(crps::Corpus) - update_lexicon!(crps) - svd(Matrix(tf_idf(DocumentTermMatrix(crps)))) + update_lexicon!(crps) + svd(Matrix(tf_idf(DocumentTermMatrix(crps)))) end diff --git a/src/metadata.jl b/src/metadata.jl index b9b98c05..5a8a457a 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -129,7 +129,7 @@ See also: [`timestamps!`](@ref), [`timestamp`](@ref), [`timestamp!`](@ref) timestamps(c::Corpus) = map(d -> timestamp(d), documents(c)) titles!(c::Corpus, nv::AbstractString) = title!.(documents(c), nv) -languages!(c::Corpus, nv::T) where {T <: Language} = language!.(documents(c), Ref(nv)) #Ref to force scalar broadcast +languages!(c::Corpus, nv::T) where {T<:Language} = language!.(documents(c), Ref(nv)) #Ref to force scalar broadcast authors!(c::Corpus, nv::AbstractString) = author!.(documents(c), Ref(nv)) timestamps!(c::Corpus, nv::AbstractString) = timestamp!.(documents(c), Ref(nv)) @@ -160,7 +160,7 @@ If the input is a Vector, then language of the `i`th document is set to the `i`t See also: [`languages`](@ref), [`language!`](@ref), [`language`](@ref) """ -function languages!(c::Corpus, nvs::Vector{T}) where T <: Language +function languages!(c::Corpus, nvs::Vector{T}) where {T<:Language} length(c) == length(nvs) || throw(DimensionMismatch("dimensions must match")) for (i, d) in pairs(IndexLinear(), documents(c)) language!(d, nvs[i]) diff --git a/src/ngramizer.jl b/src/ngramizer.jl index 1a8ade7b..1b84ff3e 100644 --- a/src/ngramizer.jl +++ b/src/ngramizer.jl @@ -13,16 +13,16 @@ Dict{AbstractString,Int64} with 3 entries: "To be or" => 1 ``` """ -function ngramize(lang::S, words::Vector{T}, nlist::Integer...) where {S <: Language, T <: AbstractString} +function ngramize(lang::S, words::Vector{T}, nlist::Integer...) 
where {S<:Language,T<:AbstractString} (length(nlist) == 1) && (first(nlist) == 1) && return onegramize(lang, words) n_words = length(words) - tokens = Dict{AbstractString, Int}() + tokens = Dict{AbstractString,Int}() for n in nlist - for index in 1:(n_words - n + 1) - token = join(words[index:(index + n - 1)], " ") + for index in 1:(n_words-n+1) + token = join(words[index:(index+n-1)], " ") tokens[token] = get(tokens, token, 0) + 1 end end @@ -48,9 +48,8 @@ Dict{String,Int64} with 5 entries: "be" => 2 ``` """ -function onegramize(lang::S, words::Vector{T}) where {S <: Language, T <: AbstractString} - n_words = length(words) - tokens = Dict{T, Int}() +function onegramize(lang::S, words::Vector{T}) where {S<:Language,T<:AbstractString} + tokens = Dict{T,Int}() for word in words tokens[word] = get(tokens, word, 0) + 1 diff --git a/src/preprocessing.jl b/src/preprocessing.jl index 8d239c02..f9a5ec49 100644 --- a/src/preprocessing.jl +++ b/src/preprocessing.jl @@ -26,7 +26,7 @@ const strip_html_tags = UInt32(0x1) << 20 const alpha_sparse = 0.05 const alpha_frequent = 0.95 -const regex_cache = Dict{AbstractString, Regex}() +const regex_cache = Dict{AbstractString,Regex}() function mk_regex(regex_string) d = haskey(regex_cache, regex_string) ? regex_cache[regex_string] : @@ -42,7 +42,7 @@ Remove corrupt UTF8 characters in `str`. See also: [`remove_corrupt_utf8!`](@ref) """ function remove_corrupt_utf8(s::AbstractString) - return map(x->isvalid(x) ? x : ' ', s) + return map(x -> isvalid(x) ? x : ' ', s) end remove_corrupt_utf8!(d::FileDocument) = error("FileDocument cannot be modified") @@ -66,7 +66,7 @@ function remove_corrupt_utf8!(d::TokenDocument) end function remove_corrupt_utf8!(d::NGramDocument) - new_ngrams = Dict{AbstractString, Int}() + new_ngrams = Dict{AbstractString,Int}() for token in keys(d.ngrams) new_token = remove_corrupt_utf8(token) count = get(new_ngrams, new_token, 0) @@ -86,7 +86,7 @@ end Convert `str` to lowercase. See also: [`remove_case!`](@ref) """ -remove_case(s::T) where {T <: AbstractString} = lowercase(s) +remove_case(s::T) where {T<:AbstractString} = lowercase(s) """ @@ -124,7 +124,7 @@ function remove_case!(d::TokenDocument) end function remove_case!(d::NGramDocument) - new_ngrams = Dict{AbstractString, Int}() + new_ngrams = Dict{AbstractString,Int}() for token in keys(d.ngrams) new_token = remove_case(token) count = get(new_ngrams, new_token, 0) @@ -215,10 +215,10 @@ julia> sd.text ``` """ function remove_words!(entity::(Union{AbstractDocument,Corpus}), - words::Vector{T}) where T <: AbstractString + words::Vector{T}) where {T<:AbstractString} skipwords = Set{AbstractString}() union!(skipwords, words) - prepare!(entity, strip_patterns, skip_words = skipwords) + prepare!(entity, strip_patterns, skip_words=skipwords) end @@ -229,7 +229,7 @@ end # ############################################################################## -function tag_pos!(entity::Union{Corpus, TokenDocument, StringDocument}) +function tag_pos!(entity::Union{Corpus,TokenDocument,StringDocument}) @warn "tag_pos! 
is deprecated, Use Perceptrontagger instead" tagger = PerceptronTagger(true) if typeof(entity) == Corpus @@ -260,7 +260,7 @@ julia> sparse_terms(crps, 0.5) ``` See also: [`remove_sparse_terms!`](@ref), [`frequent_terms`](@ref) """ -function sparse_terms(crps::Corpus, alpha::Real = alpha_sparse) +function sparse_terms(crps::Corpus, alpha::Real=alpha_sparse) update_lexicon!(crps) update_inverse_index!(crps) res = Array{String}(undef, 0) @@ -296,7 +296,7 @@ julia> frequent_terms(crps) ``` See also: [`remove_frequent_terms!`](@ref), [`sparse_terms`](@ref) """ -function frequent_terms(crps::Corpus, alpha::Real = alpha_frequent) +function frequent_terms(crps::Corpus, alpha::Real=alpha_frequent) update_lexicon!(crps) update_inverse_index!(crps) res = Array{String}(undef, 0) @@ -332,7 +332,7 @@ julia> crps[2].text ``` See also: [`remove_frequent_terms!`](@ref), [`sparse_terms`](@ref) """ -remove_sparse_terms!(crps::Corpus, alpha::Real = alpha_sparse) = remove_words!(crps, sparse_terms(crps, alpha)) +remove_sparse_terms!(crps::Corpus, alpha::Real=alpha_sparse) = remove_words!(crps, sparse_terms(crps, alpha)) """ remove_frequent_terms!(crps, alpha=0.95) @@ -356,7 +356,7 @@ julia> text(crps[2]) ``` See also: [`remove_sparse_terms!`](@ref), [`frequent_terms`](@ref) """ -remove_frequent_terms!(crps::Corpus, alpha::Real = alpha_frequent) = remove_words!(crps, frequent_terms(crps, alpha)) +remove_frequent_terms!(crps::Corpus, alpha::Real=alpha_frequent) = remove_words!(crps, frequent_terms(crps, alpha)) """ @@ -396,7 +396,10 @@ julia> text(doc) "This is document of " ``` """ -function prepare!(crps::Corpus, flags::UInt32; skip_patterns = Set{AbstractString}(), skip_words = Set{AbstractString}()) +function prepare!( + crps::Corpus, flags::UInt32; + skip_patterns=Set{AbstractString}(), skip_words=Set{AbstractString}() +) ((flags & strip_sparse_terms) > 0) && union!(skip_words, sparse_terms(crps)) ((flags & strip_frequent_terms) > 0) && union!(skip_words, frequent_terms(crps)) @@ -415,7 +418,10 @@ function prepare!(crps::Corpus, flags::UInt32; skip_patterns = Set{AbstractStrin nothing end -function prepare!(d::AbstractDocument, flags::UInt32; skip_patterns = Set{AbstractString}(), skip_words = Set{AbstractString}()) +function prepare!( + d::AbstractDocument, flags::UInt32; + skip_patterns=Set{AbstractString}(), skip_words=Set{AbstractString}() +) ((flags & strip_corrupt_utf8) > 0) && remove_corrupt_utf8!(d) ((flags & strip_case) > 0) && remove_case!(d) ((flags & strip_html_tags) > 0) && remove_html_tags!(d) @@ -431,16 +437,18 @@ end """ remove_whitespace(str) + Squash multiple whitespaces to a single one. And remove all leading and trailing whitespaces. See also: [`remove_whitespace!`](@ref) """ -remove_whitespace(str::AbstractString) = replace(strip(str), r"\s+"=>" ") +remove_whitespace(str::AbstractString) = replace(strip(str), r"\s+" => " ") """ remove_whitespace!(doc) remove_whitespace!(crps) + Squash multiple whitespaces to a single space and remove all leading and trailing whitespaces in document or crps. Does no-op for `FileDocument`, `TokenDocument` or `NGramDocument`. See also: [`remove_whitespace`](@ref) @@ -461,6 +469,7 @@ end """ remove_patterns(str, rex::Regex) + Remove the part of str matched by rex. See also: [`remove_patterns!`](@ref) """ @@ -490,6 +499,7 @@ end """ remove_patterns!(doc, rex::Regex) remove_patterns!(crps, rex::Regex) + Remove patterns matched by `rex` in document or Corpus. Does not modify `FileDocument` or Corpus containing `FileDocument`. 
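Reviewer note: a hedged sketch of the in-place variant, assuming the `StringDocument` method of `remove_patterns!` (only the Token/NGram methods are visible in this hunk); the regex and text are illustrative, and the exact whitespace left by the replacement is not asserted here.

```julia
using TextAnalysis

sd = StringDocument("quick brown fox 123 jumps 456")
remove_patterns!(sd, r"\d+")  # strip the digit runs in place
text(sd)                      # digits removed, remaining text preserved
```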
See also: [`remove_patterns`](@ref) @@ -508,7 +518,7 @@ function remove_patterns!(d::TokenDocument, rex::Regex) end function remove_patterns!(d::NGramDocument, rex::Regex) - new_ngrams = Dict{AbstractString, Int}() + new_ngrams = Dict{AbstractString,Int}() for token in keys(d.ngrams) new_token = remove_patterns(token, rex) count = get(new_ngrams, new_token, 0) @@ -528,9 +538,9 @@ end # internal helper methods _build_regex(lang, flags::UInt32) = _build_regex(lang, flags, Set{AbstractString}(), Set{AbstractString}()) -_build_regex(lang, flags::UInt32, patterns::Set{T}, words::Set{T}) where {T <: AbstractString} = _combine_regex(_build_regex_patterns(lang, flags, patterns, words)) +_build_regex(lang, flags::UInt32, patterns::Set{T}, words::Set{T}) where {T<:AbstractString} = _combine_regex(_build_regex_patterns(lang, flags, patterns, words)) -function _combine_regex(regex_parts::Set{T}) where T <: AbstractString +function _combine_regex(regex_parts::Set{T}) where {T<:AbstractString} l = length(regex_parts) (0 == l) && return r"" (1 == l) && return mk_regex(pop!(regex_parts)) @@ -543,7 +553,7 @@ function _combine_regex(regex_parts::Set{T}) where T <: AbstractString mk_regex(String(take!(iob))) end -function _build_regex_patterns(lang, flags::UInt32, patterns::Set{T}, words::Set{T}) where T <: AbstractString +function _build_regex_patterns(lang, flags::UInt32, patterns::Set{T}, words::Set{T}) where {T<:AbstractString} #((flags & strip_whitespace) > 0) && push!(patterns, "\\s+") if (flags & strip_non_letters) > 0 push!(patterns, "[^\\p{L}\\s]") @@ -566,7 +576,7 @@ function _build_regex_patterns(lang, flags::UInt32, patterns::Set{T}, words::Set patterns end -function _build_words_pattern(words::Vector{T}) where T <: AbstractString +function _build_words_pattern(words::Vector{T}) where {T<:AbstractString} isempty(words) && return "" iob = IOBuffer() diff --git a/src/show.jl b/src/show.jl index ce1363f8..22504b6b 100644 --- a/src/show.jl +++ b/src/show.jl @@ -10,7 +10,7 @@ function Base.summary(d::AbstractDocument) o *= " * Author: $(author(d))\n" o *= " * Timestamp: $(timestamp(d))\n" - if typeof(d) <: Union{TokenDocument, NGramDocument} + if typeof(d) <: Union{TokenDocument,NGramDocument} o *= " * Snippet: ***SAMPLE TEXT NOT AVAILABLE***" else sample_text = replace(first(text(d), 50), r"\s+" => " ") diff --git a/src/summarizer.jl b/src/summarizer.jl index b9d83f3a..d3bee689 100644 --- a/src/summarizer.jl +++ b/src/summarizer.jl @@ -24,7 +24,7 @@ function summarize(d::AbstractDocument; ns=5) num_sentences = length(sentences) s = StringDocument.(sentences) c = Corpus(s) - prepare!(c, strip_case | strip_stopwords | stem_words ) + prepare!(c, strip_case | strip_stopwords | stem_words) update_lexicon!(c) t = tf_idf(dtm(c)) T = t * t' @@ -32,19 +32,19 @@ function summarize(d::AbstractDocument; ns=5) return sentences[sort(sortperm(vec(p), rev=true)[1:min(ns, num_sentences)])] end -function pagerank( A; Niter=20, damping=.15) - Nmax = size(A, 1) - r = rand(1,Nmax); # Generate a random starting rank. - r = r ./ norm(r,1); # Normalize - a = (1-damping) ./ Nmax; # Create damping vector +function pagerank(A; n_iter=20, damping=0.15) + nmax = size(A, 1) + r = rand(1, nmax) # Generate a random starting rank. + r = r ./ norm(r, 1) # Normalize + a = (1 - damping) ./ nmax # Create damping vector - for i=1:Niter - s = r * A - rmul!(s, damping) - r = s .+ (a * sum(r, dims=2)); # Compute PageRank. - end + for _ = 1:n_iter + s = r * A + rmul!(s, damping) + r = s .+ (a * sum(r, dims=2)) # Compute PageRank. 
+ end - r = r./norm(r,1); + r = r ./ norm(r, 1) - return r + return r end diff --git a/src/tagging_schemes.jl b/src/tagging_schemes.jl index 8b13d5fd..83d8611e 100644 --- a/src/tagging_schemes.jl +++ b/src/tagging_schemes.jl @@ -64,7 +64,7 @@ function tag_scheme!(tags, current_scheme::BIO1, new_scheme::BIO2) if tags[i][1] == 'I' if i == 1 tags[i] = 'B' * tags[i][2:end] - elseif tags[i - 1] == "O" || tags[i - 1][2:end] != tags[i][2:end] + elseif tags[i-1] == "O" || tags[i-1][2:end] != tags[i][2:end] tags[i] = 'B' * tags[i][2:end] else continue @@ -86,7 +86,7 @@ function tag_scheme!(tags, current_scheme::BIO2, new_scheme::BIO1) if tags[i][1] == 'B' if i == length(tags) tags[i] = 'I' * tags[i][2:end] - elseif tags[i + 1] == "O" || tags[i + 1][2:end] != tags[i][2:end] + elseif tags[i+1] == "O" || tags[i+1][2:end] != tags[i][2:end] tags[i] = 'I' * tags[i][2:end] else continue @@ -108,7 +108,7 @@ function tag_scheme!(tags, current_scheme::BIO2, new_scheme::BIOES) tags[i+1][2:end] != tags[i][2:end]) tags[i] = 'E' * tags[i][2:end] elseif tags[i][1] == 'B' && (i == length(tags) || - tags[i+1][2:end] != tags[i][2:end]) + tags[i+1][2:end] != tags[i][2:end]) tags[i] = 'S' * tags[i][2:end] else (tags[i][1] == 'I' || tags[i][1] == 'B') && continue diff --git a/src/tf_idf.jl b/src/tf_idf.jl index 202cb871..7ccc33b9 100644 --- a/src/tf_idf.jl +++ b/src/tf_idf.jl @@ -7,7 +7,7 @@ Works correctly if `dtm` and `tf` are same matrix. See also: [`tf`](@ref), [`tf_idf`](@ref), [`tf_idf!`](@ref) """ -function tf!(dtm::AbstractMatrix{T1}, tf::AbstractMatrix{T2}) where {T1 <: Real, T2 <: AbstractFloat} +function tf!(dtm::AbstractMatrix{T1}, tf::AbstractMatrix{T2}) where {T1<:Real,T2<:AbstractFloat} n, p = size(dtm) # TF tells us what proportion of a document is defined by a term @@ -31,28 +31,28 @@ Overwrite `tf` with the term frequency of the `dtm`. 
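Reviewer note: for orientation while reviewing the reformatted loop, a worked micro-example of the proportion being computed (rows are documents, columns are terms); the counts are illustrative, and the expected values assume the same per-document `sum(dtm, dims=2)` normalisation shown in the sparse method below.

```julia
using TextAnalysis

counts = [1 2 1 0;   # document 1: 4 tokens in total
          0 0 1 1]   # document 2: 2 tokens in total

tf(counts)
# expected proportions per row:
# 0.25  0.5  0.25  0.0
# 0.0   0.0  0.5   0.5
```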
See also: [`tf`](@ref), [`tf_idf`](@ref), [`tf_idf!`](@ref) """ -function tf!(dtm::SparseMatrixCSC{T}, tf::SparseMatrixCSC{F}) where {T <: Real, F <: AbstractFloat} +function tf!(dtm::SparseMatrixCSC{T}, tf::SparseMatrixCSC{F}) where {T<:Real,F<:AbstractFloat} rows = rowvals(dtm) dtmvals = nonzeros(dtm) tfvals = nonzeros(tf) @assert size(dtmvals) == size(tfvals) # TF tells us what proportion of a document is defined by a term - words_in_documents = sum(dtm,dims=2) + words_in_documents = sum(dtm, dims=2) n, p = size(dtm) for i = 1:p - for j in nzrange(dtm, i) - row = rows[j] - tfvals[j] = dtmvals[j] / max(words_in_documents[row], one(T)) - end + for j in nzrange(dtm, i) + row = rows[j] + tfvals[j] = dtmvals[j] / max(words_in_documents[row], one(T)) + end end return tf end -tf!(dtm::AbstractMatrix{T}) where {T <: Real} = tf!(dtm, dtm) +tf!(dtm::AbstractMatrix{T}) where {T<:Real} = tf!(dtm, dtm) -tf!(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf!(dtm, dtm) +tf!(dtm::SparseMatrixCSC{T}) where {T<:Real} = tf!(dtm, dtm) """ tf(dtm::DocumentTermMatrix) @@ -89,9 +89,9 @@ See also: [`tf!`](@ref), [`tf_idf`](@ref), [`tf_idf!`](@ref) """ tf(dtm::DocumentTermMatrix) = tf(dtm.dtm) -tf(dtm::Matrix{T}) where {T <: Real} = tf!(dtm, Array{Float64}(undef, size(dtm)...)) +tf(dtm::Matrix{T}) where {T<:Real} = tf!(dtm, Array{Float64}(undef, size(dtm)...)) -tf(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf!(dtm, similar(dtm, Float64)) +tf(dtm::SparseMatrixCSC{T}) where {T<:Real} = tf!(dtm, similar(dtm, Float64)) """ tf_idf!(dtm::AbstractMatrix{Real}, tf_idf::AbstractMatrix{AbstractFloat}) @@ -102,7 +102,7 @@ Overwrite `tf_idf` with the tf-idf (Term Frequency - Inverse Doc Frequency) of t See also: [`tf`](@ref), [`tf!`](@ref) , [`tf_idf`](@ref) """ -function tf_idf!(dtm::AbstractMatrix{T1}, tfidf::AbstractMatrix{T2}) where {T1 <: Real, T2 <: AbstractFloat} +function tf_idf!(dtm::AbstractMatrix{T1}, tfidf::AbstractMatrix{T2}) where {T1<:Real,T2<:AbstractFloat} n, p = size(dtm) # TF tells us what proportion of a document is defined by a term @@ -131,7 +131,7 @@ The arguments must have same number of nonzeros. 
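Reviewer note: for the tf-idf hunk below, a scalar sketch of the quantity the loop fills in, namely term frequency normalised by document length and scaled by `log(n / documents_containing_term)` as in the visible body; the numbers are illustrative and this does not call the package.

```julia
# One document of 4 tokens in which a term occurs twice,
# in a corpus of 8 documents, 2 of which contain that term.
tf_value  = 2 / 4                 # proportion of the document made of the term
idf_value = log(8 / 2)            # how rare the term is across the corpus
tfidf     = tf_value * idf_value  # ≈ 0.693
```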
See also: [`tf`](@ref), [`tf_idf`](@ref), [`tf_idf!`](@ref) """ -function tf_idf!(dtm::SparseMatrixCSC{T}, tfidf::SparseMatrixCSC{F}) where {T <: Real, F <: AbstractFloat} +function tf_idf!(dtm::SparseMatrixCSC{T}, tfidf::SparseMatrixCSC{F}) where {T<:Real,F<:AbstractFloat} rows = rowvals(dtm) dtmvals = nonzeros(dtm) tfidfvals = nonzeros(tfidf) @@ -148,10 +148,10 @@ function tf_idf!(dtm::SparseMatrixCSC{T}, tfidf::SparseMatrixCSC{F}) where {T <: idf = log.(n ./ documents_containing_term) for i = 1:p - for j in nzrange(dtm, i) - row = rows[j] - tfidfvals[j] = dtmvals[j] / max(words_in_documents[row], oneval) * idf[i] - end + for j in nzrange(dtm, i) + row = rows[j] + tfidfvals[j] = dtmvals[j] / max(words_in_documents[row], oneval) * idf[i] + end end return tfidf @@ -162,9 +162,9 @@ end Compute tf-idf for `dtm` """ -tf_idf!(dtm::AbstractMatrix{T}) where {T <: Real} = tf_idf!(dtm, dtm) +tf_idf!(dtm::AbstractMatrix{T}) where {T<:Real} = tf_idf!(dtm, dtm) -tf_idf!(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf_idf!(dtm, dtm) +tf_idf!(dtm::SparseMatrixCSC{T}) where {T<:Real} = tf_idf!(dtm, dtm) # This does not make sense, since DocumentTermMatrix is based on an array of integers #tf_idf!(dtm::DocumentTermMatrix) = tf_idf!(dtm.dtm) @@ -211,15 +211,16 @@ See also: [`tf!`](@ref), [`tf_idf`](@ref), [`tf_idf!`](@ref) """ tf_idf(dtm::DocumentTermMatrix) = tf_idf(dtm.dtm) -tf_idf(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf_idf!(dtm, similar(dtm, Float64)) +tf_idf(dtm::SparseMatrixCSC{T}) where {T<:Real} = tf_idf!(dtm, similar(dtm, Float64)) -tf_idf(dtm::Matrix{T}) where {T <: Real} = tf_idf!(dtm, Array{Float64}(undef, size(dtm)...)) +tf_idf(dtm::Matrix{T}) where {T<:Real} = tf_idf!(dtm, Array{Float64}(undef, size(dtm)...)) -function bm_25!(dtm::AbstractMatrix{T}, - bm25::AbstractMatrix{F}; - κ::Int=2, - β::Float64=0.75 - ) where {T<:Real, F<:AbstractFloat} +function bm_25!( + dtm::AbstractMatrix{T}, + bm25::AbstractMatrix{F}; + κ::Int=2, + β::Float64=0.75 +) where {T<:Real,F<:AbstractFloat} @assert size(dtm) == size(bm25) # Initializations k = F(κ) @@ -228,7 +229,7 @@ function bm_25!(dtm::AbstractMatrix{T}, oneval = one(F) # TF tells us what proportion of a document is defined by a term words_in_documents = F.(sum(dtm, dims=1)) - ln = words_in_documents./mean(words_in_documents) + ln = words_in_documents ./ mean(words_in_documents) # IDF tells us how rare a term is in the corpus documents_containing_term = vec(sum(dtm .> 0, dims=2)) .+ one(T) idf = log.(n ./ documents_containing_term) .+ oneval @@ -237,18 +238,19 @@ function bm_25!(dtm::AbstractMatrix{T}, @inbounds @simd for i in 1:n for j in 1:p bm25[j, i] = idf[j] * - ((k + 1) * bm25[j, i]) / - (k * (oneval - b + b * ln[i]) + bm25[j, i]) + ((k + 1) * bm25[j, i]) / + (k * (oneval - b + b * ln[i]) + bm25[j, i]) end end return bm25 end -function bm_25!(dtm::SparseMatrixCSC{T}, - bm25::SparseMatrixCSC{F}; - κ::Int=2, - β::Float64=0.75 - ) where {T<:Real, F<:AbstractFloat} +function bm_25!( + dtm::SparseMatrixCSC{T}, + bm25::SparseMatrixCSC{F}; + κ::Int=2, + β::Float64=0.75 +) where {T<:Real,F<:AbstractFloat} @assert size(dtm) == size(bm25) # Initializations k = F(κ) @@ -260,26 +262,26 @@ function bm_25!(dtm::SparseMatrixCSC{T}, p, n = size(dtm) # TF tells us what proportion of a document is defined by a term words_in_documents = F.(sum(dtm, dims=1)) - ln = words_in_documents./mean(words_in_documents) + ln = words_in_documents ./ mean(words_in_documents) oneval = one(F) # IDF tells us how rare a term is in the corpus documents_containing_term = 
vec(sum(dtm .> 0, dims=2)) .+ one(T) idf = log.(n ./ documents_containing_term) .+ oneval for i = 1:n - for j in nzrange(dtm, i) - row = rows[j] - tf = sqrt.(dtmvals[j] / max(words_in_documents[i], oneval)) - bm25vals[j] = idf[row] * ((k + 1) * tf) / - (k * (oneval - b + b * ln[i]) + tf) - end + for j in nzrange(dtm, i) + row = rows[j] + tf = sqrt.(dtmvals[j] / max(words_in_documents[i], oneval)) + bm25vals[j] = idf[row] * ((k + 1) * tf) / + (k * (oneval - b + b * ln[i]) + tf) + end end return bm25 end -bm_25(dtm::AbstractMatrix{T}; κ::Int=2, β::Float64=0.75) where T<:Integer = +bm_25(dtm::AbstractMatrix{T}; κ::Int=2, β::Float64=0.75) where {T<:Integer} = bm_25!(dtm, similar(dtm, Float64), κ=κ, β=β) -bm_25(dtm::AbstractMatrix{T}; κ::Int=2, β::Float64=0.75) where T<:AbstractFloat = +bm_25(dtm::AbstractMatrix{T}; κ::Int=2, β::Float64=0.75) where {T<:AbstractFloat} = bm_25!(dtm, similar(dtm, T), κ=κ, β=β) bm_25(dtm::DocumentTermMatrix; κ::Int=2, β::Float64=0.75) = @@ -291,7 +293,7 @@ bm_25!(dtm::DocumentTermMatrix; κ::Int=2, β::Float64=0.75) = # The score was modified according to for bm25: # https://opensourceconnections.com/blog/2015/10/16/bm25-the-next-generation-of-lucene-relevation/ function tf_bm25!(dtm::AbstractMatrix{T}, tf::AbstractMatrix{F} - ) where {T<:Real, F<:AbstractFloat} +) where {T<:Real,F<:AbstractFloat} @assert size(dtm) == size(tf) p, n = size(dtm) # TF tells us what proportion of a document is defined by a term diff --git a/src/tokenizer.jl b/src/tokenizer.jl index 78ef1c9f..818b3e28 100644 --- a/src/tokenizer.jl +++ b/src/tokenizer.jl @@ -16,7 +16,7 @@ julia> tokenize(Languages.English(), "Too foo words!") See also: [`sentence_tokenize`](@ref) """ -tokenize(lang::S, s::T) where {S <: Language, T <: AbstractString} = WordTokenizers.tokenize(s) +tokenize(lang::S, s::T) where {S<:Language,T<:AbstractString} = WordTokenizers.tokenize(s) """ @@ -34,4 +34,4 @@ julia> sentence_tokenize(Languages.English(), "Here are few words! I am Foo Bar. 
See also: [`tokenize`](@ref) """ -sentence_tokenize(lang::S, s::T) where {S <: Language, T<:AbstractString} = WordTokenizers.split_sentences(s) +sentence_tokenize(lang::S, s::T) where {S<:Language,T<:AbstractString} = WordTokenizers.split_sentences(s) diff --git a/test/LM.jl b/test/LM.jl index 486de598..bf5d501e 100644 --- a/test/LM.jl +++ b/test/LM.jl @@ -1,7 +1,7 @@ using DataStructures @testset "Vocabulary" begin - + words = ["a", "c", "-", "d", "c", "a", "b", "r", "a", "c", "d"] vocab = Vocabulary(words, 2, "") @test vocab isa Vocabulary @@ -12,37 +12,37 @@ using DataStructures @test length(vocab.vocab) == 4 #only 4 differnt string over word freq 2 @test isequal(vocab.unk_cutoff, 2) @test vocab.unk_label == "" - @test isequal(vocab.allword ,["a", "c", "-", "d", "c", "a", "b", "r", "a", "c", "d", ""]) - @test isequal(vocab.vocab, Dict{String,Int}(""=>1,"c"=>3,"a"=>3,"d"=>2)) + @test isequal(vocab.allword, ["a", "c", "-", "d", "c", "a", "b", "r", "a", "c", "d", ""]) + @test isequal(vocab.vocab, Dict{String,Int}("" => 1, "c" => 3, "a" => 3, "d" => 2)) #to check lookup function - @test lookup(vocab,["a", "b", "c", "alien"]) == ["a", "", "c", ""] + @test lookup(vocab, ["a", "b", "c", "alien"]) == ["a", "", "c", ""] word_set = ["", "is", "already", "there"] @test_throws ErrorException Vocabulary(word_set, 1, "") end @testset "preprocessing" begin - @testset "ngramizenew" begin + @testset "ngramizenew" begin sample_text = ["this", "is", "some", "sample", "text"] ngrams = TextAnalysis.ngramizenew(sample_text, 1) - + @test isequal(ngrams, ["this", "is", "some", "sample", "text"]) - - ngrams = TextAnalysis.ngramizenew(sample_text,2) + + ngrams = TextAnalysis.ngramizenew(sample_text, 2) @test isequal(ngrams, ["this is", "is some", "some sample", "sample text"]) - - ngrams = TextAnalysis.ngramizenew(sample_text,1,2) + + ngrams = TextAnalysis.ngramizenew(sample_text, 1, 2) @test isequal(ngrams, ["this", "is", "some", "sample", "text", "this is", "is some", "some sample", "sample text"]) end - + @testset "Padding function" begin example = ["1", "2", "3", "4", "5"] - padded=padding_ngram(example, 2, pad_left=true, pad_right=true) - @test isequal(padded,[" 1", "1 2", "2 3", "3 4", "4 5", "5 "]) + padded = padding_ngram(example, 2, pad_left=true, pad_right=true) + @test isequal(padded, [" 1", "1 2", "2 3", "3 4", "4 5", "5 "]) @test isequal(example, ["", "1", "2", "3", "4", "5", ""]) - + example = ["1", "2", "3", "4", "5"] #if used - padded=padding_ngram(example, 2, pad_right=true) - @test isequal(padded,["1 2", "2 3", "3 4", "4 5", "5 "]) + padded = padding_ngram(example, 2, pad_right=true) + @test isequal(padded, ["1 2", "2 3", "3 4", "4 5", "5 "]) end @testset "everygram function" begin example = ["1", "2", "3", "4", "5"] @@ -52,51 +52,51 @@ end end @testset "counter" begin - exam = ["To", "be", "or", "not", "to", "be","To", "be", "or", "not", "to", "be"] + exam = ["To", "be", "or", "not", "to", "be", "To", "be", "or", "not", "to", "be"] fit = (TextAnalysis.counter2(exam, 2, 2)) @test fit isa DataStructures.DefaultDict @test length(fit) == 5 #length of unique words end @testset "language model" begin - + @testset "MLE" begin - voc =["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"] + voc = ["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"] train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"] model = MLE(voc) fit = model(train, 2, 2) #considering only bigrams - unmaskedscore = score(model, fit, "is" ,"") + 
unmaskedscore = score(model, fit, "is", "") @test unmaskedscore == 0.3333333333333333 @test score(model, fit, "is", "alien") == Inf #context not in vocabulary @test score(model, fit, "alien", "is") == 0 # word not in vocabulary - end - + end + @testset "Lidstone" begin - voc =["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] + voc = ["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"] train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"] model2 = Lidstone(voc, 1.0) fit = model2(train, 2, 2) - @test score(model2, fit,"is", "alien") == 0.1 - @test score(model2, fit, "alien", "is") >= 0 + @test score(model2, fit, "is", "alien") == 0.1 + @test score(model2, fit, "alien", "is") >= 0 end @testset "Laplace" begin - voc =["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] + voc = ["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"] train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"] model3 = Laplace(voc) fit2 = model3(train, 2, 2) - @test score(model3, fit2,"is", "alien") == 0.1 + @test score(model3, fit2, "is", "alien") == 0.1 end @testset "WittenBellInterpolated" begin - voc =["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"] + voc = ["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"] train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"] model = WittenBellInterpolated(voc) fit = model(train, 2, 2) - @test score(model, fit,"is", "alien") == 0.2 + @test score(model, fit, "is", "alien") == 0.2 @test score(model, fit, "alien", "is") == 0.4 - @test score(model, fit,"alien") == 0.2 #should be non-zero + @test score(model, fit, "alien") == 0.2 #should be non-zero end @testset "KneserNeyInterpolated" begin - voc =["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"] + voc = ["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"] train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"] model = KneserNeyInterpolated(voc, 0.1) fit = model(train, 2, 2) @@ -104,4 +104,4 @@ end @test score(model, fit, "alien", "is") == 0.11000000000000001 end end - + diff --git a/test/bayes.jl b/test/bayes.jl index 11fd7f8c..3c6e341c 100644 --- a/test/bayes.jl +++ b/test/bayes.jl @@ -1,7 +1,7 @@ @testset "Bayes" begin - m=NaiveBayesClassifier([:spam, :ham]) - TextAnalysis.fit!(m, "this is ham", :ham); - TextAnalysis.fit!(m, "this is spam", :spam); + m = NaiveBayesClassifier([:spam, :ham]) + TextAnalysis.fit!(m, "this is ham", :ham) + TextAnalysis.fit!(m, "this is spam", :spam) r = TextAnalysis.predict(m, "is this spam?") @test r[:spam] > r[:ham] diff --git a/test/corpus.jl b/test/corpus.jl index 057432fa..044c89a6 100644 --- a/test/corpus.jl +++ b/test/corpus.jl @@ -16,7 +16,7 @@ documents(crps) for doc in crps - @test isa(doc, AbstractDocument) + @test isa(doc, AbstractDocument) end lexicon(crps) @@ -37,9 +37,9 @@ standardize!(crps, StringDocument) remove_words!(crps, [""]) update_lexicon!(crps) - answer = Dict("1"=> 2, "2"=> 1, "4"=> 1) + answer = Dict("1" => 2, "2" => 1, "4" => 1) - @test answer == lexicon(crps) + @test answer == lexicon(crps) end using DataFrames diff --git a/test/document.jl b/test/document.jl index e080f841..8ffa3ef3 100644 --- a/test/document.jl +++ b/test/document.jl @@ -1,13 +1,19 @@ @testset "Document" begin - dmeta = TextAnalysis.DocumentMetadata(Languages.English(), "test title", "test author", "test 
time", Dict(:k1=>"v1", :k2=>"v2")) - @test (dmeta.language == Languages.English()) && - (dmeta.title == "test title") && - (dmeta.author == "test author") && - (dmeta.timestamp == "test time") && - (get(dmeta.custom, :k1, "") == "v1") && - (get(dmeta.custom, :k2, "") == "v2") + dmeta = TextAnalysis.DocumentMetadata( + Languages.English(), + "test title", + "test author", + "test time", + Dict(:k1 => "v1", :k2 => "v2") + ) + @test (dmeta.language == Languages.English()) && + (dmeta.title == "test title") && + (dmeta.author == "test author") && + (dmeta.timestamp == "test time") && + (get(dmeta.custom, :k1, "") == "v1") && + (get(dmeta.custom, :k2, "") == "v2") # mutability dmeta.custom = nothing @@ -79,8 +85,10 @@ @test isequal(length(Document("this is text")), 12) # NGramDocument creation with multiple ngram complexity - let N=((), (2,), (Int32(2),), (1,2), (Int32(1), Int16(2))), C=(1, 2, 2, [1,2], [1,2]), L=(4, 3, 3, 7, 7) - for (n,c,l) in zip(N,C,L) + let N = ((), (2,), (Int32(2),), (1, 2), (Int32(1), Int16(2))), + C = (1, 2, 2, [1, 2], [1, 2]), L = (4, 3, 3, 7, 7) + + for (n, c, l) in zip(N, C, L) ngd = NGramDocument(sample_text1, n...) @test ngram_complexity(ngd) == c @test length(ngd.ngrams) == l diff --git a/test/dtm.jl b/test/dtm.jl index 75e15fd7..0a2f01fd 100644 --- a/test/dtm.jl +++ b/test/dtm.jl @@ -33,23 +33,23 @@ # construct a DocumentTermMatrix from a crps and a custom terms vector terms = ["And", "notincrps"] - m = DocumentTermMatrix(crps,terms) - @test size(dtm(m),1) == length(terms) + m = DocumentTermMatrix(crps, terms) + @test size(dtm(m), 1) == length(terms) @test terms == m.terms - @test size(dtm(m),2) == length(crps) + @test size(dtm(m), 2) == length(crps) # construct a DocumentTermMatrix from a crps and a custom lexicon - lex = Dict("And"=>1, "notincrps"=>4) - m = DocumentTermMatrix(crps,lex) - @test size(dtm(m),1) == length(keys(lex)) - @test size(dtm(m),1) == length(m.terms) - @test size(dtm(m),2) == length(crps) + lex = Dict("And" => 1, "notincrps" => 4) + m = DocumentTermMatrix(crps, lex) + @test size(dtm(m), 1) == length(keys(lex)) + @test size(dtm(m), 1) == length(m.terms) + @test size(dtm(m), 2) == length(crps) # construct a DocumentTermMatrix from a dtm and terms vector terms = m.terms - m2 = DocumentTermMatrix(dtm1,terms) + m2 = DocumentTermMatrix(dtm1, terms) @test m.column_indices == m2.column_indices - m2 = DocumentTermMatrix(dtm1sp,terms) + m2 = DocumentTermMatrix(dtm1sp, terms) @test m.column_indices == m2.column_indices # test serialization and deserialization @@ -75,24 +75,24 @@ prune!(dtm1, nothing; compact=false) @test length(dtm1.terms) == 4 - @test size(dtm1.dtm) == (2,4) + @test size(dtm1.dtm) == (2, 4) prune!(dtm1, [1]; compact=false) @test length(dtm1.terms) == 4 - @test size(dtm1.dtm) == (1,4) + @test size(dtm1.dtm) == (1, 4) dtm1 = DocumentTermMatrix(crps1) prune!(dtm1, [1]; compact=true) @test length(dtm1.terms) == 3 - @test size(dtm1.dtm) == (1,3) + @test size(dtm1.dtm) == (1, 3) dtm1 = DocumentTermMatrix(crps1) prune!(dtm1, [1]; compact=true, retain_terms=["one"]) @test length(dtm1.terms) == 4 - @test size(dtm1.dtm) == (1,4) + @test size(dtm1.dtm) == (1, 4) merge!(dtm1, dtm2) - @test size(dtm1.dtm) == (3,5) + @test size(dtm1.dtm) == (3, 5) @test sum(dtm1.dtm, dims=(1,)) == [1 3 0 3 2] @test dtm1.terms == ["five", "four", "one", "three", "two"] @@ -100,13 +100,13 @@ dtm1.dtm = similar(dtm1.dtm, 0, dtm1.dtm.n) merge!(dtm1, dtm2) @test dtm1.terms == ["five", "four", "one", "three", "two"] - @test size(dtm1.dtm) == (2,5) + @test size(dtm1.dtm) 
== (2, 5) @test sum(dtm1.dtm, dims=(1,)) == [1 2 0 2 1] dtm2 = DocumentTermMatrix(crps2) dtm1.dtm = similar(dtm1.dtm, 0, dtm1.dtm.n) merge!(dtm2, dtm1) @test dtm2.terms == ["five", "four", "three", "two"] - @test size(dtm2.dtm) == (2,4) + @test size(dtm2.dtm) == (2, 4) @test sum(dtm2.dtm, dims=(1,)) == [1 2 2 1] end diff --git a/test/evaluation_metrics.jl b/test/evaluation_metrics.jl index 712c95a9..5ee15f26 100644 --- a/test/evaluation_metrics.jl +++ b/test/evaluation_metrics.jl @@ -11,16 +11,16 @@ using Test @test s.precision ≈ p && s.fmeasure ≈ f @test argmax([ - Score(0., 1., 2.), - Score(3., 0., 0.), - Score(0., 6., 1.) - ]) == Score(0., 1., 2.) + Score(0.0, 1.0, 2.0), + Score(3.0, 0.0, 0.0), + Score(0.0, 6.0, 1.0) + ]) == Score(0.0, 1.0, 2.0) @test average([ - Score(1., 10., 100.), - Score(2., 20., 200.), - Score(3., 30., 300.) - ]) == Score(2., 20., 200.) + Score(1.0, 10.0, 100.0), + Score(2.0, 20.0, 200.0), + Score(3.0, 30.0, 300.0) + ]) == Score(2.0, 20.0, 200.0) end @testset "Evaluation Metrics" begin diff --git a/test/ngramizer.jl b/test/ngramizer.jl index 620eda0d..e2f734d1 100644 --- a/test/ngramizer.jl +++ b/test/ngramizer.jl @@ -4,24 +4,27 @@ sample_text = "this is some sample text" tkns = TextAnalysis.tokenize(Languages.English(), sample_text) ngs = TextAnalysis.ngramize(Languages.English(), tkns, 1) - @test isequal(ngs, Dict{String,Int}("some" => 1, - "this" => 1, - "is" => 1, - "sample" => 1, - "text" => 1)) + @test isequal(ngs, Dict{String,Int}( + "some" => 1, + "this" => 1, + "is" => 1, + "sample" => 1, + "text" => 1)) ngs = TextAnalysis.ngramize(Languages.English(), tkns, 2) - @test isequal(ngs, Dict{String,Int}("this is" => 1, - "sample text" => 1, - "is some" => 1, - "some sample" => 1)) + @test isequal(ngs, Dict{String,Int}( + "this is" => 1, + "sample text" => 1, + "is some" => 1, + "some sample" => 1)) ngs = TextAnalysis.ngramize(Languages.English(), tkns, 1, 2) - @test isequal(ngs, Dict{String,Int}("this is" => 1, - "is some" => 1, - "some sample" => 1, - "sample text" => 1, - "this" => 1, - "is" => 1, - "some" => 1, - "sample" => 1, - "text" => 1)) + @test isequal(ngs, Dict{String,Int}( + "this is" => 1, + "is some" => 1, + "some sample" => 1, + "sample text" => 1, + "this" => 1, + "is" => 1, + "some" => 1, + "sample" => 1, + "text" => 1)) end diff --git a/test/preprocessing.jl b/test/preprocessing.jl index f986b49b..1d29b1f6 100644 --- a/test/preprocessing.jl +++ b/test/preprocessing.jl @@ -82,22 +82,22 @@ @test "Hello world" == strip(d.text) style_html_doc = StringDocument( - """ - - - - - - -

Hello

world - - - """ - ) + """ + + + + + + +

Hello

world + + + """ + ) remove_html_tags!(style_html_doc) @test "Hello world" == strip(style_html_doc.text) @@ -107,8 +107,8 @@ #Test frequent_terms sparse_terms crps = Corpus(StringDocument.(sample_texts)) - @test isempty(setdiff(frequent_terms(crps),["string","is"])) - @test isempty(setdiff(sparse_terms(crps,0.3),["!"])) + @test isempty(setdiff(frequent_terms(crps), ["string", "is"])) + @test isempty(setdiff(sparse_terms(crps, 0.3), ["!"])) #Tests strip_punctuation regex conditions str = Document("These punctuations should be removed [-.,:;,!?'\"[](){}|\`#\$%@^&*_+<>") @@ -131,9 +131,9 @@ @test isequal(doc.text, "this is sample text") crps = Corpus( - [StringDocument(" Document 1"), - StringDocument(" Document 2 ")] - ) + [StringDocument(" Document 1"), + StringDocument(" Document 2 ")] + ) prepare!(crps, strip_whitespace) @test isequal(crps[1].text, "Document 1") @test isequal(crps[2].text, "Document 2") diff --git a/test/stemmer.jl b/test/stemmer.jl index 3fcad296..02debaa8 100644 --- a/test/stemmer.jl +++ b/test/stemmer.jl @@ -9,8 +9,8 @@ using Snowball Snowball.release(stmr) end - test_cases = Dict{String, Any}( - "english" => Dict{AbstractString, AbstractString}( + test_cases = Dict{String,Any}( + "english" => Dict{AbstractString,AbstractString}( "working" => "work", "worker" => "worker", "aβc" => "aβc", @@ -20,7 +20,7 @@ using Snowball for (alg, test_words) in test_cases stmr = Stemmer(alg) - for (n,v) in test_words + for (n, v) in test_words @test v == stem(stmr, n) end end diff --git a/test/summarizer.jl b/test/summarizer.jl index 80eb7c54..a7ea971e 100644 --- a/test/summarizer.jl +++ b/test/summarizer.jl @@ -1,28 +1,28 @@ @testset "Summarizer" begin -d = StringDocument(""" - Discount retailer Poundworld has appointed administrators, putting 5,100 jobs at risk. - The move came after talks with a potential buyer, R Capital, collapsed leaving Poundworld with no option other than administration. - Poundworld, which serves two million customers a week from 355 stores, also trades under the Bargain Buys name. - Administrators Deloitte stress the stores will continue to trade as normal with no redundancies at this time. - It said in a statement: Like many high street retailers, Poundworld has suffered from high product cost inflation, decreasing footfall, weaker consumer confidence and an increasingly competitive discount retail market. - Clare Boardman, joint administrator at Deloitte, said: The retail trading environment in the UK remains extremely challenging and Poundworld has been seeking to address this through a restructure of its business. - Unfortunately, this has not been possible. - She said Deloitte believed a buyer could be found for the business, or at least part of it. - A spokesperson for Poundworlds owner, TPG said filing for administration had been a difficult decision. - Despite investing resources to strengthen the business, the decline in UK retail and changing consumer behaviour affected Poundworld significantly, they added. - """) + d = StringDocument(""" + Discount retailer Poundworld has appointed administrators, putting 5,100 jobs at risk. + The move came after talks with a potential buyer, R Capital, collapsed leaving Poundworld with no option other than administration. + Poundworld, which serves two million customers a week from 355 stores, also trades under the Bargain Buys name. + Administrators Deloitte stress the stores will continue to trade as normal with no redundancies at this time. 
+ It said in a statement: Like many high street retailers, Poundworld has suffered from high product cost inflation, decreasing footfall, weaker consumer confidence and an increasingly competitive discount retail market. + Clare Boardman, joint administrator at Deloitte, said: The retail trading environment in the UK remains extremely challenging and Poundworld has been seeking to address this through a restructure of its business. + Unfortunately, this has not been possible. + She said Deloitte believed a buyer could be found for the business, or at least part of it. + A spokesperson for Poundworlds owner, TPG said filing for administration had been a difficult decision. + Despite investing resources to strengthen the business, the decline in UK retail and changing consumer behaviour affected Poundworld significantly, they added. + """) -s = summarize(d) -@test length(s) == 5 -@test s[1] == "Discount retailer Poundworld has appointed administrators, putting 5,100 jobs at risk." + s = summarize(d) + @test length(s) == 5 + @test s[1] == "Discount retailer Poundworld has appointed administrators, putting 5,100 jobs at risk." -s = summarize(d; ns=2) -@test length(s) == 2 + s = summarize(d; ns=2) + @test length(s) == 2 -short_doc = StringDocument("These is a small document. It has only 2 sentences in it.") + short_doc = StringDocument("These is a small document. It has only 2 sentences in it.") -s = summarize(short_doc) -@test length(s) == 2 + s = summarize(short_doc) + @test length(s) == 2 end diff --git a/test/taggingschemes.jl b/test/taggingschemes.jl index 0a96c4d5..d1f51b47 100644 --- a/test/taggingschemes.jl +++ b/test/taggingschemes.jl @@ -12,10 +12,12 @@ end @testset "BIO1 and BIOES" begin - tags_BIO1 = ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "B-PER", - "I-PER", "I-PER"] - tags_BIOES = ["S-LOC", "O", "S-PER", "B-MISC", "E-MISC", "B-PER", - "I-PER", "E-PER"] + tags_BIO1 = [ + "I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "B-PER", + "I-PER", "I-PER"] + tags_BIOES = [ + "S-LOC", "O", "S-PER", "B-MISC", "E-MISC", "B-PER", + "I-PER", "E-PER"] output_tags = deepcopy(tags_BIO1) tag_scheme!(tags_BIO1, "BIO1", "BIOES") @@ -26,10 +28,12 @@ end @testset "BIO2 and BIOES" begin - tags_BIO2 = ["B-LOC", "O", "B-PER", "B-MISC", "I-MISC", "B-PER", - "I-PER", "I-PER"] - tags_BIOES = ["S-LOC", "O", "S-PER", "B-MISC", "E-MISC", "B-PER", - "I-PER", "E-PER"] + tags_BIO2 = [ + "B-LOC", "O", "B-PER", "B-MISC", "I-MISC", "B-PER", + "I-PER", "I-PER"] + tags_BIOES = [ + "S-LOC", "O", "S-PER", "B-MISC", "E-MISC", "B-PER", + "I-PER", "E-PER"] output_tags = deepcopy(tags_BIO2) tag_scheme!(tags_BIO2, "BIO2", "BIOES") diff --git a/test/tf_idf.jl b/test/tf_idf.jl index c326b224..56a0ab7d 100644 --- a/test/tf_idf.jl +++ b/test/tf_idf.jl @@ -12,10 +12,12 @@ m = DocumentTermMatrix(crps) # Terms are in alphabetical ordering - correctweights =[0.5 0.0 0.0 1/6 1/3 - 0.0 0.2 0.4 0.0 0.4 - 0.0 0.0 0.0 0.0 0.0 - 0.0 1/3 0.0 0.0 2/3] + correctweights = [ + 0.5 0.0 0.0 1/6 1/3 + 0.0 0.2 0.4 0.0 0.4 + 0.0 0.0 0.0 0.0 0.0 + 0.0 1/3 0.0 0.0 2/3 + ] myweights = tf(m) @test myweights == correctweights @@ -29,21 +31,23 @@ @test myweights ≈ correctweights @test typeof(myweights) <: Matrix - myweights = float(dtm(m)); + myweights = float(dtm(m)) tf!(myweights) @test myweights ≈ correctweights @test typeof(myweights) <: SparseMatrixCSC - myweights = float(dtm(m, :dense)); + myweights = float(dtm(m, :dense)) tf!(myweights) @test myweights ≈ correctweights @test typeof(myweights) <: Matrix # Terms are in alphabetical ordering - 
correctweights = [0.6931471805599453 0.0 0.0 0.23104906018664842 0.09589402415059362 - 0.0 0.13862943611198905 0.5545177444479562 0.0 0.11507282898071235 - 0.0 0.0 0.0 0.0 0.0 - 0.0 0.23104906018664842 0.0 0.0 0.19178804830118723] + correctweights = [ + 0.6931471805599453 0.0 0.0 0.23104906018664842 0.09589402415059362 + 0.0 0.13862943611198905 0.5545177444479562 0.0 0.11507282898071235 + 0.0 0.0 0.0 0.0 0.0 + 0.0 0.23104906018664842 0.0 0.0 0.19178804830118723 + ] myweights = tf_idf(m) @test myweights ≈ correctweights @@ -57,12 +61,12 @@ @test myweights ≈ correctweights @test typeof(myweights) <: Matrix - myweights = float(dtm(m)); + myweights = float(dtm(m)) tf_idf!(myweights) @test myweights ≈ correctweights @test typeof(myweights) <: SparseMatrixCSC - myweights = float(dtm(m, :dense)); + myweights = float(dtm(m, :dense)) tf_idf!(myweights) @test myweights ≈ correctweights @test typeof(myweights) <: Matrix @@ -80,15 +84,16 @@ max_tol = 1e-5 approx_eq(m1::AbstractMatrix{T}, m2::AbstractMatrix{T}; tol=1e-6 - ) where T = begin - t = all(abs.(m1-m2) .<= tol) + ) where {T} = begin + t = all(abs.(m1 - m2) .<= tol) return t end - correctweights = [1.29959 0.0 0.0 1.89031 0.405067; - 0.0 0.882404 1.54025 0.0 0.405067; - 0.0 0.0 0.0 0.0 0.0; - 0.0 1.40179 0.0 0.0 0.676646] + correctweights = [ + 1.29959 0.0 0.0 1.89031 0.405067; + 0.0 0.882404 1.54025 0.0 0.405067; + 0.0 0.0 0.0 0.0 0.0; + 0.0 1.40179 0.0 0.0 0.676646] myweights = bm_25(m) @@ -109,7 +114,7 @@ @test typeof(myweights) <: SparseMatrixCSC @test eltype(myweights) == typeof(1.0) - myweights = float(Matrix(dtm(crps))); + myweights = float(Matrix(dtm(crps))) myweights = bm_25(myweights) @test approx_eq(Float64.(myweights), correctweights, tol=max_tol) @test typeof(myweights) <: Matrix @@ -118,19 +123,19 @@ @test_throws MethodError bm_25!(DocumentTermMatrix(crps)) end - @testset "cosine similarity `cos_similarity()`" begin - crps = Corpus( StringDocument.([ - "to be or not to be", - "to sing or not to sing", - "to talk or to silence"]) ) - update_lexicon!(crps) - d = dtm(crps) - tfm = tf_idf(d) - cs = cos_similarity(tfm) - @test cs==[ - 1.0 0.03293177886245518 0.0; - 0.03293177886245518 1.0 0.0; - 0.0 0.0 1.0 - ] + @testset "cosine similarity `cos_similarity()`" begin + crps = Corpus(StringDocument.([ + "to be or not to be", + "to sing or not to sing", + "to talk or to silence"])) + update_lexicon!(crps) + d = dtm(crps) + tfm = tf_idf(d) + cs = cos_similarity(tfm) + @test cs == [ + 1.0 0.03293177886245518 0.0; + 0.03293177886245518 1.0 0.0; + 0.0 0.0 1.0 + ] end end
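
For readers skimming the reformatted tf-idf tests above, here is a minimal usage sketch (illustrative only, not part of the patch) of the API those tests exercise. It reuses the three sentences from the cosine-similarity test; all function names and the bm_25 defaults (κ=2, β=0.75) are taken from the diffs above.

    using TextAnalysis

    # Build a corpus and its document-term matrix; terms end up in alphabetical order.
    crps = Corpus(StringDocument.([
        "to be or not to be",
        "to sing or not to sing",
        "to talk or to silence"]))
    update_lexicon!(crps)
    m = DocumentTermMatrix(crps)

    tfm    = tf(m)       # term-frequency weights (sparse)
    tfidfm = tf_idf(m)   # tf-idf weights
    bm25m  = bm_25(m)    # Okapi BM25 weights with the defaults κ=2, β=0.75

    # Pairwise cosine similarity between the tf-idf document vectors.
    cs = cos_similarity(tf_idf(dtm(crps)))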