Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix/style improvement #282

Merged
merged 2 commits into from
Feb 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions docs/make.jl
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
using Documenter, TextAnalysis

makedocs(
modules = [TextAnalysis],
sitename = "TextAnalysis",
format = Documenter.HTML(
modules=[TextAnalysis],
sitename="TextAnalysis",
format=Documenter.HTML(
),
pages = [
pages=[
"Home" => "index.md",
"Documents" => "documents.md",
"Corpus" => "corpus.md",
Expand Down
2 changes: 1 addition & 1 deletion src/LM/api.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ It is used to evaluate score with masks out of vocabulary words
The arguments are the same as for [`score`](@ref)
"""
function maskedscore(m::Langmodel, temp_lm::DefaultDict, word, context)::Float64
    # Pass word and context through the vocabulary lookup so that
    # out-of-vocabulary tokens are masked before scoring.
    masked_word = lookup(m.vocab, [word])[begin]
    masked_context = lookup(m.vocab, [context])[begin]
    return score(m, temp_lm, masked_word, masked_context)
end

"""
Expand Down
104 changes: 51 additions & 53 deletions src/LM/langmodel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ abstract type Langmodel end
abstract type gammamodel <: Langmodel end # base ngram model with add-gamma (add-one) smoothing
abstract type InterpolatedLanguageModel <: Langmodel end # interpolated (backoff) language model with smoothing

#DataType MLE
#Type for providing MLE ngram model scores.
#Implementation of Base Ngram Model.
# DataType MLE
# Type for providing MLE ngram model scores.
# Implementation of Base Ngram Model.

struct MLE <: Langmodel
vocab::Vocabulary
Expand All @@ -18,13 +18,13 @@ Initiate Type for providing MLE ngram model scores.
Implementation of Base Ngram Model.

"""
function MLE(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
    # Build the vocabulary first, then wrap it in the MLE model type.
    vocabulary = Vocabulary(word, unk_cutoff, unk_label)
    return MLE(vocabulary)
end

function (lm::MLE)(text::Vector{T}, min::Integer, max::Integer) where {T<:AbstractString}
    # Map tokens through the vocabulary (masking OOV entries), then
    # count ngrams of orders min through max.
    tokens = convert(Array{String}, lookup(lm.vocab, text))
    return counter2(tokens, min, max)
end

Expand All @@ -41,18 +41,19 @@ Function to initiate Type(Lidstone) for providing Lidstone-smoothed scores.
In addition to initialization arguments from BaseNgramModel also requires
a number by which to increase the counts, gamma.
"""
function Lidstone(word::Vector{T}, gamma=1.0, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
    # The smoothing increment `gamma` is stored alongside the vocabulary.
    vocabulary = Vocabulary(word, unk_cutoff, unk_label)
    return Lidstone(vocabulary, gamma)
end

function (lm::Lidstone)(text::Vector{T}, min::Integer, max::Integer) where {T<:AbstractString}
    # Vocabulary-mask the tokens, then count ngrams of orders min..max.
    tokens = convert(Array{String}, lookup(lm.vocab, text))
    return counter2(tokens, min, max)
end

"""
Laplace(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T <: AbstractString}

Function to initiate Type(Laplace) for providing Laplace-smoothed scores.

In addition to initialization arguments from BaseNgramModel also requires
Expand All @@ -63,11 +64,11 @@ struct Laplace <: gammamodel
gamma::Float64
end

function Laplace(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
    # Laplace smoothing is Lidstone smoothing with gamma fixed at 1.0.
    vocabulary = Vocabulary(word, unk_cutoff, unk_label)
    return Laplace(vocabulary, 1.0)
end

function (lm::Laplace)(text, min::Integer, max::Integer)
function (lm::Laplace)(text, min::Integer, max::Integer)
text = lookup(lm.vocab, text)
text = convert(Array{String}, text)
return counter2(text, min, max)
Expand All @@ -84,35 +85,32 @@ Add-one smoothing to Lidstone or Laplace(gammamodel) models
function score(m::gammamodel, temp_lm::DefaultDict, word, context)
    # Smoothed probability of `word` given `context` for add-gamma models
    # (Lidstone/Laplace).
    accum = temp_lm[context]
    # Denominator: observed counts in this context plus gamma mass for
    # every vocabulary entry.
    s = float(sum(accum) + m.gamma * length(m.vocab.vocab))
    # The numerator must use the WORD'S COUNT. The findfirst-based refactor
    # (`idx = something(findfirst(isequal(word), accum), 0)`) returned the
    # match key/index — and findfirst tests accumulator *values* (counts)
    # against a token, so it effectively always yielded 0 — which broke the
    # smoothed estimate. Restore the count-based lookup.
    for (text, count) in accum
        if text == word
            return (float(count) + m.gamma) / s
        end
    end
    # Word unseen in this context: only the smoothing mass remains.
    return float(m.gamma) / s
end

"""
$(TYPEDSIGNATURES)

To get probability of word given that context

In other words, for given context calculate frequency distribution of word

"""
function prob(m::Langmodel, templ_lm::DefaultDict, word, context=nothing)::Float64
    # Without a context, spread probability uniformly over the stored ngrams.
    if isnothing(context) || isempty(context)
        return 1.0 / length(templ_lm)
    end
    accum = templ_lm[context]
    total = float(sum(accum))
    for (token, freq) in accum
        token == word && return float(freq) / total
    end
    # Known context but unseen word → 0.0; context absent from the
    # vocabulary → Inf (preserved from the original contract).
    return context in keys(m.vocab.vocab) ? 0.0 : Inf
end

"""
function score(m::MLE, temp_lm::DefaultDict, word, context=nothing)
    # MLE scoring is simply the conditional relative frequency.
    return prob(m, temp_lm, word, context)
end

# Interpolated Witten-Bell smoothing model; holds only the vocabulary.
struct WittenBellInterpolated <: InterpolatedLanguageModel
    vocab::Vocabulary
end

"""
Expand All @@ -137,41 +135,41 @@ Initiate Type for providing Interpolated version of Witten-Bell smoothing.
The idea to abstract this comes from Chen & Goodman 1995.

"""
function WittenBellInterpolated(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
    # Construct the vocabulary, then wrap it in the interpolated model.
    vocabulary = Vocabulary(word, unk_cutoff, unk_label)
    return WittenBellInterpolated(vocabulary)
end

function (lm::WittenBellInterpolated)(text::Vector{T}, min::Integer, max::Integer) where {T<:AbstractString}
    # Vocabulary-mask the tokens, then count ngrams of orders min..max.
    tokens = convert(Array{String}, lookup(lm.vocab, text))
    return counter2(tokens, min, max)
end
# Backoff weights (alpha, gamma) for the Witten-Bell interpolated model.
# NOTE(review): the original comment said "for KneserNeyInterpolated",
# but this method dispatches on WittenBellInterpolated.
function alpha_gammma(m::WittenBellInterpolated, templ_lm::DefaultDict, word, context)
    accum = templ_lm[context]
    total = float(sum(accum))
    local alpha
    for (token, freq) in accum
        if token == word
            alpha = float(freq) / total
            break
        end
        # Fallback mass while the word has not been matched yet
        # (overwritten each non-matching iteration, as in the original).
        alpha = 1 / total
    end
    gam = gamma(accum)
    return alpha * (1 - gam), gam
end

# Number of distinct entries recorded in the accumulator.
count_non_zero_vals(accum::Accumulator{}) = length(accum)

# Witten-Bell gamma: fraction of probability mass reserved for
# unseen continuations (distinct types / (types + total tokens)).
function gamma(accum)
    distinct = count_non_zero_vals(accum)
    return distinct / (distinct + float(sum(accum)))
end

"""
function score(m::InterpolatedLanguageModel, temp_lm::DefaultDict, word, context=nothing)
    # No context left: fall back to the plain distribution.
    (isnothing(context) || isempty(context)) && return prob(m, temp_lm, word)
    if context in keys(temp_lm)
        # Interpolate the direct estimate with the backed-off score.
        a, g = alpha_gammma(m, temp_lm, word, context)
        return a + g * score(m, temp_lm, word, context_reduce(context))
    end
    # Unknown context: recurse on the shortened context only.
    return score(m, temp_lm, word, context_reduce(context))
end

# Drop the leading token; the remainder is the shortened backoff context.
function context_reduce(context)
    tokens = split(context)
    return join(tokens[2:end], " ")
end


# Kneser-Ney interpolated model: a vocabulary plus an absolute-discount value.
struct KneserNeyInterpolated <: InterpolatedLanguageModel
    vocab::Vocabulary
    discount::Float64
end
Expand All @@ -213,29 +211,29 @@ Initiate Type for providing KneserNey Interpolated language model.
The idea to abstract this comes from Chen & Goodman 1995.

"""
function KneserNeyInterpolated(word::Vector{T}, disc=0.1, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
    # Build the vocabulary and store it with the discount parameter.
    vocabulary = Vocabulary(word, unk_cutoff, unk_label)
    return KneserNeyInterpolated(vocabulary, disc)
end

function (lm::KneserNeyInterpolated)(text::Vector{T}, min::Integer, max::Integer) where {T<:AbstractString}
    # Vocabulary-mask the tokens, then count ngrams of orders min..max.
    tokens = convert(Array{String}, lookup(lm.vocab, text))
    return counter2(tokens, min, max)
end
# Backoff weights (alpha, gamma) for the Kneser-Ney interpolated model.
function alpha_gammma(m::KneserNeyInterpolated, templ_lm::DefaultDict, word, context)
    accum = templ_lm[context]
    total = float(sum(accum))
    local alpha
    for (token, freq) in accum
        if token == word
            # Absolute discounting, floored at zero.
            alpha = max(float(freq) - m.discount, 0.0) / total
            break
        end
        # Fallback mass while the word has not been matched yet
        # (overwritten each non-matching iteration, as in the original).
        alpha = 1 / length(m.vocab.vocab)
    end
    gam = m.discount * count_non_zero_vals(accum) / total
    return alpha, gam
end
Loading
Loading