Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix/style improvement #282

Merged
merged 2 commits into from
Feb 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions docs/make.jl
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
using Documenter, TextAnalysis

makedocs(
modules = [TextAnalysis],
sitename = "TextAnalysis",
format = Documenter.HTML(
modules=[TextAnalysis],
sitename="TextAnalysis",
format=Documenter.HTML(
),
pages = [
pages=[
"Home" => "index.md",
"Documents" => "documents.md",
"Corpus" => "corpus.md",
Expand Down
2 changes: 1 addition & 1 deletion src/LM/api.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ It is used to evaluate score with masks out of vocabulary words
The arguments are the same as for [`score`](@ref)
"""
function maskedscore(m::Langmodel, temp_lm::DefaultDict, word, context)::Float64
    # Pass word and context through the vocabulary lookup so that
    # out-of-vocabulary tokens are masked before scoring.
    masked_word = lookup(m.vocab, [word])[begin]
    masked_context = lookup(m.vocab, [context])[begin]
    return score(m, temp_lm, masked_word, masked_context)
end

"""
Expand Down
104 changes: 51 additions & 53 deletions src/LM/langmodel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ abstract type Langmodel end
abstract type gammamodel <: Langmodel end # base ngram model with add-gamma (add-one) smoothing
abstract type InterpolatedLanguageModel <: Langmodel end # interpolated (backoff) language model with smoothing

#DataType MLE
#Type for providing MLE ngram model scores.
#Implementation of Base Ngram Model.
# DataType MLE
# Type for providing MLE ngram model scores.
# Implementation of Base Ngram Model.

struct MLE <: Langmodel
vocab::Vocabulary
Expand All @@ -18,13 +18,13 @@ Initiate Type for providing MLE ngram model scores.
Implementation of Base Ngram Model.

"""
function MLE(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
    # Build the vocabulary first, then wrap it in the MLE model type.
    vocabulary = Vocabulary(word, unk_cutoff, unk_label)
    return MLE(vocabulary)
end

function (lm::MLE)(text::Vector{T}, min::Integer, max::Integer) where {T<:AbstractString}
    # Map tokens through the vocabulary (masking OOV entries), then
    # count ngrams of orders min through max.
    tokens = convert(Array{String}, lookup(lm.vocab, text))
    return counter2(tokens, min, max)
end

Expand All @@ -41,18 +41,19 @@ Function to initiate Type(Lidstone) for providing Lidstone-smoothed scores.
In addition to initialization arguments from BaseNgramModel also requires
a number by which to increase the counts, gamma.
"""
function Lidstone(word::Vector{T}, gamma=1.0, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
    # The smoothing increment `gamma` is stored alongside the vocabulary.
    vocabulary = Vocabulary(word, unk_cutoff, unk_label)
    return Lidstone(vocabulary, gamma)
end

function (lm::Lidstone)(text::Vector{T}, min::Integer, max::Integer) where {T<:AbstractString}
    # Vocabulary-mask the tokens, then count ngrams of orders min..max.
    tokens = convert(Array{String}, lookup(lm.vocab, text))
    return counter2(tokens, min, max)
end

"""
Laplace(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T <: AbstractString}

Function to initiate Type(Laplace) for providing Laplace-smoothed scores.

In addition to initialization arguments from BaseNgramModel also requires
Expand All @@ -63,11 +64,11 @@ struct Laplace <: gammamodel
gamma::Float64
end

function Laplace(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
    # Laplace smoothing is Lidstone smoothing with gamma fixed at 1.0.
    vocabulary = Vocabulary(word, unk_cutoff, unk_label)
    return Laplace(vocabulary, 1.0)
end

function (lm::Laplace)(text, min::Integer, max::Integer)
function (lm::Laplace)(text, min::Integer, max::Integer)
text = lookup(lm.vocab, text)
text = convert(Array{String}, text)
return counter2(text, min, max)
Expand All @@ -84,35 +85,32 @@ Add-one smoothing to Lidstone or Laplace(gammamodel) models
function score(m::gammamodel, temp_lm::DefaultDict, word, context)
    # Smoothed probability of `word` given `context` for add-gamma models
    # (Lidstone/Laplace).
    accum = temp_lm[context]
    # Denominator: observed counts in this context plus gamma mass for
    # every vocabulary entry.
    s = float(sum(accum) + m.gamma * length(m.vocab.vocab))
    # The numerator must use the WORD'S COUNT. The findfirst-based refactor
    # (`idx = something(findfirst(isequal(word), accum), 0)`) returned the
    # match key/index — and findfirst tests accumulator *values* (counts)
    # against a token, so it effectively always yielded 0 — which broke the
    # smoothed estimate. Restore the count-based lookup.
    for (text, count) in accum
        if text == word
            return (float(count) + m.gamma) / s
        end
    end
    # Word unseen in this context: only the smoothing mass remains.
    return float(m.gamma) / s
end

"""
$(TYPEDSIGNATURES)

To get probability of word given that context

In other words, for given context calculate frequency distribution of word

"""
function prob(m::Langmodel, templ_lm::DefaultDict, word, context=nothing)::Float64
    # Without a context, spread probability uniformly over the stored ngrams.
    if isnothing(context) || isempty(context)
        return 1.0 / length(templ_lm)
    end
    accum = templ_lm[context]
    total = float(sum(accum))
    for (token, freq) in accum
        token == word && return float(freq) / total
    end
    # Known context but unseen word → 0.0; context absent from the
    # vocabulary → Inf (preserved from the original contract).
    return context in keys(m.vocab.vocab) ? 0.0 : Inf
end

"""
function score(m::MLE, temp_lm::DefaultDict, word, context=nothing)
    # MLE scoring is simply the conditional relative frequency.
    return prob(m, temp_lm, word, context)
end

# Interpolated Witten-Bell smoothing model; holds only the vocabulary.
struct WittenBellInterpolated <: InterpolatedLanguageModel
    vocab::Vocabulary
end

"""
Expand All @@ -137,41 +135,41 @@ Initiate Type for providing Interpolated version of Witten-Bell smoothing.
The idea to abstract this comes from Chen & Goodman 1995.

"""
function WittenBellInterpolated(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
    # Construct the vocabulary, then wrap it in the interpolated model.
    vocabulary = Vocabulary(word, unk_cutoff, unk_label)
    return WittenBellInterpolated(vocabulary)
end

function (lm::WittenBellInterpolated)(text::Vector{T}, min::Integer, max::Integer) where {T<:AbstractString}
    # Vocabulary-mask the tokens, then count ngrams of orders min..max.
    tokens = convert(Array{String}, lookup(lm.vocab, text))
    return counter2(tokens, min, max)
end
# Backoff weights (alpha, gamma) for the Witten-Bell interpolated model.
# NOTE(review): the original comment said "for KneserNeyInterpolated",
# but this method dispatches on WittenBellInterpolated.
function alpha_gammma(m::WittenBellInterpolated, templ_lm::DefaultDict, word, context)
    accum = templ_lm[context]
    total = float(sum(accum))
    local alpha
    for (token, freq) in accum
        if token == word
            alpha = float(freq) / total
            break
        end
        # Fallback mass while the word has not been matched yet
        # (overwritten each non-matching iteration, as in the original).
        alpha = 1 / total
    end
    gam = gamma(accum)
    return alpha * (1 - gam), gam
end

# Number of distinct entries recorded in the accumulator.
count_non_zero_vals(accum::Accumulator{}) = length(accum)

# Witten-Bell gamma: fraction of probability mass reserved for
# unseen continuations (distinct types / (types + total tokens)).
function gamma(accum)
    distinct = count_non_zero_vals(accum)
    return distinct / (distinct + float(sum(accum)))
end

"""
function score(m::InterpolatedLanguageModel, temp_lm::DefaultDict, word, context=nothing)
    # No context left: fall back to the plain distribution.
    (isnothing(context) || isempty(context)) && return prob(m, temp_lm, word)
    if context in keys(temp_lm)
        # Interpolate the direct estimate with the backed-off score.
        a, g = alpha_gammma(m, temp_lm, word, context)
        return a + g * score(m, temp_lm, word, context_reduce(context))
    end
    # Unknown context: recurse on the shortened context only.
    return score(m, temp_lm, word, context_reduce(context))
end

# Drop the leading token; the remainder is the shortened backoff context.
function context_reduce(context)
    tokens = split(context)
    return join(tokens[2:end], " ")
end


# Kneser-Ney interpolated model: a vocabulary plus an absolute-discount value.
struct KneserNeyInterpolated <: InterpolatedLanguageModel
    vocab::Vocabulary
    discount::Float64
end
Expand All @@ -213,29 +211,29 @@ Initiate Type for providing KneserNey Interpolated language model.
The idea to abstract this comes from Chen & Goodman 1995.

"""
function KneserNeyInterpolated(word::Vector{T}, disc=0.1, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
    # Build the vocabulary and store it with the discount parameter.
    vocabulary = Vocabulary(word, unk_cutoff, unk_label)
    return KneserNeyInterpolated(vocabulary, disc)
end

function (lm::KneserNeyInterpolated)(text::Vector{T}, min::Integer, max::Integer) where {T<:AbstractString}
    # Vocabulary-mask the tokens, then count ngrams of orders min..max.
    tokens = convert(Array{String}, lookup(lm.vocab, text))
    return counter2(tokens, min, max)
end
# Backoff weights (alpha, gamma) for the Kneser-Ney interpolated model.
function alpha_gammma(m::KneserNeyInterpolated, templ_lm::DefaultDict, word, context)
    accum = templ_lm[context]
    total = float(sum(accum))
    local alpha
    for (token, freq) in accum
        if token == word
            # Absolute discounting, floored at zero.
            alpha = max(float(freq) - m.discount, 0.0) / total
            break
        end
        # Fallback mass while the word has not been matched yet
        # (overwritten each non-matching iteration, as in the original).
        alpha = 1 / length(m.vocab.vocab)
    end
    gam = m.discount * count_non_zero_vals(accum) / total
    return alpha, gam
end
Loading
Loading