-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.jl
211 lines (179 loc) · 6.71 KB
/
utils.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
"""
prepare_data(df::DataFrame, cols::Symbol...)
Prepare DataFrame for LCA by converting categorical columns to dummy variables if needed.
# Arguments
- `df::DataFrame`: Input DataFrame
- `cols::Symbol...`: Column names to use for analysis
- `zero_based::Union{Nothing, Vector{Bool}}=nothing`: Specify which columns are 0/1 coded.
If nothing, automatically detect based on data.
# Returns
- `Matrix{Int}`: Prepared data matrix
- `Vector{Int}`: Number of categories for each variable
- `Vector{Bool}`: Whether each column was treated as zero-based
"""
function prepare_data(
df::DataFrame, cols::Symbol...;
zero_based::Union{Nothing,Vector{Bool}}=nothing
)
# Initialize zero_based vector if not provided
if isnothing(zero_based)
zero_based = Vector{Bool}(undef, length(cols))
# Detect coding scheme for each column
for (i, col) in enumerate(cols)
if eltype(df[!, col]) <: Number
zero_based[i] = minimum(df[!, col]) == 0
else
zero_based[i] = false # Categorical columns are 1-based by default
end
end
else
if length(zero_based) != length(cols)
throw(ArgumentError("Length of zero_based must match number of columns"))
end
end
data = Matrix{Int}(undef, nrow(df), length(cols))
n_categories = Int[]
for (i, col) in enumerate(cols)
if eltype(df[!, col]) <: Number
# For zero-based columns, shift values to 1-based
if zero_based[i]
data[:, i] = df[!, col] .+ 1
push!(n_categories, length(unique(df[!, col])))
else
data[:, i] = df[!, col]
push!(n_categories, length(unique(df[!, col])))
end
else
# Categorical columns are always 1-based
categories = sort(unique(df[!, col]))
data[:, i] = indexin(df[!, col], categories)
push!(n_categories, length(categories))
end
# Validate data
if any(x -> x < 1, view(data, :, i))
throw(ArgumentError("Column $(cols[i]): Invalid values after processing. " *
"Check if zero_based specification is correct."))
end
end
return data, n_categories
end
"""
Check model identifiability
"""
function check_identifiability(n_items::Int, n_classes::Int, n_categories::Vector{Int})
# Use minimum categories as worst case bound
min_cat = minimum(n_categories)
required_items = 2 * ceil(Int, log(min_cat, n_classes)) + 1
if n_items < required_items
@warn(
"Model may not be identifiable. " *
"With $n_classes classes and minimum of $min_cat categories, " *
"need ideally $required_items items (got $n_items)."
)
end
return true
end
"""
diagnostics!(model::LCAModel, data::Matrix{Int}, ll::Float64)
Calculate model fit statistics including AIC, BIC, SBIC, and entropy.
# Arguments
- `model::LCAModel`: Fitted model
- `data::Matrix{Int}`: Data matrix
- `ll::Float64`: Log-likelihood from model fitting
# Returns
- `ModelDiagnostics`: Structure containing fit statistics
"""
function diagnostics!(model::LCAModel, data::Matrix{Int}, ll::Float64)
n_obs = size(data, 1)
# Calculate number of parameters
# Class probabilities (K-1) + Item probabilities for each class and item
n_params = (model.n_classes - 1) +
sum(cats -> (model.n_classes * (cats - 1)), model.n_categories)
# Calculate AIC and BIC
aic = -2 * ll + 2 * n_params
bic = -2 * ll + log(n_obs) * n_params
# Calculate sample-size adjusted BIC
n_star = (n_obs + 2) / 24
sbic = -2 * ll + log(n_star) * n_params
# Calculate entropy
posterior = zeros(n_obs, model.n_classes)
for i in 1:n_obs
for k in 1:model.n_classes
prob = log(model.class_probs[k])
for j in 1:model.n_items
prob += log(model.item_probs[j][k, data[i, j]])
end
posterior[i, k] = exp(prob)
end
posterior[i, :] ./= sum(posterior[i, :])
end
entropy = 0.0
for i in 1:n_obs
for k in 1:model.n_classes
p = posterior[i, k]
entropy -= p * log(p + eps()) # eps() to avoid log(0)
end
end
entropy = 1 - (entropy / (n_obs * log(model.n_classes)))
return ModelDiagnostics(ll, aic, bic, sbic, entropy)
end
"""
show_profiles(model::LCAModel, data::DataFrame, cols::Vector{Symbol};
var_names::Union{Nothing, Vector{String}}=nothing,
var_labels::Union{Nothing, Vector{Vector{String}}}=nothing,
digits::Int=3)
Display the latent class profiles with aligned columns.
"""
function show_profiles(model::LCAModel, data::DataFrame, cols::Vector{Symbol};
var_names::Union{Nothing,Vector{String}}=nothing,
var_labels::Union{Nothing,Vector{Vector{String}}}=nothing,
digits::Int=3)
# Use DataFrame column names if var_names not provided
display_names = isnothing(var_names) ? String[string(col) for col in cols] : var_names
# Extract or use provided category labels
if isnothing(var_labels)
var_labels = Vector{Vector{String}}()
for (i, col) in enumerate(cols)
if eltype(data[!, col]) <: CategoricalValue
push!(var_labels, string.(levels(data[!, col])))
else
unique_vals = sort(unique(data[!, col]))
push!(var_labels, string.(unique_vals))
end
end
end
# Print header
println("\nLatent Class Profiles")
println("="^80)
# Print class sizes with better alignment
println("Class Sizes:")
for k in 1:model.n_classes
pct = model.class_probs[k] * 100
println(" Class $k: $(rpad(@sprintf("%.1f", pct), 6))%")
end
println("-"^80)
# Calculate maximum label length for alignment
max_label_length = maximum(maximum(length.(labels)) for labels in var_labels)
# Print item probabilities for each class
for (i, var) in enumerate(display_names)
println("\n$var:")
# Print header row with class numbers
print(" "^(max_label_length + 2)) # Space for labels
for k in 1:model.n_classes
print("Class $k" * " "^7)
end
println()
# Print probabilities for each category
for (j, label) in enumerate(var_labels[i])
# Print label with padding
print(rpad("$label:", max_label_length + 2))
# Print probabilities
for k in 1:model.n_classes
pct = model.item_probs[i][k, j] * 100
print(rpad(@sprintf("%.3f%%", pct), 12))
end
println()
end
end
println("\n" * "-"^80)
end