-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path02_sequence.R
97 lines (80 loc) · 2.53 KB
/
02_sequence.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
## ------------------------------------------------------------------------
##
## Script name: 02_sequence.R
## Purpose: Sequence analysis
## Author: Yanwen Wang
## Date Created: 2024-11-20
## Email: [email protected]
##
## ------------------------------------------------------------------------
##
## Notes:
##
## ------------------------------------------------------------------------
# 1 Prepare data ----------------------------------------------------------

# Keep respondents without children, then recode the state "married" to
# "remarried" in the sequence columns (column positions 2 to 302).
seq_childless_1540 <- seq_marital_1540 %>%
  filter(n_children == 0) %>%
  mutate(
    across(
      2:302,
      function(state) replace(state, state == "married", "remarried")
    )
  )

# State labels shown in legends.
# NOTE(review): seqdef() pairs labels with the alphabet by position, and no
# explicit alphabet is supplied below -- confirm this order matches the
# alphabet derived from the data.
lab <- c(
  "cohabit", "first marriage", "remarried", "never_married", "unpartnered"
)

# Build the state sequence object from the same sequence columns.
seq_childless <- seqdef(seq_childless_1540[, 2:302], labels = lab)

# One colour per state, in the same order as the labels.
cpal(seq_childless) <- c(
  "#7eb0d5", "#bd7ebe", "#ffb55a", "#b2e061", "#fd7f6f"
)

# Per-respondent longitudinal indicators, stored back on the data frame:
# entropy (seqient), turbulence (seqST) and complexity index (seqici).
seq_childless_1540$entropy <-
  as.data.frame(seqient(seq_childless))$Entropy
seq_childless_1540$turbulence <-
  as.data.frame(seqST(seq_childless))$Turbulence
seq_childless_1540$complexity <-
  as.data.frame(seqici(seq_childless))$C
# 2 Identify clusters ------------------------------------------------------

# Pairwise dissimilarities between sequences: seqdist() with the "OMspell"
# method and constant substitution costs; with.missing = TRUE is passed so
# sequences containing missing states are accepted.
omdist <- seqdist(
  seqdata = seq_childless,
  method = "OMspell",
  sm = "CONSTANT",
  with.missing = TRUE
)

# Agglomerative hierarchical clustering on the distance matrix (Ward linkage,
# hclust method "ward.D").
hc_ward <- hclust(d = as.dist(omdist), method = "ward.D")

# Cluster memberships and quality measures for the candidate solutions
# (ncluster = 10).
clust <- as.clustrange(object = hc_ward, diss = omdist, ncluster = 10)

# The optimal number of clusters is 5.
# 3 Plot sequence profiles -------------------------------------------------

# Names for the five clusters, listed in cluster-number order (1 to 5).
# NOTE(review): the number-to-name mapping presumably comes from inspecting
# the cluster profiles -- re-check it whenever the data or clustering changes.
clust_labels <- c(
  "never married",
  "married late",
  "married early",
  "married ontime",
  "unpartnered"
)

# Store the five-cluster membership on the respondent data as a labelled
# factor; levels are derived from the label vector so the two stay in sync.
seq_childless_1540$cluster5 <- factor(
  clust$clustering$cluster5,
  levels = seq_along(clust_labels),
  labels = clust_labels
)

# Plot individual sequences and distribution of status by cluster.
# The grouping vector must come from the data frame: cluster5 was added to
# seq_childless_1540, not to the sequence object seq_childless (whose columns
# are the sequence positions only). The rows line up because seq_childless
# was built from seq_childless_1540.
plot_individual <- ggseqiplot(
  seq_childless,
  group = seq_childless_1540$cluster5
) +
  labs(title = "Individual Sequences by Cluster") +
  theme(legend.position = "none")

plot_distribution <- ggseqdplot(
  seq_childless,
  group = seq_childless_1540$cluster5
) +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "Distribution of Status by Cluster")

# Combine plots: individual sequences stacked above the state distributions.
plot_individual / plot_distribution