-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCode3
106 lines (90 loc) · 3.63 KB
/
Code3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# Install required packages if they're not already installed
if (!require("Hmisc")) {
install.packages("Hmisc")
}
if (!require("writexl")) {
install.packages("writexl")
}
if (!require("haven")) {
install.packages("haven")
}
if (!require("dplyr")) {
install.packages("dplyr")
}
library(Hmisc)
library(dplyr)
library(writexl)
library(haven)
# Define the path to your Desktop folder
desktop_path <- "~/Desktop/Rick"
# Load the data for years 2015 to 2023
years <- 2015:2023
file_paths <- paste0(desktop_path, "/epi_cpsorg_1979_2024/epi_cpsorg_", years, ".dta")
# Load the data for each year
data_list <- lapply(file_paths, haven::read_dta)
df <- bind_rows(data_list)
# Ensure wage column is cleaned up and outliers handled
df <- df %>%
mutate(wage = ifelse(wage > 0, wage, NA)) %>%
filter(!is.na(wage))
# Apply the EDUC_GROUP classification based on the working EDUC variable
df <- df %>%
mutate(EDUC_GROUP = case_when(
educ %in% c(1, 2, 3) ~ "Less than Bachelor's",
educ %in% c(4, 5) ~ "Bachelor's and Higher"
))
# Convert female to character to ensure consistency
df <- df %>%
mutate(female = as.character(female))
# Filter data to include necessary columns and group by year, sex, and EDUC_GROUP
df_filtered <- df %>%
select(year, female, EDUC_GROUP, wage, orgwgt) %>%
filter(!is.na(EDUC_GROUP), !is.na(wage))
# Group and calculate weighted averages and total employees for gendered data
df_summary_gender <- df_filtered %>%
group_by(year, female, EDUC_GROUP) %>%
summarise(
avg_hourly_wage = wtd.mean(wage, weights = orgwgt, na.rm = TRUE),
total_employees = sum(orgwgt, na.rm = TRUE) / 1000
)
# Add the "All" category for EDUC_GROUP by summarizing without filtering on EDUC_GROUP, but split by education levels
df_summary_all_educ <- df_filtered %>%
group_by(year, EDUC_GROUP) %>%
summarise(
avg_hourly_wage = wtd.mean(wage, weights = orgwgt, na.rm = TRUE),
total_employees = sum(orgwgt, na.rm = TRUE) / 1000
) %>%
mutate(female = "All") # Mark these rows as "All" gender, broken down by education levels
# Add the **Gender-only** breakdown without any education filter
df_summary_gender_all_educ <- df_filtered %>%
group_by(year, female) %>%
summarise(
avg_hourly_wage = wtd.mean(wage, weights = orgwgt, na.rm = TRUE),
total_employees = sum(orgwgt, na.rm = TRUE) / 1000
) %>%
mutate(EDUC_GROUP = "All") # Mark this as "All" education levels.
# Add the true "All" category (combining all education levels and all genders)
df_summary_all <- df_filtered %>%
group_by(year) %>%
summarise(
avg_hourly_wage = wtd.mean(wage, weights = orgwgt, na.rm = TRUE),
total_employees = sum(orgwgt, na.rm = TRUE) / 1000
) %>%
mutate(female = "All", EDUC_GROUP = "All") # True 'All' category across all educ levels and all genders
# Combine gendered, All-educ-levels, true 'All', and **Gender-only** data ensuring data types match
df_combined <- bind_rows(df_summary_gender, df_summary_all_educ, df_summary_gender_all_educ, df_summary_all)
# Convert female to 'Sex' column for output clarity
df_combined <- df_combined %>%
mutate(Sex = case_when(
female == "1" ~ "Female",
female == "0" ~ "Male",
female == "All" ~ "All"
)) %>%
select(year, Sex, EDUC_GROUP, avg_hourly_wage, total_employees)
# Drop the 'female' column for neatness
df_combined <- df_combined[ , names(df_combined) != "female"]
# Save the result to an Excel file and CSV file
write_xlsx(df_combined, path = paste0(desktop_path,"/cps_avg_wages_2015_2023_v11.xlsx"))
write.csv(df_combined, file = paste0(desktop_path,"/cps_avg_wages_2015_2023_v11.csv"))
# Debug - output unique values of educ
print(unique(df$educ))