Skip to content

Commit 488bcab

Browse files
committed
fixed ai review comments
1 parent 9acd46a commit 488bcab

7 files changed

Lines changed: 128 additions & 174 deletions

File tree

statvar_imports/us_urban_school/ap_ib_gt_enrollment/advanced_placements/run_process.sh

Lines changed: 25 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -7,83 +7,47 @@ set -e
77
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
88
cd "$SCRIPT_DIR"
99

10-
# Default to not downloading
11-
DOWNLOAD=false
12-
13-
# Parse command line arguments
14-
while [[ "$#" -gt 0 ]]; do
15-
case $1 in
16-
--download) DOWNLOAD=true ;;
17-
*) echo "Unknown parameter passed: $1"; exit 1 ;;
18-
esac
19-
shift
20-
done
10+
# Always download data
11+
echo "--- Starting download of AP data ---"
12+
python3 ../download_ap_ib_gt.py --ap
13+
python3 ../download_2015_16.py --ap
14+
echo "--- Download complete ---"
2115

2216
# Function to process each downloaded data file.
2317
process_files() {
2418
# Create the output directory if it doesn't exist.
2519
mkdir -p output_files
2620

27-
declare -A processed_years
28-
29-
# Loop through all AP Enrollment files in the input directory to identify unique years
21+
# Loop through all AP Enrollment files in the input directory.
3022
for input_file in input_files/*_AP_Enrollment.*; do
3123
# Check if any file exists to avoid errors when no files are found.
3224
[ -e "$input_file" ] || continue
3325

26+
echo "Processing file: $input_file"
27+
28+
# Extract the year from the filename (e.g., "2014" from "2014_AP_Enrollment.xlsx").
3429
filename=$(basename "$input_file")
3530
year=$(echo "$filename" | cut -d'_' -f1)
36-
extension="${filename##*.}"
37-
38-
# Determine expected extension based on year
39-
expected_extension=""
40-
if [[ "$year" == "2010" || "$year" == "2012" || "$year" == "2014" ]]; then
41-
expected_extension="xlsx"
42-
else
43-
expected_extension="csv"
44-
fi
45-
46-
# Skip if the extension does not match the expected one
47-
if [[ "$extension" != "$expected_extension" ]]; then
48-
echo "Skipping $input_file: Expected .$expected_extension, but found .$extension."
49-
continue
50-
fi
5131

52-
# If this year hasn't been processed yet, process it
53-
if [[ -z ${processed_years[$year]} ]]; then
54-
processed_years[$year]=true
55-
echo "Processing year: $year"
56-
57-
# Define the glob pattern for input files for this year
58-
input_data_glob="input_files/${year}_AP_Enrollment_*.${expected_extension}"
59-
60-
# Define the output path based on the year.
61-
output_path="output_files/output_${year}_ap"
62-
63-
# Construct the command for the current year
64-
CMD_ARRAY=(python3 ../../../tools/statvar_importer/stat_var_processor.py \
65-
--input_data="${input_data_glob}" \
66-
--pv_map=../config/ap_enrollment_pvmap.csv \
67-
--config_file=../config/common_metadata.csv \
68-
--output_path="${output_path}" \
69-
--existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf)
70-
# Print and execute the command.
71-
echo "Executing command for year ${year}:"
72-
printf "%q " "${CMD_ARRAY[@]}"; echo
73-
"${CMD_ARRAY[@]}"
74-
echo "--- Finished processing for year ${year} ---"
75-
fi
32+
# Define the output path based on the year.
33+
output_path="output_files/output_${year}_ap"
34+
35+
# Construct the stat_var_processor command for the current input file.
36+
CMD="python3 ../../../../../data/tools/statvar_importer/stat_var_processor.py"
37+
CMD+=" --input_data=\"${input_file}\""
38+
CMD+=" --pv_map=../config/ap_enrollment_pvmap.csv"
39+
CMD+=" --config_file=../config/common_metadata.csv"
40+
CMD+=" --output_path=\"${output_path}\""
41+
CMD+=" --existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf"
42+
43+
# Print and execute the command.
44+
echo "Executing command for year ${year}:"
45+
echo "$CMD"
46+
eval "$CMD"
47+
echo "--- Finished processing for year ${year} ---"
7648
done
7749
}
7850

79-
if [ "$DOWNLOAD" = true ]; then
80-
echo "--- Starting download of AP data ---"
81-
python3 ../download_ap_ib_gt.py --ap
82-
python3 ../download_2015_16.py --ap
83-
echo "--- Download complete ---"
84-
fi
85-
8651
echo "--- Starting processing of files ---"
8752
process_files
8853
echo "--- All processing complete ---"
89-

statvar_imports/us_urban_school/ap_ib_gt_enrollment/config/ap_enrollment_pvmap.csv

Lines changed: 42 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ key,,,,,,,,,,
22
YEAR,observationDate,{Number},populationType,Student,assessmentType,AdvancedPlacement,enrollmentStatus,EnrolledInEducationOrTraining,,
33
ncesid,observationAbout,ncesId/{Data},,,,,,,,
44
SCH_APENR_IND,,,,,,,,,,
5-
SCH_APCOURSES,value,{Number},#Filter,Allow=int(value) >= 0,populationType,Course,enrollmentStatus,"""""",juvenileJusticeFacilityStatus,""""""
5+
SCH_APCOURSES,,,,,,,,,,
66
SCH_APSEL,,,,,,,,,,
77
SCH_APENR_HI_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,HispanicOrLatino,,
88
SCH_APENR_HI_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,HispanicOrLatino,,
@@ -67,26 +67,26 @@ SCH_APSCIENR_LEP_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,la
6767
SCH_APSCIENR_IDEA_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,disabilityStatus,WithDisability,schoolSubject,Science
6868
SCH_APSCIENR_IDEA_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,disabilityStatus,WithDisability,schoolSubject,Science
6969
SCH_APCOMPENR_IND,,,,,,,,,,
70-
SCH_APCOMPENR_HI_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,HispanicOrLatino,schoolSubject,ComputerScience
71-
SCH_APCOMPENR_HI_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,HispanicOrLatino,schoolSubject,ComputerScience
72-
SCH_APCOMPENR_AM_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,AmericanIndianOrAlaskaNative,schoolSubject,ComputerScience
73-
SCH_APCOMPENR_AM_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,AmericanIndianOrAlaskaNative,schoolSubject,ComputerScience
74-
SCH_APCOMPENR_AS_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,Asian,schoolSubject,ComputerScience
75-
SCH_APCOMPENR_AS_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,Asian,schoolSubject,ComputerScience
76-
SCH_APCOMPENR_HP_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,NativeHawaiianOrOtherPacificIslanderAlone,schoolSubject,ComputerScience
77-
SCH_APCOMPENR_HP_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,NativeHawaiianOrOtherPacificIslanderAlone,schoolSubject,ComputerScience
78-
SCH_APCOMPENR_BL_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,BlackOrAfricanAmericanAlone,schoolSubject,ComputerScience
79-
SCH_APCOMPENR_BL_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,BlackOrAfricanAmericanAlone,schoolSubject,ComputerScience
80-
SCH_APCOMPENR_WH_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,White,schoolSubject,ComputerScience
81-
SCH_APCOMPENR_WH_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,White,schoolSubject,ComputerScience
82-
SCH_APCOMPENR_TR_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,TwoOrMoreRaces,schoolSubject,ComputerScience
83-
SCH_APCOMPENR_TR_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,TwoOrMoreRaces,schoolSubject,ComputerScience
84-
TOT_APCOMPENR_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,,,schoolSubject,ComputerScience
85-
TOT_APCOMPENR_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,,,schoolSubject,ComputerScience
86-
SCH_APCOMPENR_LEP_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,languageFluencyLevel,LimitedEnglishProficient,schoolSubject,ComputerScience
87-
SCH_APCOMPENR_LEP_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,languageFluencyLevel,LimitedEnglishProficient,schoolSubject,ComputerScience
88-
SCH_APCOMPENR_IDEA_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,disabilityStatus,WithDisability,schoolSubject,ComputerScience
89-
SCH_APCOMPENR_IDEA_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,disabilityStatus,WithDisability,schoolSubject,ComputerScience
70+
SCH_APCOMPENR_HI_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,HispanicOrLatino,schoolSubject,Computer
71+
SCH_APCOMPENR_HI_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,HispanicOrLatino,schoolSubject,Computer
72+
SCH_APCOMPENR_AM_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,AmericanIndianOrAlaskaNative,schoolSubject,Computer
73+
SCH_APCOMPENR_AM_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,AmericanIndianOrAlaskaNative,schoolSubject,Computer
74+
SCH_APCOMPENR_AS_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,Asian,schoolSubject,Computer
75+
SCH_APCOMPENR_AS_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,Asian,schoolSubject,Computer
76+
SCH_APCOMPENR_HP_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,NativeHawaiianOrOtherPacificIslanderAlone,schoolSubject,Computer
77+
SCH_APCOMPENR_HP_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,NativeHawaiianOrOtherPacificIslanderAlone,schoolSubject,Computer
78+
SCH_APCOMPENR_BL_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,BlackOrAfricanAmericanAlone,schoolSubject,Computer
79+
SCH_APCOMPENR_BL_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,BlackOrAfricanAmericanAlone,schoolSubject,Computer
80+
SCH_APCOMPENR_WH_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,White,schoolSubject,Computer
81+
SCH_APCOMPENR_WH_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,White,schoolSubject,Computer
82+
SCH_APCOMPENR_TR_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,TwoOrMoreRaces,schoolSubject,Computer
83+
SCH_APCOMPENR_TR_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,TwoOrMoreRaces,schoolSubject,Computer
84+
TOT_APCOMPENR_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,,,schoolSubject,Computer
85+
TOT_APCOMPENR_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,,,schoolSubject,Computer
86+
SCH_APCOMPENR_LEP_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,languageFluencyLevel,LimitedEnglishProficient,schoolSubject,Computer
87+
SCH_APCOMPENR_LEP_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,languageFluencyLevel,LimitedEnglishProficient,schoolSubject,Computer
88+
SCH_APCOMPENR_IDEA_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,disabilityStatus,WithDisability,schoolSubject,Computer
89+
SCH_APCOMPENR_IDEA_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,disabilityStatus,WithDisability,schoolSubject,Computer
9090
,,,,,,,,,,
9191
M_AME_7_MATH_AP,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,AmericanIndianOrAlaskaNative,schoolSubject,Mathematics
9292
M_ASI_7_MATH_AP,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,Asian,schoolSubject,Mathematics
@@ -129,3 +129,24 @@ F_2_OR_MORE_7_SCIENCE_AP,value,{Number},#Filter,Allow=int(value) >= 0,gender,Fem
129129
F_TOT_7_SCIENCE_AP,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,,,schoolSubject,Science
130130
F_DIS_7_SCIENCE_AP,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,disabilityStatus,WithDisability,schoolSubject,Science
131131
F_LEP_7_SCIENCE_AP,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,languageFluencyLevel,LimitedEnglishProficient,schoolSubject,Science
132+
,,,,,,,,,,
133+
M_AME_7_MATH_ENR,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,AmericanIndianOrAlaskaNative,schoolSubject,Mathematics
134+
M_ASI_7_MATH_ENR,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,Asian,schoolSubject,Mathematics
135+
M_HIS_7_MATH_ENR,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,HispanicOrLatino,schoolSubject,Mathematics
136+
M_BLA_7_MATH_ENR,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,BlackOrAfricanAmericanAlone,schoolSubject,Mathematics
137+
M_WHI_7_MATH_ENR,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,White,schoolSubject,Mathematics
138+
M_HI_PAC_7_MATH_ENR,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,NativeHawaiianOrOtherPacificIslanderAlone,schoolSubject,Mathematics
139+
M_2_OR_MORE_7_MATH_ENR,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,TwoOrMoreRaces,schoolSubject,Mathematics
140+
M_TOT_7_MATH_ENR,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,,,schoolSubject,Mathematics
141+
M_DIS_7_MATH_ENR,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,disabilityStatus,WithDisability,schoolSubject,Mathematics
142+
M_LEP_7_MATH_ENR,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,languageFluencyLevel,LimitedEnglishProficient,schoolSubject,Mathematics
143+
F_AME_7_MATH_ENR,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,AmericanIndianOrAlaskaNative,schoolSubject,Mathematics
144+
F_ASI_7_MATH_ENR,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,Asian,schoolSubject,Mathematics
145+
F_HIS_7_MATH_ENR,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,HispanicOrLatino,schoolSubject,Mathematics
146+
F_BLA_7_MATH_ENR,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,BlackOrAfricanAmericanAlone,schoolSubject,Mathematics
147+
F_WHI_7_MATH_ENR,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,White,schoolSubject,Mathematics
148+
F_HI_PAC_7_MATH_ENR,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,NativeHawaiianOrOtherPacificIslanderAlone,schoolSubject,Mathematics
149+
F_2_OR_MORE_7_MATH_ENR,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,TwoOrMoreRaces,schoolSubject,Mathematics
150+
F_TOT_7_MATH_ENR,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,,,schoolSubject,Mathematics
151+
F_DIS_7_MATH_ENR,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,disabilityStatus,WithDisability,schoolSubject,Mathematics
152+
F_LEP_7_MATH_ENR,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,languageFluencyLevel,LimitedEnglishProficient,schoolSubject,Mathematics
Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
parameter,value
2-
mapped_rows,1
3-
output_columns,"observationDate,observationAbout,variableMeasured,value"
4-
#input_rows,10
5-
mapped_columns,2
1+
parameter,value
2+
mapped_rows,1
3+
output_columns,"observationDate,observationAbout,variableMeasured,value"
4+
#input_rows,10
5+
mapped_columns,2
6+
dc_api_root,https://api.datacommons.org

statvar_imports/us_urban_school/ap_ib_gt_enrollment/download_2015_16.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,7 @@
2727

2828
FLAGS = flags.FLAGS
2929

30-
flags.DEFINE_boolean('ap', False, 'Download Advanced Placement data for 2015-16 only.')
31-
flags.DEFINE_boolean('ib', False, 'Download International Baccalaureate data for 2015-16 only.')
32-
flags.DEFINE_boolean('gt', False, 'Download Gifted and Talented data for 2015-16 only.')
30+
3331

3432

3533
def download_and_extract_2015_16(data_type, output_dir):

statvar_imports/us_urban_school/ap_ib_gt_enrollment/download_ap_ib_gt.py

Lines changed: 26 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -441,43 +441,38 @@ def download_and_extract(config_key, config_data, data_type, output_dir):
441441
with zipfile.ZipFile(zip_content) as zip_ref:
442442
if data_type == 'ap' and config_key in ["2009-10", "2011-12"]:
443443
logging.info(f"--- Applying special AP subject handling for {config_key} data ---")
444-
subjects_to_find = ["Mathematics", "Science"]
445-
found_files = 0
446444

445+
math_file_name, science_file_name = None, None
446+
447+
# Find the math and science files in the zip
447448
for name in zip_ref.namelist():
448449
normalized_name = name.lower()
450+
# For 2011-12, the files are more specific, e.g., "...Students who are taking AP Science.xlsx"
451+
if 'ap' in normalized_name and 'mathematics' in normalized_name and 'students' in normalized_name and not '$' in name:
452+
math_file_name = name
453+
elif 'ap' in normalized_name and 'science' in normalized_name and 'students' in normalized_name and not '$' in name:
454+
science_file_name = name
455+
456+
if not math_file_name or not science_file_name:
457+
logging.warning(f"Could not find both AP Mathematics and Science files for {config_key}.")
458+
return
449459

450-
is_ap_file = False
451-
for pattern in keywords:
452-
if re.search(pattern, name, re.IGNORECASE):
453-
is_ap_file = True
454-
break
455-
456-
if not is_ap_file or '$' in name:
457-
continue
458-
459-
for subject in subjects_to_find:
460-
if subject.lower() in normalized_name:
461-
logging.info(f"Found AP {subject} file: {name}")
462-
463-
output_name = f"{final_year}_{output_name_fragment}_{subject}.xlsx"
464-
if not name.lower().endswith(".xlsx"):
465-
output_name = f"{final_year}_{output_name_fragment}_{subject}.csv"
466-
467-
output_path = output_dir / output_name
468-
469-
with zip_ref.open(name) as source_file:
470-
with open(output_path, 'wb') as f:
471-
f.write(source_file.read())
472-
473-
add_year_column(output_path, final_year)
474-
logging.info(f"Successfully extracted and saved: {output_path.name}")
475-
found_files += 1
476-
break
460+
# Read both files into pandas dataframes
461+
with zip_ref.open(math_file_name) as math_file:
462+
df_math = pd.read_excel(math_file)
463+
with zip_ref.open(science_file_name) as science_file:
464+
df_science = pd.read_excel(science_file)
465+
466+
# Merge the dataframes
467+
merged_df = pd.merge(df_math, df_science, on=['LEAID', 'SCHID'], how='outer')
477468

478-
if found_files == 0:
479-
logging.warning(f"Could not find any AP Mathematics or Science files for {config_key}.")
469+
# Save the merged dataframe
470+
output_name = f"{final_year}_{output_name_fragment}.xlsx"
471+
output_path = output_dir / output_name
472+
merged_df.to_excel(output_path, index=False)
480473

474+
add_year_column(output_path, final_year)
475+
logging.info(f"Successfully extracted, merged, and saved: {output_path.name}")
481476
return
482477

483478
output_name = f"{final_year}_{output_name_fragment}.csv"

0 commit comments

Comments
 (0)