Skip to content

Commit 05900d7

Browse files
committed
resolved core comments
1 parent 36f07b6 commit 05900d7

4 files changed

Lines changed: 8 additions & 136 deletions

File tree

statvar_imports/us_urban_school/ap_ib_gt_enrollment/README.md

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -78,23 +78,18 @@ The scripts perform the following processing steps:
7878

7979
## Processing the downloaded data
8080

81-
After downloading the data, you can process it by running the `run_process.sh` script in each of the data directories.
82-
For example, to process the Advanced Placement data, run the following command:
81+
To process the data, run the `run_process.sh` script in the relevant data directory. The script first downloads the required data and then processes it.
8382

84-
```bash
85-
bash statvar_imports/us_urban_school/ap_ib_gt_enrollment/advanced_placements/run_process.sh
86-
```
87-
88-
You can also download and process the data in one step by using the `--download` flag:
83+
For example, to process the Advanced Placement data, change into its directory and then run the script:
8984

9085
```bash
91-
bash statvar_imports/us_urban_school/ap_ib_gt_enrollment/advanced_placements/run_process.sh --download
86+
cd statvar_imports/us_urban_school/ap_ib_gt_enrollment/advanced_placements/
87+
bash run_process.sh
9288
```
9389

9490
The processing script will:
9591

96-
1. Create an `output_files` directory.
97-
98-
2. Process the downloaded data for each year.
99-
100-
3. Generate statistical variables using the `stat_var_processor.py` script.
92+
1. **Download the data** using `download_ap_ib_gt.py` and `download_2015_16.py`.
93+
2. Create an `output_files` directory.
94+
3. Process the downloaded data for each year.
95+
4. Generate statistical variables using the `stat_var_processor.py` script.

statvar_imports/us_urban_school/ap_ib_gt_enrollment/config/ap_enrollment_pvmap.csv

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,6 @@
11
key,,,,,,,,,,
22
YEAR,observationDate,{Number},populationType,Student,assessmentType,AdvancedPlacement,enrollmentStatus,EnrolledInEducationOrTraining,,
33
ncesid,observationAbout,ncesId/{Data},,,,,,,,
4-
SCH_APENR_IND,,,,,,,,,,
5-
SCH_APCOURSES,,,,,,,,,,
6-
SCH_APSEL,,,,,,,,,,
74
SCH_APENR_HI_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,HispanicOrLatino,,
85
SCH_APENR_HI_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,HispanicOrLatino,,
96
SCH_APENR_AM_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,AmericanIndianOrAlaskaNative,,
@@ -24,7 +21,6 @@ SCH_APENR_LEP_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,languag
2421
SCH_APENR_LEP_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,languageFluencyLevel,LimitedEnglishProficient,,
2522
SCH_APENR_IDEA_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,disabilityStatus,WithDisability,,
2623
SCH_APENR_IDEA_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,disabilityStatus,WithDisability,,
27-
SCH_APMATHENR_IND,,,,,,,,,,
2824
SCH_APMATHENR_HI_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,HispanicOrLatino,schoolSubject,Mathematics
2925
SCH_APMATHENR_HI_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,HispanicOrLatino,schoolSubject,Mathematics
3026
SCH_APMATHENR_AM_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,AmericanIndianOrAlaskaNative,schoolSubject,Mathematics
@@ -45,7 +41,6 @@ SCH_APMATHENR_LEP_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,lan
4541
SCH_APMATHENR_LEP_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,languageFluencyLevel,LimitedEnglishProficient,schoolSubject,Mathematics
4642
SCH_APMATHENR_IDEA_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,disabilityStatus,WithDisability,schoolSubject,Mathematics
4743
SCH_APMATHENR_IDEA_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,disabilityStatus,WithDisability,schoolSubject,Mathematics
48-
SCH_APSCIENR_IND,,,,,,,,,,
4944
SCH_APSCIENR_HI_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,HispanicOrLatino,schoolSubject,Science
5045
SCH_APSCIENR_HI_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,HispanicOrLatino,schoolSubject,Science
5146
SCH_APSCIENR_AM_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,AmericanIndianOrAlaskaNative,schoolSubject,Science
@@ -66,7 +61,6 @@ SCH_APSCIENR_LEP_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,lang
6661
SCH_APSCIENR_LEP_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,languageFluencyLevel,LimitedEnglishProficient,schoolSubject,Science
6762
SCH_APSCIENR_IDEA_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,disabilityStatus,WithDisability,schoolSubject,Science
6863
SCH_APSCIENR_IDEA_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,disabilityStatus,WithDisability,schoolSubject,Science
69-
SCH_APCOMPENR_IND,,,,,,,,,,
7064
SCH_APCOMPENR_HI_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,HispanicOrLatino,schoolSubject,Computer
7165
SCH_APCOMPENR_HI_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,race,HispanicOrLatino,schoolSubject,Computer
7266
SCH_APCOMPENR_AM_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,AmericanIndianOrAlaskaNative,schoolSubject,Computer
@@ -87,7 +81,6 @@ SCH_APCOMPENR_LEP_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,lan
8781
SCH_APCOMPENR_LEP_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,languageFluencyLevel,LimitedEnglishProficient,schoolSubject,Computer
8882
SCH_APCOMPENR_IDEA_M,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,disabilityStatus,WithDisability,schoolSubject,Computer
8983
SCH_APCOMPENR_IDEA_F,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,disabilityStatus,WithDisability,schoolSubject,Computer
90-
,,,,,,,,,,
9184
M_AME_7_MATH_AP,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,AmericanIndianOrAlaskaNative,schoolSubject,Mathematics
9285
M_ASI_7_MATH_AP,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,Asian,schoolSubject,Mathematics
9386
M_HIS_7_MATH_AP,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,HispanicOrLatino,schoolSubject,Mathematics
@@ -108,7 +101,6 @@ F_2_OR_MORE_7_MATH_AP,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female
108101
F_TOT_7_MATH_AP,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,,,schoolSubject,Mathematics
109102
F_DIS_7_MATH_AP,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,disabilityStatus,WithDisability,schoolSubject,Mathematics
110103
F_LEP_7_MATH_AP,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,languageFluencyLevel,LimitedEnglishProficient,schoolSubject,Mathematics
111-
,,,,,,,,,,
112104
M_AME_7_SCIENCE_AP,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,AmericanIndianOrAlaskaNative,schoolSubject,Science
113105
M_ASI_7_SCIENCE_AP,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,Asian,schoolSubject,Science
114106
M_HIS_7_SCIENCE_AP,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,HispanicOrLatino,schoolSubject,Science
@@ -129,7 +121,6 @@ F_2_OR_MORE_7_SCIENCE_AP,value,{Number},#Filter,Allow=int(value) >= 0,gender,Fem
129121
F_TOT_7_SCIENCE_AP,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,,,schoolSubject,Science
130122
F_DIS_7_SCIENCE_AP,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,disabilityStatus,WithDisability,schoolSubject,Science
131123
F_LEP_7_SCIENCE_AP,value,{Number},#Filter,Allow=int(value) >= 0,gender,Female,languageFluencyLevel,LimitedEnglishProficient,schoolSubject,Science
132-
,,,,,,,,,,
133124
M_AME_7_MATH_ENR,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,AmericanIndianOrAlaskaNative,schoolSubject,Mathematics
134125
M_ASI_7_MATH_ENR,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,Asian,schoolSubject,Mathematics
135126
M_HIS_7_MATH_ENR,value,{Number},#Filter,Allow=int(value) >= 0,gender,Male,race,HispanicOrLatino,schoolSubject,Mathematics

statvar_imports/us_urban_school/ap_ib_gt_enrollment/download_2015_16.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,6 @@
2727

2828
FLAGS = flags.FLAGS
2929

30-
31-
32-
3330
def download_and_extract_2015_16(data_type, output_dir):
3431
"""
3532
Downloads the ZIP archive for 2015-16 and extracts the target file.

statvar_imports/us_urban_school/ap_ib_gt_enrollment/download_ap_ib_gt.py

Lines changed: 0 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -96,44 +96,26 @@
9696

9797
}
9898

99-
100-
10199
HARDCODED_CONFIGS = {
102-
103100
"2021-22": {"final_year": 2022},
104-
105101
"2020-21": {"final_year": 2021},
106-
107102
"2017-18": {"final_year": 2018},
108-
109103
"2015-16": {"final_year": 2016},
110-
111104
"2013-14": {"final_year": 2014},
112-
113105
"2011-12": {"final_year": 2012},
114-
115106
"2009-10": {"final_year": 2010}
116107

117108
}
118109

119-
120-
121110
SCRIPT_DIR = Path(os.path.dirname(os.path.abspath(__file__)))
122111

123-
124-
125112
def add_year_column(file_path, year):
126113

127114
"""
128-
129115
Reads the file (CSV or XLSX), adds the 'YEAR' and 'ncesid' columns,
130-
131116
and saves it back, using 'latin1' encoding for robustness.
132-
133117
If the file is empty or contains no data rows, it is deleted.
134-
135118
"""
136-
137119
logging.info(f"-> Starting post-processing for {file_path.name}")
138120

139121
df = pd.DataFrame()
@@ -164,8 +146,6 @@ def add_year_column(file_path, year):
164146

165147
return
166148

167-
168-
169149
if df.empty:
170150

171151
logging.warning(f"Warning: DataFrame is empty after reading {file_path.name}. Deleting empty file.")
@@ -174,14 +154,10 @@ def add_year_column(file_path, year):
174154

175155
return
176156

177-
178-
179157
# Add YEAR column
180158

181159
df['YEAR'] = year
182160

183-
184-
185161
# Create NCESID column
186162

187163
if all(col in df.columns for col in ["LEAID", "SCHID"]):
@@ -196,8 +172,6 @@ def add_year_column(file_path, year):
196172

197173
logging.warning(f"-> Warning: Missing LEAID or SCHID columns for NCESID creation in {file_path.name}")
198174

199-
200-
201175
# Reorder columns to place 'YEAR', 'ncesid', and 'JJ' (if present) at the beginning
202176

203177
cols = df.columns.tolist()
@@ -208,26 +182,18 @@ def add_year_column(file_path, year):
208182

209183
desired_first_cols.append('JJ')
210184

211-
212-
213185
# Remove desired_first_cols from their original positions
214186

215187
for col in desired_first_cols:
216-
217188
if col in cols:
218-
219189
cols.remove(col)
220190

221-
222-
223191
# Construct the new column order
224192

225193
new_column_order = desired_first_cols + cols
226194

227195
df = df[new_column_order]
228196

229-
230-
231197
if file_path.suffix == '.xlsx':
232198

233199
df.to_excel(file_path, index=False)
@@ -236,12 +202,8 @@ def add_year_column(file_path, year):
236202

237203
df.to_csv(file_path, index=False)
238204

239-
240-
241205
logging.info(f"-> Successfully added 'YEAR' and 'ncesid' columns to {file_path.name}")
242206

243-
244-
245207
except Exception as e:
246208

247209
# If an error occurs during processing but after the file was created, attempt to delete it.
@@ -254,88 +216,41 @@ def add_year_column(file_path, year):
254216

255217
raise RuntimeError(f"Failed to add year/ncesid column to {file_path.name}: {e}")
256218

257-
258-
259219
def generate_future_configs(start_year):
260220

261221
"""
262-
263222
Generates configuration dictionaries for future biennial CRDC years.
264-
265223
"""
266-
267224
current_calendar_year = date.today().year
268225

269226
generated_configs = {}
270227

271-
272-
273228
for year in range(start_year, current_calendar_year + 1):
274-
275229
end_year = year + 1
276-
277230
year_range_key = f"{year}-{end_year % 100:02d}"
278231

279-
280-
281232
if year_range_key in HARDCODED_CONFIGS:
282-
283233
continue
284234

285-
286-
287235
generated_configs[year_range_key] = {
288236

289237
"final_year": end_year
290-
291238
}
292239

293-
294-
295240
return generated_configs
296241

297-
298-
299242
@retry(tries=3, delay=5, backoff=2)
300243

301-
302-
303244
def download_url_with_retry(zip_url):
304-
305-
306-
307245
"""
308-
309-
310-
311246
Handles downloading the ZIP content with retries and status checks.
312-
313-
314-
315247
"""
316-
317-
318-
319248
head_response = requests.head(zip_url, allow_redirects=True, timeout=10)
320-
321-
322-
323249
head_response.raise_for_status()
324-
325-
326-
327250
response = requests.get(zip_url, stream=True, timeout=180)
328-
329-
330-
331251
response.raise_for_status()
332-
333-
334-
335252
return response
336253

337-
338-
339254
def get_file_keywords(data_type, config_key):
340255

341256
"""
@@ -360,44 +275,26 @@ def get_file_keywords(data_type, config_key):
360275

361276
return keywords, specific_constraint, output_name_fragment, pvmap_config
362277

363-
364-
365-
366-
367278
def get_req_cols_from_config(config_path):
368279

369280
"""
370-
371281
Reads a CSV config file to extract a list of required column names.
372-
373282
It takes the first column, filters out specific values ('YEAR', 'ncesid', ''),
374-
375283
and adds 'LEAID' and 'SCHID'.
376-
377284
"""
378285

379286
try:
380287

381288
df = pd.read_csv(config_path, header=None, usecols=[0], on_bad_lines='skip')
382-
383289
# Get first column as a list
384-
385290
req_cols = df[0].tolist()
386-
387291
# Remove header 'key'
388-
389292
if 'key' in req_cols:
390-
391293
req_cols.remove('key')
392-
393294
# Filter out values
394-
395295
req_cols = [col for col in req_cols if col not in ['YEAR', 'ncesid', ''] and pd.notna(col)]
396-
397296
# Add necessary columns for ncesid creation
398-
399297
req_cols.extend(['LEAID', 'SCHID'])
400-
401298
return req_cols
402299

403300
except Exception as e:
@@ -406,8 +303,6 @@ def get_req_cols_from_config(config_path):
406303

407304
return None
408305

409-
410-
411306
def download_and_extract(config_key, config_data, data_type, output_dir):
412307
"""
413308
Downloads the ZIP archive and extracts the target file using keyword search,
@@ -525,10 +420,6 @@ def download_and_extract(config_key, config_data, data_type, output_dir):
525420
except Exception as e:
526421
raise RuntimeError(f"An unhandled error occurred during extraction/processing for {config_key}: {e}")
527422

528-
529-
530-
531-
532423
def main(_):
533424
"""
534425
Main function to run the download process for AP, IB, and GT data.
@@ -565,8 +456,6 @@ def main(_):
565456

566457
logging.info(f"\n--- All downloads complete ---")
567458

568-
569-
570459
if __name__ == '__main__':
571460

572461
app.run(main)

0 commit comments

Comments
 (0)