Skip to content

Commit 491a969

Browse files
Merge branch 'master' into US_UrbanSchool_Finances
2 parents 910b5aa + 7fc816e commit 491a969

54 files changed

Lines changed: 4095 additions & 16 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

scripts/us_census/pep/monthly_population_estimate/preprocess.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -318,7 +318,7 @@ def _transform_data(self, df: pd.DataFrame, file: str) -> None:
318318
ascending=False,
319319
inplace=True)
320320
# Data for 2020 exists in two sources, causing overlap. We'll eliminate duplicates
321-
#self._df.drop_duplicates("Date", keep="last", inplace=True)
321+
self._df.drop_duplicates("Date", keep="last", inplace=True)
322322
self._df.drop(['date_range'], axis=1, inplace=True)
323323
float_col = self._df.select_dtypes(include=['float64'])
324324
for col in float_col.columns.values:

scripts/us_census/pep/population_estimate_by_race/preprocess.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -517,7 +517,12 @@ def _clean_county_2022_csv_file(df: pd.DataFrame,
517517
'2': '2020',
518518
'3': '2021',
519519
'4': '2022',
520-
'5': '2023'
520+
'5': '2023',
521+
'6': '2024',
522+
'7': '2025',
523+
'8': '2026',
524+
'9': '2027',
525+
'10': '2028'
521526
}
522527
df = df.replace({'YEAR': conversion_of_year_to_value})
523528
df.insert(6, 'geo_ID', 'geoId/', True)
@@ -1006,12 +1011,15 @@ def _transform_data(self, df: pd.DataFrame, file_path: str) -> None:
10061011
"Count_Person_AsianOrPacificIslander",\
10071012
"Count_Person_TwoOrMoreRaces","Count_Person_NonWhite"]]
10081013
df_before_2000 = self.df[self.df["Year"] < 2000]
1009-
df_county_after_2000 = self.df[(self.df["Year"] >= 2000) &
1010-
(self.df["geo_ID"] != "country/USA")
1011-
& (self.df["geo_ID"].str.len() > 9)]
1014+
df_county_after_2000 = self.df[
1015+
(self.df["Year"] >= 2000) &
1016+
(self.df["geo_ID"] != "country/USA") &
1017+
(self.df["geo_ID"].str.len() > 9)].drop_duplicates(
1018+
subset=['Year', 'geo_ID'], keep='last')
10121019
df_national_state_2000 = self.df[(self.df["Year"] >= 2000) & (
10131020
(self.df["geo_ID"].str.len() <= 9) |
1014-
(self.df["geo_ID"] == "country/USA"))]
1021+
(self.df["geo_ID"] == "country/USA"))].drop_duplicates(
1022+
subset=['Year', 'geo_ID'], keep='last')
10151023
df_before_2000.to_csv(os.path.join(
10161024
self.cleaned_csv_file_path,
10171025
"USA_Population_Count_by_Race_before_2000.csv"),

scripts/us_nces/demographics/private_school/process.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def set_generate_statvars_flag(self, flag: bool):
6767

6868
if __name__ == '__main__':
6969
try:
70-
logging.set_verbosity(2)
70+
logging.set_verbosity(logging.INFO)
7171
logging.info("Main Method Starts For Private School District ")
7272
gcs_output_dir_local = os.path.join(
7373
os.path.dirname(os.path.abspath(__file__)), "gcs_folder")

scripts/us_nces/demographics/public_school/process.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ class NCESPublicSchool(USEducation):
5353

5454
if __name__ == '__main__':
5555
try:
56-
logging.set_verbosity(2)
56+
logging.set_verbosity(logging.INFO)
5757
logging.info("Main Method Starts For Public School")
5858
gcs_output_dir_local = os.path.join(
5959
os.path.dirname(os.path.abspath(__file__)), "gcs_folder")

scripts/us_nces/demographics/school_district/process.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ class NCESDistrictSchool(USEducation):
5353

5454
if __name__ == '__main__':
5555
try:
56-
logging.set_verbosity(2)
56+
logging.set_verbosity(logging.INFO)
5757
logging.info("Main Method Starts For School District ")
5858
gcs_output_dir_local = os.path.join(
5959
os.path.dirname(os.path.abspath(__file__)), "gcs_folder")
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# child_birth
2+
3+
The data set contains USA birth data
4+
5+
Download:
6+
Data download URL : https://data.cdc.gov/api/views/hmz2-vwda/rows.csv?accessType=DOWNLOAD
7+
8+
### Execution steps:
9+
10+
To Download, run:
11+
12+
python3 util/download_util_script.py --download_url=https://data.cdc.gov/api/views/hmz2-vwda/rows.csv?accessType=DOWNLOAD --output_folder=input_files/
13+
14+
Note : The downloaded file will be saved as "input_files/rows.csv"
15+
16+
### How to run:
17+
18+
python3 stat_var_processor.py \
19+
--input_data=../../statvar_imports/child_birth/input_files/*.csv \
20+
--pv_map=../../statvar_imports/child_birth/pvmap.csv \
21+
--places_resolved_csv=../../statvar_imports/child_birth/places_resolved.csv \
22+
--config_file=../../statvar_imports/child_birth/metadata.csv \
23+
--existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf \
24+
--output_path=../../statvar_imports/child_birth/output_files/child_birth
25+
26+
27+
28+
### Example
29+
30+
To Process the files, Run:
31+
32+
Execute the script from the `tools/statvar_importer/` directory.
33+
34+
```
35+
python3 stat_var_processor.py \
36+
--input_data=../../statvar_imports/child_birth/input_files/*.csv \
37+
--pv_map=../../statvar_imports/child_birth/pvmap.csv \
38+
--places_resolved_csv=../../statvar_imports/child_birth/places_resolved.csv \
39+
--config_file=../../statvar_imports/child_birth/metadata.csv \
40+
--existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf \
41+
--output_path=../../statvar_imports/child_birth/output_files/child_birth
42+
```
43+
44+
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
{
2+
"import_specifications": [
3+
{
4+
"import_name": "usa_child_birth",
5+
"curator_emails": [
6+
"support@datacommons.org"
7+
],
8+
"provenance_url": "https://www.cdc.gov/nchs/nvss/vsrr/provisional-tables.html",
9+
"provenance_description": "The data set contains USA birth data",
10+
"scripts": [
11+
"../../util/download_util_script.py --download_url=https://data.cdc.gov/api/views/hmz2-vwda/rows.csv?accessType=DOWNLOAD --output_folder=input_files/",
12+
"../../tools/statvar_importer/stat_var_processor.py --input_data=input_files/*.csv --pv_map=pvmap.csv --config_file=metadata.csv --places_resolved_csv=places_resolved.csv --existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf --output_path=output_files/child_birth"
13+
],
14+
"source_files": [
15+
"input_files/*.csv"
16+
],
17+
"import_inputs": [
18+
{
19+
"template_mcf": "output_files/child_birth.tmcf",
20+
"cleaned_csv": "output_files/child_birth.csv"
21+
}
22+
],
23+
"cron_schedule": "0 07 * * 2"
24+
}
25+
]
26+
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
parameter,value
2+
#places_within,country/USA
3+
output_columns,"observationAbout,observationDate,value,variableMeasured,observationPeriod"
4+
header_rows,1
5+
mapped_columns,5
6+
dc_api_root,https://api.datacommons.org
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
place_name,dcid
2+
UNITED STATES,country/USA
3+
ALABAMA,geoId/01
4+
ALASKA,geoId/02
5+
ARIZONA,geoId/04
6+
ARKANSAS,geoId/05
7+
CALIFORNIA,geoId/06
8+
COLORADO,geoId/08
9+
CONNECTICUT,geoId/09
10+
DELAWARE,geoId/10
11+
DISTRICT OF COLUMBIA,geoId/11
12+
FLORIDA,geoId/12
13+
GEORGIA,geoId/13
14+
HAWAII,geoId/15
15+
IDAHO,geoId/16
16+
ILLINOIS,geoId/17
17+
INDIANA,geoId/18
18+
IOWA,geoId/19
19+
KANSAS,geoId/20
20+
KENTUCKY,geoId/21
21+
LOUISIANA,geoId/22
22+
MAINE,geoId/23
23+
MARYLAND,geoId/24
24+
MASSACHUSETTS,geoId/25
25+
MICHIGAN,geoId/26
26+
MINNESOTA,geoId/27
27+
MISSISSIPPI,geoId/28
28+
MISSOURI,geoId/29
29+
MONTANA,geoId/30
30+
NEBRASKA,geoId/31
31+
NEVADA,geoId/32
32+
NEW HAMPSHIRE,geoId/33
33+
NEW JERSEY,geoId/34
34+
NEW MEXICO,geoId/35
35+
NEW YORK,geoId/36
36+
NORTH CAROLINA,geoId/37
37+
NORTH DAKOTA,geoId/38
38+
OHIO,geoId/39
39+
OKLAHOMA,geoId/40
40+
OREGON,geoId/41
41+
PENNSYLVANIA,geoId/42
42+
RHODE ISLAND,geoId/44
43+
SOUTH CAROLINA,geoId/45
44+
SOUTH DAKOTA,geoId/46
45+
TENNESSEE,geoId/47
46+
TEXAS,geoId/48
47+
UTAH,geoId/49
48+
VERMONT,geoId/50
49+
VIRGINIA,geoId/51
50+
WASHINGTON,geoId/53
51+
WEST VIRGINIA,geoId/54
52+
WISCONSIN,geoId/55
53+
WYOMING,geoId/56
54+
PUERTO RICO,geoId/72
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
key,p1,v1,p2,v2,p3,v3,p4,v4
2+
Number of Live Births,medicalStatus,LiveBirth,populationType,BirthEvent,measuredProperty,count,statType,measuredValue
3+
Number of Deaths,populationType,MortalityEvent,statType,measuredValue,measuredProperty,count,,
4+
Number of Infant Deaths,populationType,MortalityEvent,age,YearsUpto1,statType,measuredValue,measuredProperty,count
5+
Month,Month,{Data},,,,,,
6+
Year,Year,{Data},,,,,,
7+
Data Value,value,{Number},observationDate,{Month} {Year},,,,
8+
Monthly,observationPeriod,P1M,,,,,,
9+
12 Month-ending,observationPeriod,P1Y,,,,,,
10+
State,observationAbout,{Data},,,,,,

0 commit comments

Comments
 (0)