datacommonsorg
diff --git a/‎scripts/us_census/pep/monthly_population_estimate/preprocess.py‎
Lines changed: 1 addition & 1 deletion b/‎scripts/us_census/pep/monthly_population_estimate/preprocess.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎scripts/us_census/pep/population_estimate_by_race/preprocess.py‎
Lines changed: 13 additions & 5 deletions b/‎scripts/us_census/pep/population_estimate_by_race/preprocess.py‎
Lines changed: 13 additions & 5 deletions
diff --git a/‎scripts/us_nces/demographics/private_school/process.py‎
Lines changed: 1 addition & 1 deletion b/‎scripts/us_nces/demographics/private_school/process.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎scripts/us_nces/demographics/public_school/process.py‎
Lines changed: 1 addition & 1 deletion b/‎scripts/us_nces/demographics/public_school/process.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎scripts/us_nces/demographics/school_district/process.py‎
Lines changed: 1 addition & 1 deletion b/‎scripts/us_nces/demographics/school_district/process.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎statvar_imports/child_birth/README.md‎
Lines changed: 44 additions & 0 deletions b/‎statvar_imports/child_birth/README.md‎
Lines changed: 44 additions & 0 deletions
diff --git a/‎statvar_imports/child_birth/manifest.json‎
Lines changed: 26 additions & 0 deletions b/‎statvar_imports/child_birth/manifest.json‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎statvar_imports/child_birth/metadata.csv‎
Lines changed: 6 additions & 0 deletions b/‎statvar_imports/child_birth/metadata.csv‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎statvar_imports/child_birth/places_resolved.csv‎
Lines changed: 54 additions & 0 deletions b/‎statvar_imports/child_birth/places_resolved.csv‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎statvar_imports/child_birth/pvmap.csv‎
Lines changed: 10 additions & 0 deletions b/‎statvar_imports/child_birth/pvmap.csv‎
Lines changed: 10 additions & 0 deletions
@@ -318,7 +318,7 @@ def _transform_data(self, df: pd.DataFrame, file: str) -> None:
                                  ascending=False,
                                  inplace=True)
             # Data for 2020 exists in two sources, causing overlap. We'll eliminate duplicates
-            #self._df.drop_duplicates("Date", keep="last", inplace=True)
+            self._df.drop_duplicates("Date", keep="last", inplace=True)
             self._df.drop(['date_range'], axis=1, inplace=True)
             float_col = self._df.select_dtypes(include=['float64'])
             for col in float_col.columns.values:
 
@@ -517,7 +517,12 @@ def _clean_county_2022_csv_file(df: pd.DataFrame,
             '2': '2020',
             '3': '2021',
             '4': '2022',
-            '5': '2023'
+            '5': '2023',
+            '6': '2024',
+            '7': '2025',
+            '8': '2026',
+            '9': '2027',
+            '10': '2028'
         }
         df = df.replace({'YEAR': conversion_of_year_to_value})
         df.insert(6, 'geo_ID', 'geoId/', True)
@@ -1006,12 +1011,15 @@ def _transform_data(self, df: pd.DataFrame, file_path: str) -> None:
                 "Count_Person_AsianOrPacificIslander",\
                 "Count_Person_TwoOrMoreRaces","Count_Person_NonWhite"]]
             df_before_2000 = self.df[self.df["Year"] < 2000]
-            df_county_after_2000 = self.df[(self.df["Year"] >= 2000) &
-                                           (self.df["geo_ID"] != "country/USA")
-                                           & (self.df["geo_ID"].str.len() > 9)]
+            df_county_after_2000 = self.df[
+                (self.df["Year"] >= 2000) &
+                (self.df["geo_ID"] != "country/USA") &
+                (self.df["geo_ID"].str.len() > 9)].drop_duplicates(
+                    subset=['Year', 'geo_ID'], keep='last')
             df_national_state_2000 = self.df[(self.df["Year"] >= 2000) & (
                 (self.df["geo_ID"].str.len() <= 9) |
-                (self.df["geo_ID"] == "country/USA"))]
+                (self.df["geo_ID"] == "country/USA"))].drop_duplicates(
+                    subset=['Year', 'geo_ID'], keep='last')
             df_before_2000.to_csv(os.path.join(
                 self.cleaned_csv_file_path,
                 "USA_Population_Count_by_Race_before_2000.csv"),
 
@@ -67,7 +67,7 @@ def set_generate_statvars_flag(self, flag: bool):
 
 if __name__ == '__main__':
     try:
-        logging.set_verbosity(2)
+        logging.set_verbosity(logging.INFO)
         logging.info("Main Method Starts For Private School District ")
         gcs_output_dir_local = os.path.join(
             os.path.dirname(os.path.abspath(__file__)), "gcs_folder")
 
@@ -53,7 +53,7 @@ class NCESPublicSchool(USEducation):
 
 if __name__ == '__main__':
     try:
-        logging.set_verbosity(2)
+        logging.set_verbosity(logging.INFO)
         logging.info("Main Method Starts For Public School")
         gcs_output_dir_local = os.path.join(
             os.path.dirname(os.path.abspath(__file__)), "gcs_folder")
 
@@ -53,7 +53,7 @@ class NCESDistrictSchool(USEducation):
 
 if __name__ == '__main__':
     try:
-        logging.set_verbosity(2)
+        logging.set_verbosity(logging.INFO)
         logging.info("Main Method Starts For School District ")
         gcs_output_dir_local = os.path.join(
             os.path.dirname(os.path.abspath(__file__)), "gcs_folder")
 
@@ -0,0 +1,44 @@
+# child_birth
+
+This dataset contains birth data for the USA, downloaded from the CDC (data.cdc.gov).
+
+Download:
+Data download URL : https://data.cdc.gov/api/views/hmz2-vwda/rows.csv?accessType=DOWNLOAD
+
+### Execution steps
+
+To Download, run:
+
+python3 util/download_util_script.py --download_url="https://data.cdc.gov/api/views/hmz2-vwda/rows.csv?accessType=DOWNLOAD" --output_folder=input_files/
+
+Note : The downloaded file will be saved as "input_files/rows.csv"
+
+### How to run:
+
+python3 stat_var_processor.py \
+--input_data=../../statvar_imports/child_birth/input_files/*.csv \
+--pv_map=../../statvar_imports/child_birth/pvmap.csv \
+--places_resolved_csv=../../statvar_imports/child_birth/places_resolved.csv \
+--config_file=../../statvar_imports/child_birth/metadata.csv \
+--existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf \
+--output_path=../../statvar_imports/child_birth/output_files/child_birth
+
+
+
+### Example
+
+To Process the files, Run:
+
+Execute the script from the `tools/statvar_importer/` directory.
+
+```
+python3 stat_var_processor.py \
+--input_data=../../statvar_imports/child_birth/input_files/*.csv \
+--pv_map=../../statvar_imports/child_birth/pvmap.csv \
+--places_resolved_csv=../../statvar_imports/child_birth/places_resolved.csv \
+--config_file=../../statvar_imports/child_birth/metadata.csv \
+--existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf \
+--output_path=../../statvar_imports/child_birth/output_files/child_birth
+```
+
+
@@ -0,0 +1,26 @@
+{
+    "import_specifications": [
+        {
+            "import_name": "usa_child_birth",
+            "curator_emails": [
+                "support@datacommons.org"
+            ],
+            "provenance_url": "https://www.cdc.gov/nchs/nvss/vsrr/provisional-tables.html",
+            "provenance_description": "The data set contains USA birth data",
+            "scripts": [
+                "../../util/download_util_script.py --download_url=https://data.cdc.gov/api/views/hmz2-vwda/rows.csv?accessType=DOWNLOAD --output_folder=input_files/",
+                "../../tools/statvar_importer/stat_var_processor.py --input_data=input_files/*.csv --pv_map=pvmap.csv --config_file=metadata.csv --places_resolved_csv=places_resolved.csv --existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf --output_path=output_files/child_birth"
+            ],
+            "source_files": [
+                "input_files/*.csv"
+            ],
+            "import_inputs": [
+                {
+                    "template_mcf": "output_files/child_birth.tmcf",
+                    "cleaned_csv": "output_files/child_birth.csv"
+                }
+            ],
+            "cron_schedule": "0 07 * * 2"
+        }
+    ]
+}
@@ -0,0 +1,6 @@
+parameter,value
+#places_within,country/USA
+output_columns,"observationAbout,observationDate,value,variableMeasured,observationPeriod"
+header_rows,1
+mapped_columns,5
+dc_api_root,https://api.datacommons.org
@@ -0,0 +1,54 @@
+place_name,dcid
+UNITED STATES,country/USA
+ALABAMA,geoId/01
+ALASKA,geoId/02
+ARIZONA,geoId/04
+ARKANSAS,geoId/05
+CALIFORNIA,geoId/06
+COLORADO,geoId/08
+CONNECTICUT,geoId/09
+DELAWARE,geoId/10
+DISTRICT OF COLUMBIA,geoId/11
+FLORIDA,geoId/12
+GEORGIA,geoId/13
+HAWAII,geoId/15
+IDAHO,geoId/16
+ILLINOIS,geoId/17
+INDIANA,geoId/18
+IOWA,geoId/19
+KANSAS,geoId/20
+KENTUCKY,geoId/21
+LOUISIANA,geoId/22
+MAINE,geoId/23
+MARYLAND,geoId/24
+MASSACHUSETTS,geoId/25
+MICHIGAN,geoId/26
+MINNESOTA,geoId/27
+MISSISSIPPI,geoId/28
+MISSOURI,geoId/29
+MONTANA,geoId/30
+NEBRASKA,geoId/31
+NEVADA,geoId/32
+NEW HAMPSHIRE,geoId/33
+NEW JERSEY,geoId/34
+NEW MEXICO,geoId/35
+NEW YORK,geoId/36
+NORTH CAROLINA,geoId/37
+NORTH DAKOTA,geoId/38
+OHIO,geoId/39
+OKLAHOMA,geoId/40
+OREGON,geoId/41
+PENNSYLVANIA,geoId/42
+RHODE ISLAND,geoId/44
+SOUTH CAROLINA,geoId/45
+SOUTH DAKOTA,geoId/46
+TENNESSEE,geoId/47
+TEXAS,geoId/48
+UTAH,geoId/49
+VERMONT,geoId/50
+VIRGINIA,geoId/51
+WASHINGTON,geoId/53
+WEST VIRGINIA,geoId/54
+WISCONSIN,geoId/55
+WYOMING,geoId/56
+PUERTO RICO,geoId/72
@@ -0,0 +1,10 @@
+key,p1,v1,p2,v2,p3,v3,p4,v4
+Number of Live Births,medicalStatus,LiveBirth,populationType,BirthEvent,measuredProperty,count,statType,measuredValue
+Number of Deaths,populationType,MortalityEvent,statType,measuredValue,measuredProperty,count,,
+Number of Infant Deaths,populationType,MortalityEvent,age,YearsUpto1,statType,measuredValue,measuredProperty,count
+Month,Month,{Data},,,,,,
+Year,Year,{Data},,,,,,
+Data Value,value,{Number},observationDate,{Month} {Year},,,,
+Monthly,observationPeriod,P1M,,,,,,
+12 Month-ending,observationPeriod,P1Y,,,,,,
+State,observationAbout,{Data},,,,,,