datacommonsorg · Harsha-chandaluri · Dec 30, 2025 · Nov 5, 2025 · Nov 5, 2025 · Nov 6, 2025
diff --git a/statvar_imports/school_finance/README.md b/statvar_imports/school_finance/README.md
@@ -0,0 +1,62 @@
+#### Copyright 2025 Google LLC
+####
+#### Licensed under the Apache License, Version 2.0 (the "License");
+#### you may not use this file except in compliance with the License.
+#### You may obtain a copy of the License at
+####
+####    https://www.apache.org/licenses/LICENSE-2.0
+####
+#### Unless required by applicable law or agreed to in writing, software
+#### distributed under the License is distributed on an "AS IS" BASIS,
+#### WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#### See the License for the specific language governing permissions and
+#### limitations under the License.
+
+-----
+
+## US_UrbanSchool_Finances Import
+
+This import covers urban school finance. The dataset contains financial and identifying information for educational institutions, including details on teacher salaries and expenditures.
+
+-----
+- source: https://ocrdata.ed.gov/data
+
+- type of place: Country
+
+- statvars: Education
+
+- years: 2010 and 2012
+
+### ⚙️ Workflow
+
+The workflow for this data import involves two main steps: downloading the necessary files and then processing them.
+
+#### Step 1: Download the Source Data
+
+To acquire the necessary data files, execute the download script `download_script.py`.
+
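+Only the school finance CSV/XLSX files from each downloaded archive are kept. They are renamed to `crdc_<year-range>_<name>`, given a leading `year` column, and stored in the `input_files` directory, which is created relative to the directory the script is run from.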
+
+#### Step 2: Process the Data
+
+Once the data is downloaded, run the `stat_var_processor.py` script to process the files and generate the final output artifacts (CSV, TMCF, MCF).
+
+The script is located in the `data/tools/statvar_importer/` directory. From that directory, run the following command:
+```bash
+    python3 stat_var_processor.py --input_data=../../statvar_imports/school_finance/input_files/*.xlsx --pv_map=../../statvar_imports/school_finance/school_finance_pvmap.csv --config_file=../../statvar_imports/school_finance/school_finance_metadata.csv --output_path=../../statvar_imports/school_finance/output/school_finance_output
+```
+
+### Autorefresh type
+
+This import uses a fully automated refresh process.
+
+-----
+
+
+### Automation
+
+This import pipeline is configured to run automatically on a monthly schedule.
+
+- Cron Expression: `30 08 25 * *`
+
+- Schedule: The script runs at 8:30 AM on the 25th day of every month.
diff --git a/statvar_imports/school_finance/download_script.py b/statvar_imports/school_finance/download_script.py
@@ -0,0 +1,153 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+from absl import app
+from absl import logging
+import datetime
+import glob
+import shutil
+import pandas as pd
+import re
+
+# Absolute path of the directory that contains this script.
+_SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__))
+
+# Put ../../util/ (relative to this script) on sys.path so that the
+# download_util_script module imported below can be resolved.
+sys.path.append(os.path.join(_SCRIPT_PATH, '../../util/'))
+
+from download_util_script import download_file
+
+logging.set_verbosity(logging.INFO)
+
+# URL template for one CRDC zip archive; {year_range} is filled in by main()
+# with a value such as "2009-10".
+_BASE_URL = "https://civilrightsdata.ed.gov/assets/ocr/docs/{year_range}-crdc-data.zip"
+# Relative path, so it resolves against the current working directory.
+_OUTPUT_DIRECTORY = "input_files"
+# First start year that main() attempts to download.
+_START_YEAR = 2009
+# Evaluated once at import time; upper bound of the start years main() attempts.
+_CURRENT_YEAR = datetime.datetime.now().year
+
+
+def add_year_column(filepath: str, year: int):
+    """Adds a 'year' column as the first column to the given CSV or XLSX file."""
+    try:
+        # Determine file type and read the DataFrame
+        if filepath.endswith('.csv'):
+            df = pd.read_csv(filepath, encoding='utf-8', low_memory=False, dtype=str)
+        elif filepath.endswith('.xlsx'):
+            df = pd.read_excel(filepath, dtype=str)
+        else:
+            logging.warning(f"Unsupported file type for year column addition: {filepath}")
+            return
+
+        # Added the 'year' column
+        if 'year' in df.columns:
+            df['year'] = year
+            cols = ['year'] + [c for c in df.columns if c != 'year']
+            df = df[cols]
+        else:
+            df.insert(0, 'year', year)
+
+        if filepath.endswith('.csv'):
+            df.to_csv(filepath, index=False, encoding='utf-8')
+        elif filepath.endswith('.xlsx'):
+            with pd.ExcelWriter(filepath) as writer:
+                df.to_excel(writer, index=False, sheet_name='Sheet1')
+
+        logging.info(
+            f"Added 'year' column with value {year} as the FIRST column to {os.path.basename(filepath)}"
+        )
+    except Exception as e:
+        # Log the error so you see the filename
+        logging.error(f"Could not add year column to {filepath}: {e}")
+        # Kill the script immediately so you don't get bad data
+        raise RuntimeError(e)
+
+
+
+
+def main(_):
+    os.makedirs(_OUTPUT_DIRECTORY, exist_ok=True)
+    logging.info(f"Base output directory '{_OUTPUT_DIRECTORY}' ensured to exist.")
+
+    # CRDC data typically follows an odd-year reporting schedule (e.g., 2009-10, 2011-12)
+    years_to_try = list(range(_START_YEAR, 2018, 2)) + list(
+        range(2020, _CURRENT_YEAR + 1, 2))
+
+    for year in years_to_try:
+        year_range = f"{year}-{str(year+1)[-2:]}"
+        url = _BASE_URL.format(year_range=year_range)
+
+        # Download to a temporary sub-folder
+        temp_output_dir = os.path.join(_OUTPUT_DIRECTORY, f"{year_range}")
+        os.makedirs(temp_output_dir, exist_ok=True)
+        logging.info(f"Starting download process for year range {year_range}")
+        logging.info(f"Download Params: url={url}, "f"output_dir='{temp_output_dir}'")
+
+        success = download_file(url=url,
+                                output_folder=temp_output_dir,
+                                unzip=True)
+
+        if not success:
+            logging.warning(
+                f"Failed to download or process data for year {year}. "
+                f"Cleaning up temporary directory and continuing to next year."
+            )
+            # This is the 'cleanup' action being performed
+            shutil.rmtree(temp_output_dir, ignore_errors=True)
+            continue
+
+        logging.info(f"Successfully downloaded and extracted data for {year_range}.")
+
+        # Find, rename, and move the files we want to keep
+        search_pattern = os.path.join(temp_output_dir, '**', '*')
+
+        # Define the target category
+        category_name = "school finance"
+        category_dir = _OUTPUT_DIRECTORY
+
+        for item_path in glob.glob(search_pattern, recursive=True):
+            if not os.path.isfile(item_path):
+                continue
+
+            filename = os.path.basename(item_path)
+            base, extension = os.path.splitext(filename)
+            extension = extension.lower()
+
+            # Use a cleaner check for the required files
+            if (extension in ['.csv', '.xlsx'] and 
+                category_name in base.lower()):
+                if 'lea' in base.lower() and extension == '.xlsx':
+                    logging.info(f"Skipping and removing Excel file: '{filename}' because it contains 'LEA'.")
+                    os.remove(item_path)
+                    continue
+
+                clean_base = re.sub(r'[^a-zA-Z0-9]+', '_', base).lower()
+
+                new_filename = f"crdc_{year_range}_{clean_base}{extension}"
+                new_filepath = os.path.join(category_dir, new_filename)
+
+                logging.info(f"Moving '{item_path}' to '{new_filepath}'")
+                shutil.move(item_path, new_filepath)
+
+                # Add the year column (using the end year of the range)
+                end_year = int(f"20{year_range.split('-')[1]}")
+                add_year_column(new_filepath, end_year)
+
+        # Clean up the temporary directory for the year
+        logging.info(f"Removing temporary directory: {temp_output_dir}")
+        shutil.rmtree(temp_output_dir, ignore_errors=True)
+
+    logging.info("Script finished.")
+
+
+if __name__ == '__main__':
+    app.run(main)
diff --git a/statvar_imports/school_finance/manifest.json b/statvar_imports/school_finance/manifest.json
@@ -0,0 +1,31 @@
+{
+  "import_specifications": [
+    {
+      "import_name": "US_UrbanSchool_Finances",
+      "curator_emails": [
+        "support@datacommons.org"
+      ],
+      "provenance_url": "https://ocrdata.ed.gov/data",
+      "provenance_description": "School Finance dataset contains financial and identifying information for educational institutions, including details on salaries and expenditures for teachers.",
+      "scripts": [
+        "download_script.py",
+        "../../tools/statvar_importer/stat_var_processor.py --existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf --input_data=input_files/* --pv_map=school_finance_pvmap.csv --config_file=school_finance_metadata.csv --output_path=output/school_finance_output"
+      ],
+      "source_files": [
+        "input_files/*"
+      ],
+      "import_inputs": [
+        {
+          "template_mcf": "output/school_finance_output.tmcf",
+          "cleaned_csv": "output/school_finance_output.csv"
+        }
+      ],
+      "cron_schedule": "30 08 25 * *",
+      "resource_limits": {
+        "cpu": 16,
+        "memory": 128,
+        "disk": 500
+      }
+    }
+  ]
+}
diff --git a/statvar_imports/school_finance/school_finance_metadata.csv b/statvar_imports/school_finance/school_finance_metadata.csv
@@ -0,0 +1,5 @@
+parameter,value
+url,https://ocrdata.ed.gov/data
+header_rows,1
+output_columns,"observationAbout, observationDate, value, variableMeasured, unit, scalingFactor"
+#input_rows,15
diff --git a/statvar_imports/school_finance/school_finance_pvmap.csv b/statvar_imports/school_finance/school_finance_pvmap.csv
@@ -0,0 +1,10 @@
+,,,,,,,,,,,,,,
+year,observationDate,{Number},,,,,,,,,,,,
+COMBOKEY,#Format,observationAbout=nces/{Data},,,,,,,,,,,,
+,,,,,,,,,,,,,,
+FTE_TEACHERS_FIN,populationType,Teacher,value,{Number},measurementQualifier,FullTimeEquivalent,measuredProperty,count,,,,,,
+TEACH_AMOUNT,populationType,EconomicActivity,measuredProperty,expenditure,expenditureType,Salaries,facultyType,Teacher,value,{Number},unit,USDollar,,
+AVG_TEACH_SALARY,populationType,EconomicActivity,value,{Number},measuredProperty,expenditure,statType,meanValue,expenditureType,Salaries,facultyType,Teacher,unit,USDollar
+TOT_SALARIES,populationType,EconomicActivity,measuredProperty,expenditure,expenditureType,Salaries,value,{Number},unit,USDollar,,,,
+INST_SALARIES,populationType,EconomicActivity,measuredProperty,expenditure,expenditureType,Salaries,facultyType,InstructionalStaff,value,{Number},unit,USDollar,,
+EXPEND,populationType,EconomicActivity,measuredProperty,expenditure,expenditureType,NonPersonnel,value,{Number},unit,USDollar,,,,
diff --git a/statvar_imports/school_finance/test_data/2009_2010_sample_input_1.xlsx b/statvar_imports/school_finance/test_data/2009_2010_sample_input_1.xlsx
diff --git a/statvar_imports/school_finance/test_data/2009_2010_sample_input_2.xlsx b/statvar_imports/school_finance/test_data/2009_2010_sample_input_2.xlsx
diff --git a/statvar_imports/school_finance/test_data/2011_12_sample_input_1.xlsx b/statvar_imports/school_finance/test_data/2011_12_sample_input_1.xlsx
diff --git a/statvar_imports/school_finance/test_data/2011_12_sample_input_2.xlsx b/statvar_imports/school_finance/test_data/2011_12_sample_input_2.xlsx