datacommonsorg
diff --git a/‎import-automation/executor/app/configs.py‎
Lines changed: 0 additions & 6 deletions b/‎import-automation/executor/app/configs.py‎
Lines changed: 0 additions & 6 deletions
diff --git a/‎import-automation/executor/app/executor/import_executor.py‎
Lines changed: 7 additions & 66 deletions b/‎import-automation/executor/app/executor/import_executor.py‎
Lines changed: 7 additions & 66 deletions
diff --git a/‎import-automation/executor/update_import_version.sh‎
Lines changed: 11 additions & 42 deletions b/‎import-automation/executor/update_import_version.sh‎
Lines changed: 11 additions & 42 deletions
diff --git a/‎import-automation/workflow/import-automation-workflow.yaml‎
Lines changed: 12 additions & 0 deletions b/‎import-automation/workflow/import-automation-workflow.yaml‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎import-automation/workflow/ingestion-helper/import_utils.py‎
Lines changed: 149 additions & 0 deletions b/‎import-automation/workflow/ingestion-helper/import_utils.py‎
Lines changed: 149 additions & 0 deletions
@@ -42,10 +42,6 @@ class ExecutorConfig:
     # Name of the Cloud Storage bucket to store the generated data files
     # for importing to prod.
     storage_prod_bucket_name: str = 'datcom-prod-imports'
-    # Spanner instance details for import status.
-    spanner_project_id: str = 'datcom-store'
-    spanner_instance_id: str = 'dc-kg-test'
-    spanner_database_id: str = 'dc_graph_import'
     # Name of the Cloud Storage bucket that the Data Commons importer
     # outputs to.
     storage_importer_bucket_name: str = 'resolved_mcf'
@@ -152,8 +148,6 @@ class ExecutorConfig:
     ignore_validation_status: bool = False
     # Import validation config file path (relative to data repo).
     validation_config_file: str = 'tools/import_validation/validation_config.json'
-    # Latest import version (overwrite)
-    import_version_override: str = ''
     # Maximum time venv creation can take in seconds.
     venv_create_timeout: float = 3600
     # Maximum time downloading a file can take in seconds.
 
@@ -29,7 +29,6 @@
 import time
 import traceback
 from typing import Callable, Dict, Iterable, List, Optional, Tuple
-from google.cloud import spanner
 import datetime
 from enum import Enum
 
@@ -776,42 +775,6 @@ def _invoke_import_job(
             import_summary.import_stats.get('source_data_size', 0))
         return inputs
 
-    def _update_import_status_table(
-            self, import_summary: ImportStatusSummary) -> None:
-        """Updates import job status table in spanner."""
-        logging.info(
-            f'Updating {import_summary.import_name} status to {import_summary.status} in spanner.'
-        )
-        if not self.config.spanner_project_id or not self.config.spanner_instance_id or not self.config.spanner_database_id:
-            return
-        spanner_client = spanner.Client(
-            project=self.config.spanner_project_id,
-            client_options={'quota_project_id': self.config.spanner_project_id})
-        instance = spanner_client.instance(self.config.spanner_instance_id)
-        database = instance.database(self.config.spanner_database_id)
-
-        with database.batch() as batch:
-            columns = [
-                "ImportName", "State", "JobId", "ExecutionTime", "DataVolume",
-                "StatusUpdateTimestamp", "NextRefreshTimestamp", "LatestVersion"
-            ]
-            values = [
-                import_summary.import_name, import_summary.status.name,
-                import_summary.job_id, import_summary.execution_time,
-                import_summary.data_volume, spanner.COMMIT_TIMESTAMP,
-                import_summary.next_refresh, import_summary.latest_version
-            ]
-            # Update import timestamp only if import completed successfully.
-            if import_summary.status == ImportStatus.READY:
-                columns.extend(["DataImportTimestamp"])
-                values.extend([spanner.COMMIT_TIMESTAMP])
-
-            batch.insert_or_update(table="ImportStatus",
-                                   columns=tuple(columns),
-                                   values=[tuple(values)])
-
-        logging.info(f'Updated {import_summary.import_name} status in spanner.')
-
     def _update_latest_version(self, version, output_dir, import_spec,
                                import_summary):
         if self.config.skip_gcs_upload:
@@ -821,25 +784,16 @@ def _update_latest_version(self, version, output_dir, import_spec,
         logging.info(f'Updating import latest version {version}')
         self.uploader.upload_string(
             version,
-            os.path.join(output_dir, self.config.storage_version_filename))
+            os.path.join(output_dir, STAGING_PATH,
+                         self.config.storage_version_filename))
         self.uploader.upload_string(
             self._import_metadata_mcf_helper(import_spec),
-            os.path.join(output_dir, self.config.import_metadata_mcf_filename))
+            os.path.join(output_dir, version,
+                         self.config.import_metadata_mcf_filename))
         self.uploader.upload_string(
             json.dumps(dataclasses.asdict(import_summary), default=str),
-            os.path.join(output_dir, IMPORT_SUMMARY_FILE))
-        # Add current version to the history of versions if import was successful.
-        if self.config.storage_version_history_filename:
-            history_filename = os.path.join(
-                output_dir, self.config.storage_version_history_filename)
-            versions_history = [version]
-            history = self._get_blob_content(history_filename)
-            if history:
-                versions_history.append(history)
-            self.uploader.upload_string('\n'.join(versions_history),
-                                        history_filename)
+            os.path.join(output_dir, version, IMPORT_SUMMARY_FILE))
         logging.info(f'Updated import latest version {version}')
-        self._update_import_status_table(import_summary)
 
     @log_function_call
     def _import_one_helper(
@@ -868,21 +822,14 @@ def _import_one_helper(
                                     self.config.file_download_timeout)
 
         output_dir = f'{relative_import_dir}/{import_name}'
-        version = self.config.import_version_override if self.config.import_version_override else _clean_time(
-            utils.pacific_time())
+        version = _clean_time(utils.pacific_time())
         if version == 'DATE_VERSION_PLACEHOLDER':
             version = datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d")
         import_summary.latest_version = 'gs://' + os.path.join(
             self.config.storage_prod_bucket_name, output_dir, version, '*', '*',
             '*.mcf')
         import_summary.next_refresh = utils.next_utc_timestamp(
             import_spec.get('cron_schedule'))
-        if self.config.import_version_override and self.config.import_version_override != 'DATE_VERSION_PLACEHOLDER':
-            logging.info(f'Import version override {version}')
-            import_summary.status = ImportStatus.READY
-            self._update_latest_version(version, output_dir, import_spec,
-                                        import_summary)
-            return
 
         with tempfile.TemporaryDirectory() as tmpdir:
             requirements_path = os.path.join(absolute_import_dir,
@@ -947,9 +894,6 @@ def _import_one_helper(
                 import_summary.import_stats.get('mcf_data_size', 0) +
                 import_summary.import_stats.get('validation_data_size', 0))
             logging.info(import_summary)
-            self.uploader.upload_string(
-                json.dumps(dataclasses.asdict(import_summary), default=str),
-                os.path.join(output_dir, version, IMPORT_SUMMARY_FILE))
 
             if self.config.ignore_validation_status or validation_status:
                 import_summary.status = ImportStatus.READY
@@ -958,10 +902,7 @@ def _import_one_helper(
                     "Staging latest version update due to validation failure.")
                 import_summary.status = ImportStatus.VALIDATION
 
-            # Update version and metadata files in staging folder for failed imports
-            version_dir = output_dir if import_summary.status == ImportStatus.READY else os.path.join(
-                output_dir, STAGING_PATH)
-            self._update_latest_version(version, version_dir, import_spec,
+            self._update_latest_version(version, output_dir, import_spec,
                                         import_summary)
 
         if self.importer:
 
@@ -2,53 +2,22 @@
 # 
 # This script updates the latest version of an import.
 # 
-# It takes an existing cloud batch job name and a version as input parameters.
-# It fetches the configuration of the job including the import name, modifies the 
-# import_version_override parameter, and submits a new job with the updated configuration.
-#
-# Requirements:
-# - gcloud
-# - jq
-#
-# Usage: ./update_import_version.sh <job_name> <version>
-# Example: ./update_import_version.sh usfed-constantmaturityrates-1755659705 2025_08_15T20_18_20_801877_07_00
+# Usage: ./update_import_version.sh <import_name> <version> <reason>
+# Example: ./update_import_version.sh scripts/us_fed/treasury_constant_maturity_rates:USFed_ConstantMaturityRates_Test 2025_12_17T02_30_27_233484_08_00 'Manual validation'
 
 set -e
 
-if [ "$#" -ne 2 ]; then
-    echo "Usage: $0 <job_name> <version>"
-    echo "Example: $0 usfed-constantmaturityrates-1755659705 2025_08_15T20_18_20_801877_07_00"
+if [ "$#" -ne 3 ]; then
+    echo "Usage: $0 <import_name> <version> <reason>"
     exit 1
 fi
 
-JOB_NAME=$1
+FUNCTION_URL="https://spanner-ingestion-helper-965988403328.us-central1.run.app"
+IMPORT_NAME=$1
 VERSION=$2
-LOCATION="us-central1"
-PROJECT="datcom-import-automation-prod"
-USER_NAME=$(whoami)
-NEW_JOB_NAME="${JOB_NAME}-override-${USER_NAME}"
-TEMP_JSON_FILE=$(mktemp)
-trap 'rm -f -- "$TEMP_JSON_FILE"' EXIT
+REASON=$3
 
-echo "Fetching configuration for job '${JOB_NAME}'..."
-gcloud batch jobs describe "${JOB_NAME}" --location="${LOCATION}" --project="${PROJECT}" --format=json > "${TEMP_JSON_FILE}"
-echo "Updating job configuration with import_version_override: '${VERSION}'"
-# TODO: add check to ensure the version exists on GCS.
-COMMAND_INDEX=$(jq -r '[.taskGroups[0].taskSpec.runnables[0].container.commands[] | startswith("--import_config=")] | index(true)' "${TEMP_JSON_FILE}")  
-IMPORT_CONFIG_COMMAND=$(jq -r ".taskGroups[0].taskSpec.runnables[0].container.commands[${COMMAND_INDEX}]" "${TEMP_JSON_FILE}")
-IMPORT_CONFIG_JSON=$(echo "${IMPORT_CONFIG_COMMAND}" | sed 's/^--import_config=//')
-NEW_IMPORT_CONFIG_JSON=$(echo "${IMPORT_CONFIG_JSON}" | jq -c --arg version "${VERSION}" '. + {import_version_override: $version}')
-NEW_COMMAND="--import_config=${NEW_IMPORT_CONFIG_JSON}"
-
-UPDATED_CONFIG_FILE=$(mktemp)
-trap 'rm -f -- "$TEMP_JSON_FILE" "$UPDATED_CONFIG_FILE"' EXIT 
-
-jq --arg new_command "${NEW_COMMAND}" '.taskGroups[0].taskSpec.runnables[0].container.commands[1] = $new_command' "${TEMP_JSON_FILE}" > "${UPDATED_CONFIG_FILE}"
-
-echo "Submitting new job '${NEW_JOB_NAME}'..."
-gcloud batch jobs submit "${NEW_JOB_NAME}" \
-    --location="${LOCATION}" \
-    --project="${PROJECT}" \
-    --config="${UPDATED_CONFIG_FILE}"
-
-echo "Successfully submitted new job: ${NEW_JOB_NAME}"
+curl -X POST "${FUNCTION_URL}" \
+  -H "Authorization: bearer $(gcloud auth print-identity-token)" \
+  -H "Content-Type: application/json" \
+  -d "{\"actionType\": \"update_import_version\", \"importName\": \"${IMPORT_NAME}\", \"version\": \"${VERSION}\", \"reason\": \"${REASON}\"}"
@@ -80,6 +80,18 @@ main:
                 result: function_response
             - fail_workflow:
                 raise: ${e}
+    - update_import_version:
+        call: http.post
+        args:
+          url: ${function_url}
+          auth:
+            type: OIDC
+          body:
+            actionType: update_import_version
+            importName: ${args.importName}
+            version: 'staging'
+            reason: 'import-automation'
+        result: function_response
     - returnResult:
         return:
           jobId: ${jobId}
 
@@ -0,0 +1,149 @@
+"""Utility functions for the ingestion helper."""
+
+import logging
+import croniter
+from datetime import datetime, timezone
+import base64
+from googleapiclient.discovery import build
+import json
+from googleapiclient.errors import HttpError
+from google.oauth2 import id_token
+from google.auth.transport import requests
+from google.auth import jwt
+
+
+def get_caller_identity(request):
+    """Extracts the caller's email from the Authorization header (JWT)."""
+    auth_header = request.headers.get('Authorization')
+    if auth_header:
+        parts = auth_header.split()
+        if len(parts) == 2 and parts[0].lower() == 'bearer':
+            token = parts[1]
+            unverified_claims = {}
+            try:
+                unverified_claims = jwt.decode(token, verify=False)
+                #logging.info(f"Token claims (unverified): iss={unverified_claims.get('iss')}, aud={unverified_claims.get('aud')}, email={unverified_claims.get('email')}")
+                logging.warning(
+                    f"Could not decode unverified token for debugging: {debug_e}"
+                )
+                id_info = id_token.verify_oauth2_token(token,
+                                                       requests.Request())
+                return id_info.get('email', 'unknown_email')
+            except Exception as e:
+                if unverified_claims:
+                    email = unverified_claims.get('email', 'unknown_email')
+                    return f"{email} (unverified)"
+                return 'decode_error'
+        else:
+            logging.warning(
+                f"Invalid Authorization header format. Parts: {len(parts)}")
+    else:
+        logging.warning("No Authorization header received.")
+    return 'no_auth_header'
+
+
+def get_import_params(request_json) -> dict:
+    """Extracts and calculates import parameters from the request JSON.
+
+    Args:
+        request_json: A dictionary containing request parameters.
+
+    Returns:
+        A dictionary with keys: job_id, duration, data, version, next_refresh, status.
+    """
+    import_name = request_json.get('importName', '')
+    status = request_json.get('status', '')
+    job_id = request_json.get('jobId', '')
+    duration = request_json.get('duration', 0)
+    data = request_json.get('data', 0)
+    version = request_json.get('version', '')
+    schedule = request_json.get('schedule', '')
+    next_refresh = datetime.now(timezone.utc)
+    try:
+        next_refresh = croniter.croniter(schedule, datetime.now(
+            timezone.utc)).get_next(datetime)
+    except (croniter.CroniterError) as e:
+        logging.error(
+            f"Error calculating next refresh from schedule '{schedule}': {e}")
+    return {
+        'import_name': import_name,
+        'status': status,
+        'job_id': job_id,
+        'duration': duration,
+        'data': data,
+        'version': version,
+        'next_refresh': next_refresh
+    }
+
+
+def create_import_params(summary) -> dict:
+    """Creates import parameters from the import summary.
+
+    Args:
+        summary: A dictionary containing import summary details.
+
+    Returns:
+        A dictionary with keys: job_id, duration, data, version, next_refresh, status.
+    """
+    import_name = summary.get('import_name', '')
+    status = summary.get('status', '').removeprefix('ImportStatus.')
+    job_id = summary.get('job_id', '')
+    duration = summary.get('execution_time', 0)
+    data = summary.get('data_volume', 0)
+    version = summary.get('latest_version', '')
+    next_refresh_str = summary.get('next_refresh', '')
+    next_refresh = None
+    if next_refresh_str:
+        try:
+            next_refresh = datetime.fromisoformat(next_refresh_str)
+        except ValueError:
+            logging.error(f"Error parsing next_refresh: {next_refresh_str}")
+
+    return {
+        'import_name': import_name,
+        'status': status,
+        'job_id': job_id,
+        'duration': duration,
+        'data': data,
+        'version': version,
+        'next_refresh': next_refresh,
+    }
+
+
+def get_ingestion_metrics(project_id, location, job_id):
+    """Fetches graph metrics (nodes, edges, observations) from a Dataflow job.
+
+    Args:
+        project_id: The GCP project ID.
+        location: The location of the Dataflow job.
+        job_id: The Dataflow job ID.
+
+    Returns:
+        A dictionary containing 'obs_count', 'node_count', and 'edge_count'.
+    """
+    dataflow = build('dataflow', 'v1b3', cache_discovery=False)
+    # Fetch Dataflow metrics
+    node_count = 0
+    edge_count = 0
+    obs_count = 0
+    if project_id and job_id:
+        try:
+            metrics = dataflow.projects().locations().jobs().getMetrics(
+                projectId=project_id, location=location,
+                jobId=job_id).execute()
+            for metric in metrics.get('metrics', []):
+                name = metric['name']['name']
+                if name == 'graph_node_count':
+                    node_count = int(metric['scalar'])
+                elif name == 'graph_edge_count':
+                    edge_count = int(metric['scalar'])
+                elif name == 'graph_observation_count':
+                    obs_count = int(metric['scalar'])
+        except HttpError as e:
+            logging.error(
+                f"Error fetching dataflow metrics for job {job_id}: {e}")
+    return {
+        'obs_count': obs_count,
+        'node_count': node_count,
+        'edge_count': edge_count
+    }