Skip to content

Commit ee1e0b2

Browse files
authored
Merge branch 'datacommonsorg:master' into master
2 parents 34bb357 + 6c55a97 commit ee1e0b2

103 files changed

Lines changed: 14788 additions & 2847 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

import-automation/executor/Dockerfile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,10 @@ fonts-liberation \
3636
xdg-utils \
3737
chromium \
3838
chromium-driver \
39-
p7zip-full
39+
p7zip-full \
40+
libeccodes-dev \
41+
libeccodes-tools \
42+
&& rm -rf /var/lib/apt/lists/*
4043
# Install the Google Cloud CLI
4144
RUN apt-get update && \
4245
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg && \

import-automation/executor/app/configs.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,8 @@ class ExecutorConfig:
153153
validation_config_file: str = 'tools/import_validation/validation_config.json'
154154
# Latest import version (overwrite)
155155
import_version_override: str = ''
156+
# Override cron schedule in import spec.
157+
cron_schedule_override: str = ''
156158
# Relative path to version folder for graph files.
157159
graph_data_path: str = '/*/*/*.mcf'
158160
# Maximum time venv creation can take in seconds.
@@ -169,6 +171,8 @@ class ExecutorConfig:
169171
disable_email_notifications: bool = True
170172
# Skip uploading the data to GCS (for local testing).
171173
skip_gcs_upload: bool = False
174+
# Skip uploading input files to GCS.
175+
skip_input_upload: bool = False
172176
# Maximum time a blocking call to the importer to
173177
# perform an import can take in seconds.
174178
importer_import_timeout: float = 20 * 60

import-automation/executor/app/executor/import_executor.py

Lines changed: 139 additions & 128 deletions
Large diffs are not rendered by default.

import-automation/executor/app/service/file_uploader.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -82,12 +82,8 @@ def upload_file(self, src: str, dest: str) -> None:
8282
"""
8383
_strings_not_empty(src, dest)
8484
dest = self._fix_path(dest)
85-
logging.info('GCSFileUploader.upload_file: Uploading %s to %s', src,
86-
dest)
8785
blob = self.bucket.blob(dest)
8886
blob.upload_from_filename(src)
89-
logging.info('GCSFileUploader.upload_file: Uploaded %s to %s', src,
90-
dest)
9187

9288
def upload_string(self, string: str, dest: str) -> None:
9389
"""Uploads a string to a file in the bucket, overwriting it.
@@ -102,10 +98,8 @@ def upload_string(self, string: str, dest: str) -> None:
10298
"""
10399
_strings_not_empty(dest)
104100
dest = self._fix_path(dest)
105-
logging.info('GCSFileUploader.upload_string: Uploading to %s', dest)
106101
blob = self.bucket.blob(dest)
107102
blob.upload_from_string(string)
108-
logging.info('GCSFileUploader.upload_string: Uploaded to %s', dest)
109103

110104
def _fix_path(self, path):
111105
"""Returns {self.path_prefix}/{path}."""

import-automation/executor/cloudbuild.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,6 @@ steps:
4646
python import_test.py
4747
env:
4848
- 'PROJECT_ID=$PROJECT_ID'
49-
- 'LOCATION=$LOCATION'
5049
- 'GCS_BUCKET=${_GCS_BUCKET}'
5150
- 'IMAGE_URI=${_DOCKER_IMAGE}:${COMMIT_SHA}'
5251
dir: 'import-automation/executor'

import-automation/executor/requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ beautifulsoup4
66
chardet
77
chromedriver_py
88
croniter
9+
curl_cffi
910
dataclasses
1011
datacommons
1112
datacommons_client
@@ -41,6 +42,7 @@ omegaconf
4142
prettytable
4243
protobuf
4344
psutil
45+
pygrib
4446
pylint
4547
pyspellchecker
4648
pytest

import-automation/executor/run_import.sh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -254,11 +254,11 @@ function build_docker {
254254
# Get the latest import output from GCS
255255
function get_latest_gcs_import_output {
256256
echo_log "Looking for import files on GCS at $GCS_BUCKET/$IMPORT_DIR/$IMPORT_NAME..."
257-
LATEST_VERSION=$(gsutil cat gs://$GCS_BUCKET/$IMPORT_DIR/$IMPORT_NAME/latest_version.txt)
258-
if [[ -n "$LATEST_VERSION" ]]; then
259-
echo_log "latest_version.txt: $LATEST_VERSION"
260-
run_cmd gsutil ls -lR gs://$GCS_BUCKET/$IMPORT_DIR/$IMPORT_NAME/$LATEST_VERSION
261-
echo_log "View latest import files at: https://pantheon.corp.google.com/storage/browser/$GCS_BUCKET/$IMPORT_DIR/$IMPORT_NAME/$LATEST_VERSION"
257+
STAGING_VERSION=$(gsutil cat gs://$GCS_BUCKET/$IMPORT_DIR/$IMPORT_NAME/staging_version.txt)
258+
if [[ -n "$STAGING_VERSION" ]]; then
259+
echo_log "staging_version.txt: $STAGING_VERSION"
260+
run_cmd gsutil ls -lR gs://$GCS_BUCKET/$IMPORT_DIR/$IMPORT_NAME/$STAGING_VERSION
261+
echo_log "View latest import files at: https://pantheon.corp.google.com/storage/browser/$GCS_BUCKET/$IMPORT_DIR/$IMPORT_NAME/$STAGING_VERSION"
262262
else
263263
echo_log "No files on GCS at $GCS_BUCKET/$IMPORT_DIR/$IMPORT_NAME"
264264
fi
@@ -317,7 +317,7 @@ function get_import_config {
317317
IMPORT_VERSION=$(cut -d: -f2 <<< "$ver_override")
318318
get_latest_gcs_import_output
319319
# Add config to update version.
320-
add_import_version_notes "$IMPORT_VERSION" "Updating latest $IMPORT_NAME from: $LATEST_VERSION to: $IMPORT_VERSION, $NOTE"
320+
add_import_version_notes "$IMPORT_VERSION" "Updating latest $IMPORT_NAME from: $STAGING_VERSION to: $IMPORT_VERSION, $NOTE"
321321
fi
322322
for c_v in $config_vals; do
323323
param=$(cut -d: -f1 <<< "$c_v")

import-automation/workflow/cloudbuild.yaml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ substitutions:
2121
_SPANNER_PROJECT_ID: 'datcom-store'
2222
_SPANNER_INSTANCE_ID: 'dc-kg-test'
2323
_SPANNER_DATABASE_ID: 'dc_graph_import'
24+
_SPANNER_GRAPH_DATABASE_ID: 'dc_graph_2025_11_07'
2425
_GCS_BUCKET_ID: 'datcom-prod-imports'
2526
_LOCATION: 'us-central1'
2627
_GCS_MOUNT_BUCKET: 'datcom-volume-mount'
@@ -33,15 +34,15 @@ steps:
3334

3435
- id: 'spanner-ingestion-workflow'
3536
name: 'gcr.io/cloud-builders/gcloud'
36-
args: ['workflows', 'deploy', 'spanner-ingestion-workflow', '--project', '${_PROJECT_ID}', '--location', '${_LOCATION}', '--source', 'spanner-ingestion-workflow.yaml', '--set-env-vars', 'LOCATION=${_LOCATION},PROJECT_ID=${_PROJECT_ID},SPANNER_PROJECT_ID=${_SPANNER_PROJECT_ID},SPANNER_INSTANCE_ID=${_SPANNER_INSTANCE_ID},SPANNER_DATABASE_ID=${_SPANNER_DATABASE_ID}']
37+
args: ['workflows', 'deploy', 'spanner-ingestion-workflow', '--project', '${_PROJECT_ID}', '--location', '${_LOCATION}', '--source', 'spanner-ingestion-workflow.yaml', '--set-env-vars', 'LOCATION=${_LOCATION},PROJECT_ID=${_PROJECT_ID},SPANNER_PROJECT_ID=${_SPANNER_PROJECT_ID},SPANNER_INSTANCE_ID=${_SPANNER_INSTANCE_ID},SPANNER_DATABASE_ID=${_SPANNER_GRAPH_DATABASE_ID}']
3738

3839
- id: 'spanner-ingestion-helper'
3940
name: 'gcr.io/cloud-builders/gcloud'
40-
args: ['functions', 'deploy', 'spanner-ingestion-helper', '--gen2', '--project', '${_PROJECT_ID}', '--region', '${_LOCATION}', '--runtime', 'python312', '--source', 'ingestion-helper', '--no-allow-unauthenticated', '--trigger-http', '--entry-point', 'ingestion_helper', '--set-env-vars', 'PROJECT_ID=${_PROJECT_ID},SPANNER_PROJECT_ID=${_SPANNER_PROJECT_ID},SPANNER_INSTANCE_ID=${_SPANNER_INSTANCE_ID},SPANNER_DATABASE_ID=${_SPANNER_DATABASE_ID},GCS_BUCKET_ID=${_GCS_BUCKET_ID},LOCATION=${_LOCATION}']
41+
args: ['functions', 'deploy', 'spanner-ingestion-helper', '--gen2', '--project', '${_PROJECT_ID}', '--region', '${_LOCATION}', '--runtime', 'python312', '--source', 'ingestion-helper', '--no-allow-unauthenticated', '--trigger-http', '--entry-point', 'ingestion_helper', '--set-env-vars', 'PROJECT_ID=${_PROJECT_ID},SPANNER_PROJECT_ID=${_SPANNER_PROJECT_ID},SPANNER_INSTANCE_ID=${_SPANNER_INSTANCE_ID},SPANNER_DATABASE_ID=${_SPANNER_DATABASE_ID},SPANNER_GRAPH_DATABASE_ID=${_SPANNER_GRAPH_DATABASE_ID},GCS_BUCKET_ID=${_GCS_BUCKET_ID},LOCATION=${_LOCATION}']
4142

4243
- id: 'import-aggregation-helper'
4344
name: 'gcr.io/cloud-builders/gcloud'
44-
args: ['functions', 'deploy', 'import-aggregation-helper', '--runtime', 'python312', '--source', 'aggregation-helper', '--no-allow-unauthenticated', '--trigger-http', '--entry-point', 'aggregation_helper', '--project', '${_PROJECT_ID}', '--set-env-vars', 'PROJECT_ID=${_PROJECT_ID},SPANNER_PROJECT_ID=${_SPANNER_PROJECT_ID},SPANNER_INSTANCE_ID=${_SPANNER_INSTANCE_ID},SPANNER_DATABASE_ID=${_SPANNER_DATABASE_ID},GCS_BUCKET_ID=${_GCS_BUCKET_ID},LOCATION=${_LOCATION},BQ_DATASET_ID=${_BQ_DATASET_ID}']
45+
args: ['functions', 'deploy', 'import-aggregation-helper', '--runtime', 'python312', '--source', 'aggregation-helper', '--no-allow-unauthenticated', '--trigger-http', '--entry-point', 'aggregation_helper', '--project', '${_PROJECT_ID}', '--set-env-vars', 'PROJECT_ID=${_PROJECT_ID},SPANNER_PROJECT_ID=${_SPANNER_PROJECT_ID},SPANNER_INSTANCE_ID=${_SPANNER_INSTANCE_ID},SPANNER_DATABASE_ID=${_SPANNER_GRAPH_DATABASE_ID},GCS_BUCKET_ID=${_GCS_BUCKET_ID},LOCATION=${_LOCATION},BQ_DATASET_ID=${_BQ_DATASET_ID}']
4546

4647
- id: 'import-automation-helper'
4748
name: 'gcr.io/cloud-builders/gcloud'

import-automation/workflow/cloudbuild_main.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ substitutions:
2121
_SPANNER_PROJECT_ID: 'datcom-ci'
2222
_SPANNER_INSTANCE_ID: 'datcom-spanner-test'
2323
_SPANNER_DATABASE_ID: 'dc-test-db'
24+
_SPANNER_GRAPH_DATABASE_ID: 'dc-test-db'
2425
_GCS_BUCKET_ID: 'datcom-ci-test'
2526
_GCS_MOUNT_BUCKET: 'datcom-ci-test'
2627
_BQ_DATASET_ID: 'datacommons'

import-automation/workflow/import-automation-workflow.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@ main:
2121
memory: 32768
2222
disk: 100
2323
- resources: ${default(map.get(args, "resources"), defaultResources)}
24+
- runIngestion: ${default(map.get(args, "runIngestion"), false)}
25+
- ingestionArgs:
26+
importList:
27+
- ${text.split(importName, ":")[1]}
2428
- runImportJob:
2529
try:
2630
call: googleapis.batch.v1.projects.locations.jobs.create
@@ -99,6 +103,18 @@ main:
99103
override: false
100104
comment: '${"import-workflow:" + sys.get_env("GOOGLE_CLOUD_WORKFLOW_EXECUTION_ID")}'
101105
result: functionResponse
106+
- runIngestion:
107+
switch:
108+
- condition: ${runIngestion}
109+
steps:
110+
- runSpannerIngestion:
111+
call: googleapis.workflowexecutions.v1.projects.locations.workflows.executions.create
112+
args:
113+
parent: ${"projects/" + projectId + "/locations/" + region + "/workflows/spanner-ingestion-workflow"}
114+
body:
115+
argument: ${json.encode_to_string(ingestionArgs)}
116+
connector_params:
117+
skip_polling: true
102118
- returnResult:
103119
return:
104120
jobId: ${jobId}

0 commit comments

Comments (0)