Skip to content

Commit 712e6a0

Browse files
authored
Merge branch 'master' into internet_access
2 parents f87304c + 22f9701 commit 712e6a0

176 files changed

Lines changed: 16674 additions & 956 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

go.mod

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,13 @@ module github.com/datacommonsorg/data
33
go 1.14
44

55
require (
6-
cloud.google.com/go/storage v1.10.0 // indirect
7-
google.golang.org/grpc v1.29.1 // indirect
8-
googlemaps.github.io/maps v1.2.2 // indirect
6+
github.com/google/go-cmp v0.5.9 // indirect
7+
github.com/google/uuid v1.3.0 // indirect
8+
github.com/kr/pretty v0.3.0 // indirect
9+
github.com/rogpeppe/go-internal v1.9.0 // indirect
10+
github.com/stretchr/testify v1.8.3 // indirect
11+
go.opencensus.io v0.24.0 // indirect
12+
golang.org/x/time v0.3.0 // indirect
13+
googlemaps.github.io/maps v1.2.2
14+
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
915
)

go.sum

Lines changed: 42 additions & 253 deletions
Large diffs are not rendered by default.

import-automation/executor/Dockerfile

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,7 @@ ENV GOOGLE_CLOUD_LOGGING_MAX_LATENCY=5.0
5353
ENV GOOGLE_CLOUD_LOGGING_GRACE_PERIOD=30.0
5454
ENV GRPC_VERBOSITY=ERROR
5555

56-
RUN wget https://storage.googleapis.com/datacommons_public/import_tools/import-tool.jar
56+
ADD https://storage.googleapis.com/datacommons_public/import_tools/import-tool.jar /import-tool.jar
5757
COPY requirements.txt .
5858
RUN pip install -r requirements.txt
5959

@@ -63,7 +63,8 @@ COPY --from=data . /data
6363

6464
# build_type: cloud (clone github data repo)
6565
FROM base as cloud
66-
RUN git clone https://github.com/datacommonsorg/data.git
66+
ARG CACHE_BUSTER=1
67+
RUN echo "Cache buster: $CACHE_BUSTER" && git clone https://github.com/datacommonsorg/data.git
6768

6869
# build the final image
6970
FROM ${build_type} as final

import-automation/executor/app/configs.py

Lines changed: 13 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,7 @@ class ExecutorConfig:
3434

3535
# ID of the Google Cloud project that hosts the executor. The project
3636
# needs to enable App Engine and Cloud Scheduler.
37-
gcp_project_id: str = 'datcom-import-automation'
37+
gcp_project_id: str = 'datcom-import-automation-prod'
3838
# ID of the Google Cloud project that stores generated CSVs and MCFs. The
3939
# project needs to enable Cloud Storage and give the service account the
4040
# executor uses sufficient permissions to read and write the bucket below.
@@ -55,6 +55,10 @@ class ExecutorConfig:
5555
# Name of the Cloud Storage bucket to store the generated data files
5656
# for importing to dev.
5757
storage_dev_bucket_name: str = 'unresolved_mcf'
58+
# DataCommons API key
59+
dc_api_key: str = ''
60+
# Gemini API key
61+
gemini_api_key: str = ''
5862
# Executor output prefix in the storage_dev_bucket_name bucket.
5963
storage_executor_output_prefix: str = 'datcom-dev-imports'
6064
# Name of the file that specifies the most recently generated data files
@@ -74,6 +78,10 @@ class ExecutorConfig:
7478
# The content of latest_version.txt would be a single line of
7579
# '2020_07_15T12_07_17_365264_07_00'.
7680
storage_version_filename: str = 'latest_version.txt'
81+
# GCP secret name containing import config.
82+
import_config_secret: str = 'import-config'
83+
# Config override file.
84+
config_override_file: str = ''
7785
# File with list of historical versions with the most recent at the top
7886
storage_version_history_filename: str = 'version_history.txt'
7987
# Name of the file that contains the import_metadata_mcf for the import.
@@ -130,7 +138,10 @@ class ExecutorConfig:
130138
# Arguments for the user script
131139
user_script_args: List[str] = ()
132140
# Environment variables for the user script
133-
user_script_env: dict = None
141+
user_script_env: dict = dataclasses.field(default_factory=lambda: {
142+
"EXISTING_STATVAR_MCF":
143+
"gs://unresolved_mcf/scripts/statvar/stat_vars.mcf"
144+
})
134145
# Invoke import tool genmcf.
135146
invoke_import_tool: bool = True
136147
# Invoke differ tool.

import-automation/executor/app/executor/cloud_batch.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -151,6 +151,7 @@ def create_job_request(import_name: str, import_config: dict, import_spec: dict,
151151

152152
resources["cpu"] = resources["cpu"] * 1000
153153
resources["memory"] = resources["memory"] * 1024
154+
schedule = import_spec.get('cron_schedule')
154155
import_config_string = json.dumps(import_config)
155156
job_name = import_name.split(':')[1]
156157
job_name = job_name.replace("_", "-").lower()
@@ -159,7 +160,8 @@ def create_job_request(import_name: str, import_config: dict, import_spec: dict,
159160
"importName": import_name,
160161
"importConfig": import_config_string,
161162
"resources": resources,
162-
"timeout": timeout
163+
"timeout": timeout,
164+
"schedule": schedule
163165
}
164166
argument_string = json.dumps(argument_payload)
165167
final_payload = {
@@ -198,7 +200,7 @@ def execute_cloud_batch_job(project_id: str, location: str, job_name: str,
198200
runnable.container.image_uri = image_uri
199201
runnable.container.commands = [
200202
f"--import_name={import_name}",
201-
f'--import_config={json.dumps({"gcs_project_id": project_id, "storage_prod_bucket_name": gcs_bucket, "spanner_project_id": project_id, "spanner_instance_id": spanner_instance, "spanner_database_id": spanner_db})}'
203+
f'--import_config={json.dumps({"gcp_project_id": project_id, "gcs_project_id": project_id, "storage_prod_bucket_name": gcs_bucket, "spanner_project_id": project_id, "spanner_instance_id": spanner_instance, "spanner_database_id": spanner_db})}'
202204
]
203205

204206
# We can specify what resources are requested by a task.

0 commit comments

Comments
 (0)