Skip to content

Commit

Permalink
Integrate with latest cpg-infra changes, esp. constructing private re…
Browse files Browse the repository at this point in the history
…po url from ArtifactRegistry.
  • Loading branch information
milo-hyben committed Sep 26, 2023
1 parent 51055e5 commit 9700ec8
Show file tree
Hide file tree
Showing 4 changed files with 189 additions and 58 deletions.
13 changes: 6 additions & 7 deletions etl/test/test_etl_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

import etl.load.main


ETL_SAMPLE_RECORD_1 = """
{
"identifier": "AB0002",
Expand All @@ -20,10 +19,10 @@
ETL_SAMPLE_RECORD_2 = """
{
"sample_id": "123456",
"external_id": "GRK100311",
"individual_id": "608",
"external_id": "AAA000000",
"individual_id": "678",
"sequencing_type": "exome",
"collection_centre": "KCCG",
"collection_centre": "ABCDEF",
"collection_date": "2023-08-05T01:39:28.611476",
"collection_specimen": "blood"
}
Expand Down Expand Up @@ -54,10 +53,10 @@
"data":
{
"sample_id": "123456",
"external_id": "GRK100311",
"individual_id": "608",
"external_id": "AAA000000",
"individual_id": "678",
"sequencing_type": "exome",
"collection_centre": "KCCG",
"collection_centre": "ABCDEF",
"collection_date": "2023-08-05T01:39:28.611476",
"collection_specimen": "blood"
}
Expand Down
4 changes: 3 additions & 1 deletion metamist_infrastructure/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,14 @@ pip install --editable .

### Env variables

- set environment variable METAMIST_INFRA_SLACK_CHANNEL, METAMIST_INFRA_SLACK_TOKEN_SECRET_NAME and METAMIST_INFRA_GCP_PROJECT, e.g.:
- Set the environment variables `METAMIST_INFRA_SLACK_CHANNEL`, `METAMIST_INFRA_SLACK_TOKEN_SECRET_NAME`, `METAMIST_INFRA_GCP_PROJECT` and `METAMIST_INFRA_ETL_PRIVATE_REPO_NAME`, e.g.:

```bash
export METAMIST_INFRA_SLACK_CHANNEL='dev-channel'
export METAMIST_INFRA_SLACK_TOKEN_SECRET_NAME='dev-slack-secret'
export METAMIST_INFRA_GCP_PROJECT='gcp-project-name'
export METAMIST_INFRA_ETL_PRIVATE_REPO_NAME='python-registry'

```

### Deploy stack
Expand Down
64 changes: 47 additions & 17 deletions metamist_infrastructure/__main__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
"""Metamist Test Python Pulumi program"""
"""
Metamist Test Python Pulumi program
This is a test program that deploys the Metamist infrastructure to
a development gcloud project.
"""
import os
from typing import NamedTuple

from cpg_infra.config import CPGInfrastructureConfig
from metamist_infrastructure import MetamistInfrastructure

GCP_PROJECT = os.getenv('METAMIST_INFRA_GCP_PROJECT')
SLACK_CHANNEL = os.getenv('METAMIST_INFRA_SLACK_CHANNEL')
SLACK_TOKEN_SECRET_NAME = os.getenv('METAMIST_INFRA_SLACK_TOKEN_SECRET_NAME')
ETL_PRIVATE_REPO_URL = os.getenv('METAMIST_INFRA_ETL_PRIVATE_REPO_URL')
ETL_PRIVATE_REPO_NAME = os.getenv('METAMIST_INFRA_ETL_PRIVATE_REPO_NAME')

# simple cpg-infra configuration
conf_dict = {
Expand All @@ -33,20 +40,18 @@
},
'etl_accessors': ['bbv'],
'slack_channel': SLACK_CHANNEL,
# TODO: comment out below once CPG_INFRA is updated
# 'etl_environment': 'DEVELOPMENT',
# 'etl_parser_default_config': {
# # Order of config overrides:
# # 1. parser default config values
# # 2. etl_load_default_config
# # 3. config from payload
# 'project': 'milo-dev',
# 'default_sequencing_type': 'genome',
# 'default_sequencing_technology': 'long-read',
# 'default_sample_type': 'blood',
# },
# 'etl_private_repo_url': ETL_PRIVATE_REPO_URL,
# 'etl_private_repo_packages': ['metamist_private'],
'etl_environment': 'DEVELOPMENT',
'etl_parser_default_config': {
# Order of config overrides:
# 1. parser default config values
# 2. etl_parser_default_config (this dict)
# 3. config from payload
'project': 'milo-dev',
'default_sequencing_type': 'genome',
'default_sequencing_technology': 'long-read',
'default_sample_type': 'blood',
},
'etl_private_repo_packages': ['metamist_private'],
},
'billing': {
'coordinator_machine_account': '',
Expand All @@ -67,6 +72,31 @@
# construct cpg-infra config
conf = CPGInfrastructureConfig.from_dict(conf_dict)

# set up a dummy infrastructure object, so that the private repo url can be constructed
class DummyRegistry(NamedTuple):
    """Dummy Registry

    This is used to mock up the registry object. It only carries the
    location, project and name of the registry, which is what is needed
    to construct the private repo url.
    """

    # location of the registry, e.g. 'australia-southeast1'
    location: str
    # GCP project that hosts the registry
    project: str
    # name of the registry (repository)
    name: str

class DummyInfra(NamedTuple):
    """Dummy Infrastructure

    This is used to mock up the infrastructure object. It exposes a single
    attribute, the python registry, so the private repo url can be built
    from it.
    """

    # stand-in for the python registry of the infrastructure
    gcp_python_registry: DummyRegistry

infrastructure = DummyInfra(
DummyRegistry(
location='australia-southeast1',
project=GCP_PROJECT,
name=ETL_PRIVATE_REPO_NAME,
)
)

# deploy metamist_infrastructure driver
setup_obj = MetamistInfrastructure(conf)
setup_obj = MetamistInfrastructure(infrastructure, conf)
setup_obj.main()
166 changes: 133 additions & 33 deletions metamist_infrastructure/driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
import pulumi_gcp as gcp
from cpg_infra.plugin import CpgInfrastructurePlugin
from cpg_infra.utils import archive_folder

from metamist_infrastructure.slack_notification import (
SlackNotification,
SlackNotificationConfig,
Expand All @@ -31,6 +30,7 @@ def append_private_repositories_to_requirements(
"""
Append private repositories to requirements.txt
"""

with open(filename, encoding='utf-8') as file:
file_content = file.read()
if private_repo_url and private_repos:
Expand All @@ -43,6 +43,80 @@ def append_private_repositories_to_requirements(
return pulumi.StringAsset(file_content)


# def append_private_repositories_to_requirements(
# filename: str,
# private_repo_url: str | None,
# private_repos: list[str] | None,
# ) -> pulumi.Asset:
# """
# Append private repositories to requirements.txt
# """
# with open(filename, encoding='utf-8') as file:
# file_content = file.read()

# if not (private_repo_url and private_repos):
# # there is no private repo to be added
# return pulumi.StringAsset(file_content)

# # # we need to use pulumi outputs to combine the content
# file_content_output = pulumi.Output.from_input(file_content)

# # If private_repo_url and private_repos are not Outputs, we make them Outputs
# private_repo_url_output = pulumi.Output.from_input(private_repo_url)
# private_repos_output = pulumi.Output.from_input(private_repos)

# # You can convert Outputs to strings like this
# file_content_output_str = file_content_output.apply(str)
# private_repo_url_str = private_repo_url_output.apply(str)
# private_repos_str = pulumi.Output.all(private_repos_output).apply(
# lambda private_repos: '\n'.join(map(str, private_repos))
# )

# comb = f'{file_content_output_str}\n--extra-index-url {private_repo_url_str}\n{private_repos_str}'

# print("comb:", comb)

# # private_repo_url_output.apply(
# # lambda x: print("\n\n ==== \n private_repo_url_output:", x)
# # )

# res = private_repo_url_output.apply(
# lambda x, file_content=file_content: file_content + "\n--extra-index-url " + x
# )

# print("res:", res)

# return pulumi.StringAsset(
# f'{file_content_output_str}\n--extra-index-url {private_repo_url_str}\n{private_repos_str}'
# )


# # we need to use pulumi outputs to combine the content
# file_content_output = pulumi.Output.from_input(file_content)

# # If private_repo_url and private_repos are not Outputs, we make them Outputs
# private_repo_url_output = pulumi.Output.from_input(private_repo_url)
# private_repos_output = pulumi.Output.from_input('\n'.join(private_repos))

# final_content = pulumi.Output.all(
# file_content_output, private_repo_url_output, private_repos_output
# ).apply(lambda args: args[0] + '\n--extra-index-url ' + args[1] + '\n' + args[2])

# return pulumi.StringAsset(final_content.apply(lambda x: x))

# asset = file_content_output.apply(
# lambda file_content: private_repo_url_output.apply(
# lambda private_repo_url: private_repos_output.apply(
# lambda private_repos: pulumi.StringAsset(
# f'{file_content}\n--extra-index-url {private_repo_url}\n{private_repos}'
# )
# )
# )
# )

# return asset


class MetamistInfrastructure(CpgInfrastructurePlugin):
"""
Metamist Infrastructure (as code) for Pulumi
Expand All @@ -53,14 +127,6 @@ def main(self):
# todo, eventually configure metamist cloud run server
# to be deployed here, but for now it's manually deployed

# TODO: the following should be added to SampleMetadataConfig in cpg_infra
# pylint: disable=attribute-defined-outside-init
self.extra_sample_metadata_config = {
'etl_private_repo_url': None,
'etl_private_repo_packages': None,
'etl_environment': 'DEVELOPMENT',
'etl_parser_default_config': None,
}
self._setup_etl()

@cached_property
Expand Down Expand Up @@ -297,14 +363,14 @@ def etl_bigquery_dataset(self):
),
)

def _setup_bq_table(self, schema_file_name: Path, table_name: str):
def _setup_bq_table(self, schema_file_name: Path, table_id: str, name_suffix: str):
"""Setup Bigquery table"""
with open(schema_file_name) as f:
schema = f.read()

etl_table = gcp.bigquery.Table(
f'metamist-etl-bigquery-table-{table_name}',
table_id=f'etl-{table_name}',
f'metamist-etl-bigquery-table{name_suffix}',
table_id=table_id,
dataset_id=self.etl_bigquery_dataset.dataset_id,
labels={'project': 'metamist'},
schema=schema,
Expand All @@ -318,16 +384,17 @@ def _setup_bq_table(self, schema_file_name: Path, table_name: str):
@cached_property
def etl_bigquery_table(self):
"""
Bigquery table to contain the etl data
Bigquery table to contain the etl data,
for compatibility with the old etl, we do not suffix the resource name
"""
return self._setup_bq_table(PATH_TO_ETL_BQ_SCHEMA, 'data')
return self._setup_bq_table(PATH_TO_ETL_BQ_SCHEMA, 'etl-data', '')

@cached_property
def etl_bigquery_log_table(self):
"""
Bigquery table to contain the etl logs
Bigquery table to contain the etl logs; '-logs' is appended to the resource name
"""
return self._setup_bq_table(PATH_TO_ETL_BQ_LOG_SCHEMA, 'logs')
return self._setup_bq_table(PATH_TO_ETL_BQ_LOG_SCHEMA, 'etl-logs', '-logs')

def prepare_service_account_policy_data(self, role):
"""
Expand Down Expand Up @@ -451,14 +518,56 @@ def etl_extract_function(self):

@cached_property
def etl_load_function(self):
"""etl_load_function"""
return self._etl_function('load', self.etl_load_service_account, True)
"""
Setup etl_load_function
It requires private repository to be included,
so we need to wrap it with the apply function, as the private repo url is a Pulumi Output
"""
return self._private_repo_url().apply(
lambda url: self._etl_function('load', self.etl_load_service_account, url)
)

def _private_repo_url(self):
    r"""
    Build the pip extra-index-url of the private python registry.

    Pulumi does not support config for pip (esp. [global]), so the url
    has to be constructed manually from the registry location, project
    and name. For reference, a gcloud command like this:

        gcloud artifacts print-settings python \
            --project=cpg-common \
            --repository=python-registry \
            --location=australia-southeast1

    prints:

        # Insert the following snippet into your .pypirc
        [distutils]
        index-servers =
            python-registry
        [python-registry]
        repository: https://australia-southeast1-python.pkg.dev/cpg-common/python-registry/

        # Insert the following snippet into your pip.conf
        [global]
        extra-index-url = https://australia-southeast1-python.pkg.dev/cpg-common/python-registry/simple/

    The value returned here is the result of `pulumi.Output.all(...).apply(...)`,
    whose resolved value follows the extra-index-url pattern above.
    """
    registry = self.infrastructure.gcp_python_registry

    def _format_index_url(parts):
        # parts arrive in the same order they are passed to Output.all below
        location, project, name = parts
        return f'https://{location}-python.pkg.dev/{project}/{name}/simple/'

    registry_parts = pulumi.Output.all(
        registry.location,
        registry.project,
        registry.name,
    )
    return registry_parts.apply(_format_index_url)

def _etl_function(
self,
f_name: str,
sa: gcp.serviceaccount.Account,
include_private_repo: bool = False,
private_repo_url: str | None = None,
):
"""
Driver function to setup the etl cloud function
Expand All @@ -468,7 +577,7 @@ def _etl_function(

# The Cloud Function source code itself needs to be zipped up into an
# archive, which we create using the pulumi.AssetArchive primitive.
if include_private_repo:
if private_repo_url:
# include private repos and metamist package
# the metamist package is only included temporarily to avoid circular dependencies
extra_assets = {
Expand All @@ -477,13 +586,8 @@ def _etl_function(
),
'requirements.txt': append_private_repositories_to_requirements(
filename=f'{str(path_to_func_folder.absolute())}/requirements.txt',
# TODO replace with metamist config, once it's available
private_repo_url=str(
self.extra_sample_metadata_config['etl_private_repo_url'],
),
private_repos=self.extra_sample_metadata_config[ # type: ignore # noqa
'etl_private_repo_packages'
],
private_repo_url=private_repo_url,
private_repos=self.config.sample_metadata.etl_private_repo_packages,
),
}
archive = archive_folder(
Expand Down Expand Up @@ -547,13 +651,9 @@ def _etl_function(
'NOTIFICATION_PUBSUB_TOPIC': self.etl_slack_notification_topic.id
if self.etl_slack_notification_topic
else '',
# TODO replace with metamist config, once it's available
'SM_ENVIRONMENT': self.extra_sample_metadata_config[
'etl_environment'
],
# TODO replace with metamist config, once it's available
'SM_ENVIRONMENT': self.config.sample_metadata.etl_environment,
'DEFAULT_LOAD_CONFIG': json.dumps(
self.extra_sample_metadata_config['etl_parser_default_config']
self.config.sample_metadata.etl_parser_default_config
),
},
ingress_settings='ALLOW_ALL',
Expand Down

0 comments on commit 9700ec8

Please sign in to comment.