From e8af32863072aae72fe4c4c7430fda77bbace646 Mon Sep 17 00:00:00 2001 From: HysunHe Date: Sun, 12 Jan 2025 23:14:20 +0800 Subject: [PATCH 1/6] [Storage]: Support buckets from different region --- docs/source/reference/storage.rst | 2 +- examples/oci/dataset-mount.yaml | 2 +- sky/data/storage.py | 22 ++++++++++++++++------ 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/docs/source/reference/storage.rst b/docs/source/reference/storage.rst index 16f87c1ce2f..467b23e0d53 100644 --- a/docs/source/reference/storage.rst +++ b/docs/source/reference/storage.rst @@ -290,7 +290,7 @@ Storage YAML reference - https://.blob.core.windows.net/ - r2:// - cos:/// - - oci:// + - oci://@ If the source is local, data is uploaded to the cloud to an appropriate bucket (s3, gcs, azure, r2, oci, or ibm). If source is bucket URI, diff --git a/examples/oci/dataset-mount.yaml b/examples/oci/dataset-mount.yaml index 91bec9cda65..1f62360a5a3 100644 --- a/examples/oci/dataset-mount.yaml +++ b/examples/oci/dataset-mount.yaml @@ -11,7 +11,7 @@ resources: file_mounts: # Mount an existing oci bucket /datasets-storage: - source: oci://skybucket + source: oci://skybucket@us-sanjose-1 mode: MOUNT # Either MOUNT or COPY. Optional. # Working directory (optional) containing the project codebase. diff --git a/sky/data/storage.py b/sky/data/storage.py index 018cb2797ca..d65b46f1a07 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -3980,8 +3980,15 @@ def __init__(self, self.compartment: str self.namespace: str - # Bucket region should be consistence with the OCI config file - region = oci.get_oci_config()['region'] + # Region is from the specified name in @ format + if '@' in name: + name, region = name.split('@', maxsplit=1) + # Region is from the specified source in oci://@ format + if '@' in source: + source, region = source.split('@', maxsplit=1) + # Default region set to what specified in oci config. + if region is None: + region = oci.get_oci_config()['region'] super().__init__(name, source, region, is_sky_managed, sync_on_reconstruction, _bucket_sub_path) @@ -4137,7 +4144,8 @@ def get_file_sync_command(base_dir_path, file_names): sync_command = ( 'oci os object bulk-upload --no-follow-symlinks --overwrite ' f'--bucket-name {self.name} --namespace-name {self.namespace} ' - f'--src-dir "{base_dir_path}" {includes}') + f'--region {self.region} --src-dir "{base_dir_path}" ' + f'{includes}') return sync_command @@ -4157,8 +4165,8 @@ def get_dir_sync_command(src_dir_path, dest_dir_name): sync_command = ( 'oci os object bulk-upload --no-follow-symlinks --overwrite ' f'--bucket-name {self.name} --namespace-name {self.namespace} ' - f'--object-prefix "{dest_dir_name}" --src-dir "{src_dir_path}" ' - f'{excludes} ') + f'--region {self.region} --object-prefix "{dest_dir_name}" ' + f'--src-dir "{src_dir_path}" {excludes}') return sync_command @@ -4289,7 +4297,8 @@ def _download_file(self, remote_path: str, local_path: str) -> None: def get_file_download_command(remote_path, local_path): download_command = (f'oci os object get --bucket-name {self.name} ' f'--namespace-name {self.namespace} ' - f'--name {remote_path} --file {local_path}') + f'--region {self.region} --name {remote_path} ' + f'--file {local_path}') return download_command @@ -4346,6 +4355,7 @@ def _delete_oci_bucket(self, bucket_name: str) -> bool: @oci.with_oci_env def get_bucket_delete_command(bucket_name): remove_command = (f'oci os bucket delete --bucket-name ' + f'--region {self.region} ' f'{bucket_name} --empty --force') return remove_command From 960e29bed3640f824b6ec935b5255d011e5b50d6 Mon Sep 17 00:00:00 2001 From: HysunHe Date: Mon, 13 Jan 2025 12:30:04 +0800 Subject: [PATCH 2/6] address review comment: add validation --- sky/data/storage.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/sky/data/storage.py b/sky/data/storage.py index d65b46f1a07..904ddb062f6 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -3981,11 +3981,13 @@ def __init__(self, self.namespace: str # Region is from the specified name in @ format - if '@' in name: - name, region = name.split('@', maxsplit=1) + if name is not None and '@' in name: + self._validate_bucket_expr(name) + name, region = name.split('@') # Region is from the specified source in oci://@ format - if '@' in source: - source, region = source.split('@', maxsplit=1) + if source is not None and '@' in source: + self._validate_bucket_expr(source) + source, region = source.split('@') # Default region set to what specified in oci config. if region is None: region = oci.get_oci_config()['region'] @@ -3994,6 +3996,12 @@ def __init__(self, sync_on_reconstruction, _bucket_sub_path) # TODO(zpoint): add _bucket_sub_path to the sync/mount/delete commands + def _validate_bucket_expr(self, bucket_expr: str): + pattern = r'^(\w+://)?[A-Za-z0-9-._]+(@\w{2}-\w+-\d{1})$' + assert re.match(pattern, bucket_expr), ( + 'The format for the bucket portion is @ ' + 'when specify a region with a bucket.') + def _validate(self): if self.source is not None and isinstance(self.source, str): if self.source.startswith('oci://'): From d4c924d3723d478c765045a64932ecd332d8d1b9 Mon Sep 17 00:00:00 2001 From: Hysun He Date: Tue, 14 Jan 2025 08:57:43 +0800 Subject: [PATCH 3/6] Update sky/data/storage.py Co-authored-by: Tian Xia --- sky/data/storage.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sky/data/storage.py b/sky/data/storage.py index 904ddb062f6..a010a7c80f5 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -3998,7 +3998,8 @@ def __init__(self, def _validate_bucket_expr(self, bucket_expr: str): pattern = r'^(\w+://)?[A-Za-z0-9-._]+(@\w{2}-\w+-\d{1})$' - assert re.match(pattern, bucket_expr), ( + if not re.match(pattern, bucket_expr): + raise ValueError( 'The format for the bucket portion is @ ' 'when specify a region with a bucket.') From 7fe69979f06d6b0fa9f83345cc3f204032466078 Mon Sep 17 00:00:00 2001 From: HysunHe Date: Tue, 14 Jan 2025 09:40:18 +0800 Subject: [PATCH 4/6] Address review comments --- sky/data/storage.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/sky/data/storage.py b/sky/data/storage.py index a010a7c80f5..1f484fccbe4 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -3968,7 +3968,7 @@ class OciStore(AbstractStore): def __init__(self, name: str, - source: str, + source: Optional[SourceType], region: Optional[str] = None, is_sky_managed: Optional[bool] = None, sync_on_reconstruction: Optional[bool] = True, @@ -3980,18 +3980,28 @@ def __init__(self, self.compartment: str self.namespace: str - # Region is from the specified name in @ format + # Region is from the specified name in @ format. + # Another case is name can also be set by the source, for example: + # /datasets-storage: + # source: oci://RAGData@us-sanjose-1 + # The name in above mount will be set to RAGData@us-sanjose-1 if name is not None and '@' in name: self._validate_bucket_expr(name) name, region = name.split('@') + # Region is from the specified source in oci://@ format - if source is not None and '@' in source: + if isinstance(source, + str) and source.startswith('oci://') and '@' in source: self._validate_bucket_expr(source) source, region = source.split('@') + # Default region set to what specified in oci config. if region is None: region = oci.get_oci_config()['region'] + # So far from now on, the name and source are canonical, means there + # is no region (@ suffix) associated with them anymore. + super().__init__(name, source, region, is_sky_managed, sync_on_reconstruction, _bucket_sub_path) # TODO(zpoint): add _bucket_sub_path to the sync/mount/delete commands @@ -4000,8 +4010,8 @@ def _validate_bucket_expr(self, bucket_expr: str): pattern = r'^(\w+://)?[A-Za-z0-9-._]+(@\w{2}-\w+-\d{1})$' if not re.match(pattern, bucket_expr): raise ValueError( - 'The format for the bucket portion is @ ' - 'when specify a region with a bucket.') + 'The format for the bucket portion is @ ' + 'when specify a region with a bucket.') def _validate(self): if self.source is not None and isinstance(self.source, str): From 5bf8b76e2a8485388a6a9b8de3e1c376b632346c Mon Sep 17 00:00:00 2001 From: HysunHe Date: Wed, 15 Jan 2025 11:09:13 +0800 Subject: [PATCH 5/6] review comments --- sky/data/storage.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sky/data/storage.py b/sky/data/storage.py index 1f484fccbe4..861ad574eb9 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -3985,15 +3985,22 @@ def __init__(self, # /datasets-storage: # source: oci://RAGData@us-sanjose-1 # The name in above mount will be set to RAGData@us-sanjose-1 + region_in_name = None if name is not None and '@' in name: self._validate_bucket_expr(name) - name, region = name.split('@') + name, region_in_name = name.split('@') # Region is from the specified source in oci://@ format + region_in_source = None if isinstance(source, str) and source.startswith('oci://') and '@' in source: self._validate_bucket_expr(source) - source, region = source.split('@') + source, region_in_source = source.split('@') + + if region_in_name is not None: + region = region_in_name + elif region_in_source is not None: + region = region_in_source # Default region set to what specified in oci config. if region is None: From b02d13686c2493edee1e58161a48acbff3a9ee86 Mon Sep 17 00:00:00 2001 From: HysunHe Date: Wed, 15 Jan 2025 13:17:59 +0800 Subject: [PATCH 6/6] c --- sky/data/storage.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sky/data/storage.py b/sky/data/storage.py index 861ad574eb9..15eb0d8354d 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -3997,6 +3997,13 @@ def __init__(self, self._validate_bucket_expr(source) source, region_in_source = source.split('@') + if region_in_name is not None and region_in_source is not None: + # This should never happen because name and source will never be + # the remote bucket at the same time. + assert region_in_name == region_in_source, ( + f'Mismatch region specified. Region in name {region_in_name}, ' + f'but region in source is {region_in_source}') + if region_in_name is not None: region = region_in_name elif region_in_source is not None: