diff --git a/docs/source/reference/yaml-spec.rst b/docs/source/reference/yaml-spec.rst index 0354d3d0395..c8e98bd9dfb 100644 --- a/docs/source/reference/yaml-spec.rst +++ b/docs/source/reference/yaml-spec.rst @@ -113,12 +113,14 @@ Available fields: disk_size: 256 # Disk tier to use for OS (optional). - # Could be one of 'low', 'medium', 'high' or 'best' (default: 'medium'). + # Could be one of 'low', 'medium', 'high', 'ultra' or 'best' (default: 'medium'). # if 'best' is specified, use the best disk tier enabled. # Rough performance estimate: - # low: 500 IOPS; read 20MB/s; write 40 MB/s - # medium: 3000 IOPS; read 220 MB/s; write 200 MB/s - # high: 6000 IOPS; 340 MB/s; write 250 MB/s + # low: 1000 IOPS; read 90 MB/s; write 90 MB/s + # medium: 3000 IOPS; read 220 MB/s; write 220 MB/s + # high: 6000 IOPS; read 400 MB/s; write 400 MB/s + # ultra: 60000 IOPS; read 4000 MB/s; write 3000 MB/s + # Measured by examples/perf/storage_rawperf.yaml disk_tier: medium # Ports to expose (optional). diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index 3a05223574d..693fc142eee 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -798,7 +798,11 @@ def instance_type_exists(self, instance_type): @classmethod def _get_disk_type(cls, disk_tier: resources_utils.DiskTier) -> str: - return 'standard' if disk_tier == resources_utils.DiskTier.LOW else 'gp3' + if disk_tier == resources_utils.DiskTier.LOW: + return 'standard' + if disk_tier == resources_utils.DiskTier.ULTRA: + return 'io2' + return 'gp3' @classmethod def _get_disk_specs( @@ -806,15 +810,19 @@ def _get_disk_specs( disk_tier: Optional[resources_utils.DiskTier]) -> Dict[str, Any]: tier = cls._translate_disk_tier(disk_tier) tier2iops = { + resources_utils.DiskTier.ULTRA: 20000, resources_utils.DiskTier.HIGH: 7000, resources_utils.DiskTier.MEDIUM: 3500, - resources_utils.DiskTier.LOW: 0, # only gp3 is required to set iops + resources_utils.DiskTier.LOW: 0, # iops is not required on standard disk } return { 'disk_tier': cls._get_disk_type(tier), - 'disk_iops': tier2iops[tier], - 'disk_throughput': tier2iops[tier] // 16, - 'custom_disk_perf': tier != resources_utils.DiskTier.LOW, + 'disk_iops': tier2iops[tier] + if cls._get_disk_type(tier) != 'standard' else None, + # Custom disk throughput is only available for gp3 + # see https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-ec2-launchtemplate-ebs.html + 'disk_throughput': tier2iops[tier] // 16 + if cls._get_disk_type(tier) == 'gp3' else None, } @classmethod diff --git a/sky/clouds/azure.py b/sky/clouds/azure.py index 928ceb5cc52..1768cd6091e 100644 --- a/sky/clouds/azure.py +++ b/sky/clouds/azure.py @@ -60,9 +60,10 @@ class Azure(clouds.Cloud): _MAX_CLUSTER_NAME_LEN_LIMIT = 42 _BEST_DISK_TIER = resources_utils.DiskTier.MEDIUM _DEFAULT_DISK_TIER = resources_utils.DiskTier.MEDIUM - # Azure does not support high disk tier. - _SUPPORTED_DISK_TIERS = (set(resources_utils.DiskTier) - - {resources_utils.DiskTier.HIGH}) + # Azure does not support high disk and ultra disk tier. + _SUPPORTED_DISK_TIERS = ( + set(resources_utils.DiskTier) - + {resources_utils.DiskTier.HIGH, resources_utils.DiskTier.ULTRA}) _INDENT_PREFIX = ' ' * 4 @@ -599,9 +600,10 @@ def check_disk_tier( disk_tier: Optional[resources_utils.DiskTier]) -> Tuple[bool, str]: if disk_tier is None or disk_tier == resources_utils.DiskTier.BEST: return True, '' - if disk_tier == resources_utils.DiskTier.HIGH: - return False, ('Azure disk_tier=high is not supported now. ' - 'Please use disk_tier={low, medium} instead.') + if disk_tier == resources_utils.DiskTier.HIGH or disk_tier == resources_utils.DiskTier.ULTRA: + return False, ( + 'Azure disk_tier={high, ultra} is not supported now. ' + 'Please use disk_tier={low, medium, best} instead.') # Only S-series supported premium ssd # see https://stackoverflow.com/questions/48590520/azure-requested-operation-cannot-be-performed-because-storage-account-type-pre # pylint: disable=line-too-long if cls._get_disk_type( @@ -628,6 +630,7 @@ def _get_disk_type(cls, # TODO(tian): Maybe use PremiumV2_LRS/UltraSSD_LRS? Notice these two # cannot be used as OS disks so we might need data disk support tier2name = { + resources_utils.DiskTier.ULTRA: 'Disabled', resources_utils.DiskTier.HIGH: 'Disabled', resources_utils.DiskTier.MEDIUM: 'Premium_LRS', resources_utils.DiskTier.LOW: 'Standard_LRS', diff --git a/sky/clouds/cloud.py b/sky/clouds/cloud.py index 9775109ac80..7d3eb157c61 100644 --- a/sky/clouds/cloud.py +++ b/sky/clouds/cloud.py @@ -117,7 +117,7 @@ class Cloud: _REPR = '' _DEFAULT_DISK_TIER = resources_utils.DiskTier.MEDIUM - _BEST_DISK_TIER = resources_utils.DiskTier.HIGH + _BEST_DISK_TIER = resources_utils.DiskTier.ULTRA _SUPPORTED_DISK_TIERS = {resources_utils.DiskTier.BEST} _SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE = False diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index 643d55d7037..79a1453c581 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -7,7 +7,7 @@ import subprocess import time import typing -from typing import Dict, Iterator, List, Optional, Set, Tuple +from typing import Any, Dict, Iterator, List, Optional, Set, Tuple import colorama @@ -437,6 +437,7 @@ def make_deploy_resources_variables( 'custom_resources': None, 'use_spot': r.use_spot, 'gcp_project_id': self.get_project_id(dryrun), + **GCP._get_disk_specs(r.disk_tier), } accelerators = r.accelerators if accelerators is not None: @@ -495,8 +496,6 @@ def make_deploy_resources_variables( resources_vars['machine_image'] = image_id resources_vars['image_id'] = None - resources_vars['disk_tier'] = GCP._get_disk_type(r.disk_tier) - firewall_rule = None if resources.ports is not None: firewall_rule = (USER_PORTS_FIREWALL_RULE_NAME.format( @@ -917,12 +916,24 @@ def _get_disk_type(cls, disk_tier: Optional[resources_utils.DiskTier]) -> str: tier = cls._translate_disk_tier(disk_tier) tier2name = { + resources_utils.DiskTier.ULTRA: 'pd-extreme', resources_utils.DiskTier.HIGH: 'pd-ssd', resources_utils.DiskTier.MEDIUM: 'pd-balanced', resources_utils.DiskTier.LOW: 'pd-standard', } return tier2name[tier] + @classmethod + def _get_disk_specs( + cls, + disk_tier: Optional[resources_utils.DiskTier]) -> Dict[str, Any]: + specs: Dict[str, Any] = {'disk_tier': cls._get_disk_type(disk_tier)} + if disk_tier == resources_utils.DiskTier.ULTRA: + # Only pd-extreme supports custom iops. + # see https://cloud.google.com/compute/docs/disks#disk-types + specs['disk_iops'] = 20000 + return specs + @classmethod def _label_filter_str(cls, tag_filters: Dict[str, str]) -> str: return ' '.join(f'labels.{k}={v}' for k, v in tag_filters.items()) diff --git a/sky/clouds/oci.py b/sky/clouds/oci.py index 7875e26d9cc..57f3a9ffe16 100644 --- a/sky/clouds/oci.py +++ b/sky/clouds/oci.py @@ -42,7 +42,9 @@ class OCI(clouds.Cloud): _INDENT_PREFIX = ' ' - _SUPPORTED_DISK_TIERS = set(resources_utils.DiskTier) + _SUPPORTED_DISK_TIERS = (set(resources_utils.DiskTier) - + {resources_utils.DiskTier.ULTRA}) + _BEST_DISK_TIER = resources_utils.DiskTier.HIGH @classmethod def _unsupported_features_for_resources( @@ -414,6 +416,19 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: f'{cls._INDENT_PREFIX}Error details: ' f'{common_utils.format_exception(e, use_bracket=True)}') + @classmethod + def check_disk_tier( + cls, instance_type: Optional[str], + disk_tier: Optional[resources_utils.DiskTier]) -> Tuple[bool, str]: + del instance_type # Unused. + if disk_tier is None or disk_tier == resources_utils.DiskTier.BEST: + return True, '' + if disk_tier == resources_utils.DiskTier.ULTRA: + return False, ('OCI disk_tier=ultra is not supported now. ' + 'Please use disk_tier={low, medium, high, best} ' + 'instead.') + return True, '' + def get_credential_file_mounts(self) -> Dict[str, str]: """Returns a dict of credential file paths to mount paths.""" oci_cfg_file = oci_adaptor.get_config_file() diff --git a/sky/clouds/service_catalog/azure_catalog.py b/sky/clouds/service_catalog/azure_catalog.py index 141b356712e..2d323cbac5f 100644 --- a/sky/clouds/service_catalog/azure_catalog.py +++ b/sky/clouds/service_catalog/azure_catalog.py @@ -110,7 +110,8 @@ def get_default_instance_type( _DEFAULT_INSTANCE_FAMILY)] def _filter_disk_type(instance_type: str) -> bool: - return Azure.check_disk_tier(instance_type, disk_tier)[0] + valid, _ = Azure.check_disk_tier(instance_type, disk_tier) + return valid df = df.loc[df['InstanceType'].apply(_filter_disk_type)] return common.get_instance_type_for_cpus_mem_impl(df, cpus, diff --git a/sky/clouds/service_catalog/oci_catalog.py b/sky/clouds/service_catalog/oci_catalog.py index 2561b913dcf..a18dee79be5 100644 --- a/sky/clouds/service_catalog/oci_catalog.py +++ b/sky/clouds/service_catalog/oci_catalog.py @@ -15,6 +15,7 @@ from typing import Dict, List, Optional, Tuple from sky.adaptors import oci as oci_adaptor +from sky.clouds import OCI from sky.clouds.service_catalog import common from sky.clouds.utils import oci_utils from sky.utils import resources_utils @@ -102,7 +103,6 @@ def get_default_instance_type( cpus: Optional[str] = None, memory: Optional[str] = None, disk_tier: Optional[resources_utils.DiskTier] = None) -> Optional[str]: - del disk_tier # unused if cpus is None: cpus = f'{oci_utils.oci_config.DEFAULT_NUM_VCPUS}+' @@ -111,12 +111,17 @@ def get_default_instance_type( else: memory_gb_or_ratio = memory + def _filter_disk_type(instance_type: str) -> bool: + valid, _ = OCI.check_disk_tier(instance_type, disk_tier) + return valid + instance_type_prefix = tuple( f'{family}' for family in oci_utils.oci_config.DEFAULT_INSTANCE_FAMILY) df = _get_df() df = df[df['InstanceType'].notna()] df = df[df['InstanceType'].str.startswith(instance_type_prefix)] + df = df.loc[df['InstanceType'].apply(_filter_disk_type)] logger.debug(f'# get_default_instance_type: {df}') return common.get_instance_type_for_cpus_mem_impl(df, cpus, diff --git a/sky/optimizer.py b/sky/optimizer.py index 10aa697258b..4326329579d 100644 --- a/sky/optimizer.py +++ b/sky/optimizer.py @@ -19,6 +19,7 @@ from sky.adaptors import common as adaptors_common from sky.utils import env_options from sky.utils import log_utils +from sky.utils import resources_utils from sky.utils import rich_utils from sky.utils import subprocess_utils from sky.utils import ux_utils @@ -935,6 +936,15 @@ def sort_key(row, accelerator_spot_list=accelerator_spot_list): table.add_rows(rows) logger.info(f'{table}\n') + # Warning message for using disk_tier=ultra + # TODO(yi): Consider price of disks in optimizer and + # move this warning there. + if chosen_resources.disk_tier == resources_utils.DiskTier.ULTRA: + logger.warning( + 'Using disk_tier=ultra will utilize more advanced disks ' + '(io2 Block Express on AWS and extreme persistent disk on ' + 'GCP), which can lead to significant higher costs (~$2/h).') + @staticmethod def _print_candidates(node_to_candidate_map: _TaskToPerCloudCandidates): for node, candidate_set in node_to_candidate_map.items(): diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index 7e9dfccdaf1..6afdf381cc0 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -73,8 +73,10 @@ available_node_types: VolumeSize: {{disk_size}} VolumeType: {{disk_tier}} Encrypted: {{disk_encrypted}} - {% if custom_disk_perf %} + {% if disk_iops %} Iops: {{disk_iops}} + {% endif %} + {% if disk_throughput %} Throughput: {{disk_throughput}} {% endif %} {% if use_spot %} diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index bcc16bac531..5f06eef05c7 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -124,6 +124,9 @@ available_node_types: sourceImage: {{image_id}} {%- endif %} diskType: zones/{{zones}}/diskTypes/{{disk_tier}} + {%- if disk_iops %} + provisionedIops: {{disk_iops}} + {%- endif %} {%- if gpu is not none %} guestAccelerators: - acceleratorType: projects/{{gcp_project_id}}/zones/{{zones}}/acceleratorTypes/{{gpu}} diff --git a/sky/utils/resources_utils.py b/sky/utils/resources_utils.py index 95c784143cc..6f5c07f7d25 100644 --- a/sky/utils/resources_utils.py +++ b/sky/utils/resources_utils.py @@ -24,6 +24,7 @@ class DiskTier(enum.Enum): LOW = 'low' MEDIUM = 'medium' HIGH = 'high' + ULTRA = 'ultra' BEST = 'best' @classmethod diff --git a/tests/test_optimizer_dryruns.py b/tests/test_optimizer_dryruns.py index becf3ba461a..dfda65e23da 100644 --- a/tests/test_optimizer_dryruns.py +++ b/tests/test_optimizer_dryruns.py @@ -771,3 +771,10 @@ def _get_all_candidate_cloud(r: sky.Resources) -> Set[clouds.Cloud]: assert high_tier_candidates == set( map(clouds.CLOUD_REGISTRY.get, ['aws', 'gcp', 'oci'])), high_tier_candidates + + # Only AWS, GCP supports ULTRA disk tier. + ultra_tier_resources = sky.Resources( + disk_tier=resources_utils.DiskTier.ULTRA) + ultra_tier_candidates = _get_all_candidate_cloud(ultra_tier_resources) + assert ultra_tier_candidates == set( + map(clouds.CLOUD_REGISTRY.get, ['aws', 'gcp'])), ultra_tier_candidates diff --git a/tests/test_smoke.py b/tests/test_smoke.py index f338de2dda7..7b5ce389b8c 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3304,11 +3304,11 @@ def _get_aws_query_command(region, instance_id, field, expected): f'Reservations[].Instances[].InstanceId --output text`; ' + _get_aws_query_command(region, '$id', 'VolumeType', specs['disk_tier']) + - ('' if disk_tier == resources_utils.DiskTier.LOW else - (_get_aws_query_command(region, '$id', 'Iops', - specs['disk_iops']) + - _get_aws_query_command(region, '$id', 'Throughput', - specs['disk_throughput']))), + ('' if specs['disk_tier'] + == 'standard' else _get_aws_query_command( + region, '$id', 'Iops', specs['disk_iops'])) + + ('' if specs['disk_tier'] != 'gp3' else _get_aws_query_command( + region, '$id', 'Throughput', specs['disk_throughput'])), ], f'sky down -y {name}', timeout=10 * 60, # 10 mins (it takes around ~6 mins) @@ -3344,8 +3344,8 @@ def test_gcp_disk_tier(): @pytest.mark.azure def test_azure_disk_tier(): for disk_tier in list(resources_utils.DiskTier): - if disk_tier == resources_utils.DiskTier.HIGH: - # Azure does not support high disk tier. + if disk_tier == resources_utils.DiskTier.HIGH or disk_tier == resources_utils.DiskTier.ULTRA: + # Azure does not support high and ultra disk tier. continue type = Azure._get_disk_type(disk_tier) name = _get_cluster_name() + '-' + disk_tier.value diff --git a/tests/unit_tests/test_resources.py b/tests/unit_tests/test_resources.py index 6fb9f1bcd14..70da0532e9b 100644 --- a/tests/unit_tests/test_resources.py +++ b/tests/unit_tests/test_resources.py @@ -125,7 +125,6 @@ def test_aws_make_deploy_variables(*mocks) -> None: 'disk_tier': 'gp3', 'disk_throughput': 218, 'disk_iops': 3500, - 'custom_disk_perf': True, 'docker_image': None, 'docker_container_name': 'sky_container', 'docker_login_config': None,