Skip to content

Commit

Permalink
[Core] Disk tier ultra for AWS and GCP (#3860)
Browse files Browse the repository at this point in the history
* Finish implementation of ultra disk tier.

* Add the new settings into yaml-spec with benchmark results.

* Fix problems with OCI disk_tier=ultra and update benchmark.

* Add unittest for disk tier ultra.

* Add some helpful details and comments.

* Fix some issues about code style.

* Fix some styles and warning messages.

* Remove redundant warning messages with disk_tier=ultra.

* Passed related smoke tests.

* Fix bugs with format and unit tests.
  • Loading branch information
Conless authored Sep 1, 2024
1 parent 50f68d2 commit 2e204d0
Show file tree
Hide file tree
Showing 15 changed files with 98 additions and 31 deletions.
10 changes: 6 additions & 4 deletions docs/source/reference/yaml-spec.rst
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,14 @@ Available fields:
disk_size: 256
# Disk tier to use for OS (optional).
# Could be one of 'low', 'medium', 'high' or 'best' (default: 'medium').
# Could be one of 'low', 'medium', 'high', 'ultra' or 'best' (default: 'medium').
# If 'best' is specified, use the best disk tier enabled.
# Rough performance estimate:
# low: 500 IOPS; read 20MB/s; write 40 MB/s
# medium: 3000 IOPS; read 220 MB/s; write 200 MB/s
# high: 6000 IOPS; 340 MB/s; write 250 MB/s
# low: 1000 IOPS; read 90 MB/s; write 90 MB/s
# medium: 3000 IOPS; read 220 MB/s; write 220 MB/s
# high: 6000 IOPS; read 400 MB/s; write 400 MB/s
# ultra: 60000 IOPS; read 4000 MB/s; write 3000 MB/s
# Measured by examples/perf/storage_rawperf.yaml
disk_tier: medium
# Ports to expose (optional).
Expand Down
18 changes: 13 additions & 5 deletions sky/clouds/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -798,23 +798,31 @@ def instance_type_exists(self, instance_type):

@classmethod
def _get_disk_type(cls, disk_tier: resources_utils.DiskTier) -> str:
return 'standard' if disk_tier == resources_utils.DiskTier.LOW else 'gp3'
if disk_tier == resources_utils.DiskTier.LOW:
return 'standard'
if disk_tier == resources_utils.DiskTier.ULTRA:
return 'io2'
return 'gp3'

@classmethod
def _get_disk_specs(
cls,
disk_tier: Optional[resources_utils.DiskTier]) -> Dict[str, Any]:
tier = cls._translate_disk_tier(disk_tier)
tier2iops = {
resources_utils.DiskTier.ULTRA: 20000,
resources_utils.DiskTier.HIGH: 7000,
resources_utils.DiskTier.MEDIUM: 3500,
resources_utils.DiskTier.LOW: 0, # only gp3 is required to set iops
resources_utils.DiskTier.LOW: 0, # iops is not required on standard disk
}
return {
'disk_tier': cls._get_disk_type(tier),
'disk_iops': tier2iops[tier],
'disk_throughput': tier2iops[tier] // 16,
'custom_disk_perf': tier != resources_utils.DiskTier.LOW,
'disk_iops': tier2iops[tier]
if cls._get_disk_type(tier) != 'standard' else None,
# Custom disk throughput is only available for gp3
# see https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-ec2-launchtemplate-ebs.html
'disk_throughput': tier2iops[tier] // 16
if cls._get_disk_type(tier) == 'gp3' else None,
}

@classmethod
Expand Down
15 changes: 9 additions & 6 deletions sky/clouds/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,10 @@ class Azure(clouds.Cloud):
_MAX_CLUSTER_NAME_LEN_LIMIT = 42
_BEST_DISK_TIER = resources_utils.DiskTier.MEDIUM
_DEFAULT_DISK_TIER = resources_utils.DiskTier.MEDIUM
# Azure does not support high disk tier.
_SUPPORTED_DISK_TIERS = (set(resources_utils.DiskTier) -
{resources_utils.DiskTier.HIGH})
# Azure does not support the high and ultra disk tiers.
_SUPPORTED_DISK_TIERS = (
set(resources_utils.DiskTier) -
{resources_utils.DiskTier.HIGH, resources_utils.DiskTier.ULTRA})

_INDENT_PREFIX = ' ' * 4

Expand Down Expand Up @@ -599,9 +600,10 @@ def check_disk_tier(
disk_tier: Optional[resources_utils.DiskTier]) -> Tuple[bool, str]:
if disk_tier is None or disk_tier == resources_utils.DiskTier.BEST:
return True, ''
if disk_tier == resources_utils.DiskTier.HIGH:
return False, ('Azure disk_tier=high is not supported now. '
'Please use disk_tier={low, medium} instead.')
if disk_tier == resources_utils.DiskTier.HIGH or disk_tier == resources_utils.DiskTier.ULTRA:
return False, (
'Azure disk_tier={high, ultra} is not supported now. '
'Please use disk_tier={low, medium, best} instead.')
# Only S-series supports premium SSD.
# see https://stackoverflow.com/questions/48590520/azure-requested-operation-cannot-be-performed-because-storage-account-type-pre # pylint: disable=line-too-long
if cls._get_disk_type(
Expand All @@ -628,6 +630,7 @@ def _get_disk_type(cls,
# TODO(tian): Maybe use PremiumV2_LRS/UltraSSD_LRS? Note that these two
# cannot be used as OS disks, so we might need data disk support.
tier2name = {
resources_utils.DiskTier.ULTRA: 'Disabled',
resources_utils.DiskTier.HIGH: 'Disabled',
resources_utils.DiskTier.MEDIUM: 'Premium_LRS',
resources_utils.DiskTier.LOW: 'Standard_LRS',
Expand Down
2 changes: 1 addition & 1 deletion sky/clouds/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ class Cloud:

_REPR = '<Cloud>'
_DEFAULT_DISK_TIER = resources_utils.DiskTier.MEDIUM
_BEST_DISK_TIER = resources_utils.DiskTier.HIGH
_BEST_DISK_TIER = resources_utils.DiskTier.ULTRA
_SUPPORTED_DISK_TIERS = {resources_utils.DiskTier.BEST}
_SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE = False

Expand Down
17 changes: 14 additions & 3 deletions sky/clouds/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import subprocess
import time
import typing
from typing import Dict, Iterator, List, Optional, Set, Tuple
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple

import colorama

Expand Down Expand Up @@ -437,6 +437,7 @@ def make_deploy_resources_variables(
'custom_resources': None,
'use_spot': r.use_spot,
'gcp_project_id': self.get_project_id(dryrun),
**GCP._get_disk_specs(r.disk_tier),
}
accelerators = r.accelerators
if accelerators is not None:
Expand Down Expand Up @@ -495,8 +496,6 @@ def make_deploy_resources_variables(
resources_vars['machine_image'] = image_id
resources_vars['image_id'] = None

resources_vars['disk_tier'] = GCP._get_disk_type(r.disk_tier)

firewall_rule = None
if resources.ports is not None:
firewall_rule = (USER_PORTS_FIREWALL_RULE_NAME.format(
Expand Down Expand Up @@ -917,12 +916,24 @@ def _get_disk_type(cls,
disk_tier: Optional[resources_utils.DiskTier]) -> str:
tier = cls._translate_disk_tier(disk_tier)
tier2name = {
resources_utils.DiskTier.ULTRA: 'pd-extreme',
resources_utils.DiskTier.HIGH: 'pd-ssd',
resources_utils.DiskTier.MEDIUM: 'pd-balanced',
resources_utils.DiskTier.LOW: 'pd-standard',
}
return tier2name[tier]

@classmethod
def _get_disk_specs(
        cls,
        disk_tier: Optional[resources_utils.DiskTier]) -> Dict[str, Any]:
    """Build the disk-related variables for a requested disk tier.

    Args:
        disk_tier: The requested disk tier, or None.

    Returns:
        A dict that always contains 'disk_tier' (the value returned by
        `_get_disk_type`) and, only when `disk_tier` is exactly
        `DiskTier.ULTRA`, also 'disk_iops'.
    """
    disk_specs: Dict[str, Any] = {}
    disk_specs['disk_tier'] = cls._get_disk_type(disk_tier)
    # NOTE(review): the check below uses the raw `disk_tier`, not the
    # translated tier used by `_get_disk_type`. If another tier (e.g. BEST)
    # resolves to pd-extreme, no 'disk_iops' is emitted -- confirm that this
    # is intended.
    if disk_tier != resources_utils.DiskTier.ULTRA:
        return disk_specs
    # Only pd-extreme supports custom iops.
    # see https://cloud.google.com/compute/docs/disks#disk-types
    disk_specs['disk_iops'] = 20000
    return disk_specs

@classmethod
def _label_filter_str(cls, tag_filters: Dict[str, str]) -> str:
return ' '.join(f'labels.{k}={v}' for k, v in tag_filters.items())
Expand Down
17 changes: 16 additions & 1 deletion sky/clouds/oci.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ class OCI(clouds.Cloud):

_INDENT_PREFIX = ' '

_SUPPORTED_DISK_TIERS = set(resources_utils.DiskTier)
_SUPPORTED_DISK_TIERS = (set(resources_utils.DiskTier) -
{resources_utils.DiskTier.ULTRA})
_BEST_DISK_TIER = resources_utils.DiskTier.HIGH

@classmethod
def _unsupported_features_for_resources(
Expand Down Expand Up @@ -414,6 +416,19 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]:
f'{cls._INDENT_PREFIX}Error details: '
f'{common_utils.format_exception(e, use_bracket=True)}')

@classmethod
def check_disk_tier(
        cls, instance_type: Optional[str],
        disk_tier: Optional[resources_utils.DiskTier]) -> Tuple[bool, str]:
    """Check whether the requested disk tier can be used on OCI.

    Args:
        instance_type: Ignored; the result does not depend on it.
        disk_tier: The requested disk tier, or None.

    Returns:
        A `(supported, reason)` tuple. `supported` is False only for
        `DiskTier.ULTRA`, in which case `reason` explains which tiers to
        use instead; otherwise `reason` is the empty string.
    """
    del instance_type  # Unused.
    # ULTRA is the only rejected tier; None, BEST and every other tier pass.
    ultra_requested = disk_tier == resources_utils.DiskTier.ULTRA
    if not ultra_requested:
        return True, ''
    reason = ('OCI disk_tier=ultra is not supported now. '
              'Please use disk_tier={low, medium, high, best} '
              'instead.')
    return False, reason

def get_credential_file_mounts(self) -> Dict[str, str]:
"""Returns a dict of credential file paths to mount paths."""
oci_cfg_file = oci_adaptor.get_config_file()
Expand Down
3 changes: 2 additions & 1 deletion sky/clouds/service_catalog/azure_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,8 @@ def get_default_instance_type(
_DEFAULT_INSTANCE_FAMILY)]

def _filter_disk_type(instance_type: str) -> bool:
return Azure.check_disk_tier(instance_type, disk_tier)[0]
valid, _ = Azure.check_disk_tier(instance_type, disk_tier)
return valid

df = df.loc[df['InstanceType'].apply(_filter_disk_type)]
return common.get_instance_type_for_cpus_mem_impl(df, cpus,
Expand Down
7 changes: 6 additions & 1 deletion sky/clouds/service_catalog/oci_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from typing import Dict, List, Optional, Tuple

from sky.adaptors import oci as oci_adaptor
from sky.clouds import OCI
from sky.clouds.service_catalog import common
from sky.clouds.utils import oci_utils
from sky.utils import resources_utils
Expand Down Expand Up @@ -102,7 +103,6 @@ def get_default_instance_type(
cpus: Optional[str] = None,
memory: Optional[str] = None,
disk_tier: Optional[resources_utils.DiskTier] = None) -> Optional[str]:
del disk_tier # unused
if cpus is None:
cpus = f'{oci_utils.oci_config.DEFAULT_NUM_VCPUS}+'

Expand All @@ -111,12 +111,17 @@ def get_default_instance_type(
else:
memory_gb_or_ratio = memory

def _filter_disk_type(instance_type: str) -> bool:
valid, _ = OCI.check_disk_tier(instance_type, disk_tier)
return valid

instance_type_prefix = tuple(
f'{family}' for family in oci_utils.oci_config.DEFAULT_INSTANCE_FAMILY)

df = _get_df()
df = df[df['InstanceType'].notna()]
df = df[df['InstanceType'].str.startswith(instance_type_prefix)]
df = df.loc[df['InstanceType'].apply(_filter_disk_type)]

logger.debug(f'# get_default_instance_type: {df}')
return common.get_instance_type_for_cpus_mem_impl(df, cpus,
Expand Down
10 changes: 10 additions & 0 deletions sky/optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from sky.adaptors import common as adaptors_common
from sky.utils import env_options
from sky.utils import log_utils
from sky.utils import resources_utils
from sky.utils import rich_utils
from sky.utils import subprocess_utils
from sky.utils import ux_utils
Expand Down Expand Up @@ -935,6 +936,15 @@ def sort_key(row, accelerator_spot_list=accelerator_spot_list):
table.add_rows(rows)
logger.info(f'{table}\n')

# Warning message for using disk_tier=ultra
# TODO(yi): Consider price of disks in optimizer and
# move this warning there.
if chosen_resources.disk_tier == resources_utils.DiskTier.ULTRA:
logger.warning(
'Using disk_tier=ultra will utilize more advanced disks '
'(io2 Block Express on AWS and extreme persistent disk on '
'GCP), which can lead to significant higher costs (~$2/h).')

@staticmethod
def _print_candidates(node_to_candidate_map: _TaskToPerCloudCandidates):
for node, candidate_set in node_to_candidate_map.items():
Expand Down
4 changes: 3 additions & 1 deletion sky/templates/aws-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,10 @@ available_node_types:
VolumeSize: {{disk_size}}
VolumeType: {{disk_tier}}
Encrypted: {{disk_encrypted}}
{% if custom_disk_perf %}
{% if disk_iops %}
Iops: {{disk_iops}}
{% endif %}
{% if disk_throughput %}
Throughput: {{disk_throughput}}
{% endif %}
{% if use_spot %}
Expand Down
3 changes: 3 additions & 0 deletions sky/templates/gcp-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,9 @@ available_node_types:
sourceImage: {{image_id}}
{%- endif %}
diskType: zones/{{zones}}/diskTypes/{{disk_tier}}
{%- if disk_iops %}
provisionedIops: {{disk_iops}}
{%- endif %}
{%- if gpu is not none %}
guestAccelerators:
- acceleratorType: projects/{{gcp_project_id}}/zones/{{zones}}/acceleratorTypes/{{gpu}}
Expand Down
1 change: 1 addition & 0 deletions sky/utils/resources_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class DiskTier(enum.Enum):
LOW = 'low'
MEDIUM = 'medium'
HIGH = 'high'
ULTRA = 'ultra'
BEST = 'best'

@classmethod
Expand Down
7 changes: 7 additions & 0 deletions tests/test_optimizer_dryruns.py
Original file line number Diff line number Diff line change
Expand Up @@ -771,3 +771,10 @@ def _get_all_candidate_cloud(r: sky.Resources) -> Set[clouds.Cloud]:
assert high_tier_candidates == set(
map(clouds.CLOUD_REGISTRY.get,
['aws', 'gcp', 'oci'])), high_tier_candidates

# Only AWS and GCP support the ULTRA disk tier.
ultra_tier_resources = sky.Resources(
disk_tier=resources_utils.DiskTier.ULTRA)
ultra_tier_candidates = _get_all_candidate_cloud(ultra_tier_resources)
assert ultra_tier_candidates == set(
map(clouds.CLOUD_REGISTRY.get, ['aws', 'gcp'])), ultra_tier_candidates
14 changes: 7 additions & 7 deletions tests/test_smoke.py
Original file line number Diff line number Diff line change
Expand Up @@ -3304,11 +3304,11 @@ def _get_aws_query_command(region, instance_id, field, expected):
f'Reservations[].Instances[].InstanceId --output text`; ' +
_get_aws_query_command(region, '$id', 'VolumeType',
specs['disk_tier']) +
('' if disk_tier == resources_utils.DiskTier.LOW else
(_get_aws_query_command(region, '$id', 'Iops',
specs['disk_iops']) +
_get_aws_query_command(region, '$id', 'Throughput',
specs['disk_throughput']))),
('' if specs['disk_tier']
== 'standard' else _get_aws_query_command(
region, '$id', 'Iops', specs['disk_iops'])) +
('' if specs['disk_tier'] != 'gp3' else _get_aws_query_command(
region, '$id', 'Throughput', specs['disk_throughput'])),
],
f'sky down -y {name}',
timeout=10 * 60, # 10 mins (it takes around ~6 mins)
Expand Down Expand Up @@ -3344,8 +3344,8 @@ def test_gcp_disk_tier():
@pytest.mark.azure
def test_azure_disk_tier():
for disk_tier in list(resources_utils.DiskTier):
if disk_tier == resources_utils.DiskTier.HIGH:
# Azure does not support high disk tier.
if disk_tier == resources_utils.DiskTier.HIGH or disk_tier == resources_utils.DiskTier.ULTRA:
# Azure does not support the high and ultra disk tiers.
continue
type = Azure._get_disk_type(disk_tier)
name = _get_cluster_name() + '-' + disk_tier.value
Expand Down
1 change: 0 additions & 1 deletion tests/unit_tests/test_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,6 @@ def test_aws_make_deploy_variables(*mocks) -> None:
'disk_tier': 'gp3',
'disk_throughput': 218,
'disk_iops': 3500,
'custom_disk_perf': True,
'docker_image': None,
'docker_container_name': 'sky_container',
'docker_login_config': None,
Expand Down

0 comments on commit 2e204d0

Please sign in to comment.