Skip to content

Commit

Permalink
Merge pull request #8608 from hssyoo/validate-mv-cmd
Browse files Browse the repository at this point in the history
Add S3 bucket validation to s3 mv
  • Loading branch information
hssyoo authored Mar 29, 2024
2 parents 66aff88 + d4d92d5 commit 4e8924a
Show file tree
Hide file tree
Showing 7 changed files with 696 additions and 14 deletions.
5 changes: 5 additions & 0 deletions .changes/next-release/enhancement-s3-67553.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"type": "enhancement",
"category": "``s3``",
"description": "Add parameter to validate source and destination S3 URIs to the ``mv`` command."
}
110 changes: 100 additions & 10 deletions awscli/customizations/s3/subcommands.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import sys

from botocore.client import Config
from botocore.utils import is_s3express_bucket
from botocore.utils import is_s3express_bucket, ensure_boolean
from dateutil.parser import parse
from dateutil.tz import tzlocal

Expand All @@ -31,7 +31,8 @@
from awscli.customizations.s3.s3handler import S3TransferHandlerFactory
from awscli.customizations.s3.utils import find_bucket_key, AppendFilter, \
find_dest_path_comp_key, human_readable_size, \
RequestParamsMapper, split_s3_bucket_key, block_unsupported_resources
RequestParamsMapper, split_s3_bucket_key, block_unsupported_resources, \
S3PathResolver
from awscli.customizations.utils import uni_print
from awscli.customizations.s3.syncstrategy.base import MissingFileSync, \
SizeAndLastModifiedSync, NeverSync
Expand Down Expand Up @@ -430,6 +431,27 @@
)
}

# Opt-in flag for ``s3 mv``: resolve access-point style URIs to their
# underlying buckets so a move cannot silently delete its own source.
# Fix: help text read "we strongly recommended using" (wrong tense).
VALIDATE_SAME_S3_PATHS = {
    'name': 'validate-same-s3-paths', 'action': 'store_true',
    'help_text': (
        'Resolves the source and destination S3 URIs to their '
        'underlying buckets and verifies that the file or object '
        'is not being moved onto itself. If you are using any type '
        'of access point ARNs or access point aliases in your S3 URIs, '
        'we strongly recommend using this parameter to help prevent '
        'accidental deletions of the source file or object. This '
        'parameter resolves the underlying buckets of S3 access point '
        'ARNs and aliases, S3 on Outposts access point ARNs, and '
        'Multi-Region Access Point ARNs. S3 on Outposts access point '
        'aliases are not supported. Instead of using this parameter, '
        'you can set the environment variable '
        '``AWS_CLI_S3_MV_VALIDATE_SAME_S3_PATHS`` to ``true``. '
        'NOTE: Path validation requires making additional API calls. '
        'Future updates to this path-validation mechanism might change '
        'which API calls are made.'
    )
}

TRANSFER_ARGS = [DRYRUN, QUIET, INCLUDE, EXCLUDE, ACL,
FOLLOW_SYMLINKS, NO_FOLLOW_SYMLINKS, NO_GUESS_MIME_TYPE,
SSE, SSE_C, SSE_C_KEY, SSE_KMS_KEY_ID, SSE_C_COPY_SOURCE,
Expand Down Expand Up @@ -685,7 +707,9 @@ def _run_main(self, parsed_args, parsed_globals):
self._convert_path_args(parsed_args)
params = self._build_call_parameters(parsed_args, {})
cmd_params = CommandParameters(self.NAME, params,
self.USAGE)
self.USAGE,
self._session,
parsed_globals)
cmd_params.add_region(parsed_globals)
cmd_params.add_endpoint_url(parsed_globals)
cmd_params.add_verify_ssl(parsed_globals)
Expand Down Expand Up @@ -735,13 +759,13 @@ class CpCommand(S3TransferCommand):

class MvCommand(S3TransferCommand):
    # ``aws s3 mv`` — the long description lives in _description.rst so the
    # access-point warning there stays in sync with the reference docs.
    NAME = 'mv'
    DESCRIPTION = BasicCommand.FROM_FILE('s3', 'mv', '_description.rst')
    USAGE = (
        "<LocalPath> <S3Uri> or <S3Uri> <LocalPath> "
        "or <S3Uri> <S3Uri>"
    )
    ARG_TABLE = (
        [{'name': 'paths', 'nargs': 2, 'positional_arg': True,
          'synopsis': USAGE}]
        + TRANSFER_ARGS
        + [METADATA, METADATA_DIRECTIVE, RECURSIVE, VALIDATE_SAME_S3_PATHS]
    )


class RmCommand(S3TransferCommand):
NAME = 'rm'
Expand Down Expand Up @@ -1123,7 +1147,8 @@ class CommandParameters(object):
This class is used to do some initial error based on the
parameters and arguments passed to the command line.
"""
def __init__(self, cmd, parameters, usage):
def __init__(self, cmd, parameters, usage,
session=None, parsed_globals=None):
"""
Stores command name and parameters. Ensures that the ``dir_op`` flag
is true if a certain command is being used.
Expand All @@ -1136,6 +1161,8 @@ def __init__(self, cmd, parameters, usage):
self.cmd = cmd
self.parameters = parameters
self.usage = usage
self._session = session
self._parsed_globals = parsed_globals
if 'dir_op' not in parameters:
self.parameters['dir_op'] = False
if 'follow_symlinks' not in parameters:
Expand Down Expand Up @@ -1198,9 +1225,12 @@ def _validate_streaming_paths(self):
def _validate_path_args(self):
# If we're using a mv command, you can't copy the object onto itself.
params = self.parameters
if self.cmd == 'mv' and self._same_path(params['src'], params['dest']):
raise ValueError("Cannot mv a file onto itself: '%s' - '%s'" % (
params['src'], params['dest']))
if self.cmd == 'mv' and params['paths_type']=='s3s3':
self._raise_if_mv_same_paths(params['src'], params['dest'])
if self._should_validate_same_underlying_s3_paths():
self._validate_same_underlying_s3_paths()
if self._should_emit_validate_s3_paths_warning():
self._emit_validate_s3_paths_warning()

# If the user provided local path does not exist, hard fail because
# we know that we will not be able to upload the file.
Expand All @@ -1225,6 +1255,66 @@ def _same_path(self, src, dest):
src_base = os.path.basename(src)
return src == os.path.join(dest, src_base)

def _same_key(self, src, dest):
    # Compare only the key portions of the two S3 URIs; the bucket parts
    # may be access point ARNs/aliases that are resolved separately.  The
    # leading '/' makes the keys comparable via the _same_path check.
    src_key = split_s3_bucket_key(src)[1]
    dest_key = split_s3_bucket_key(dest)[1]
    return self._same_path('/' + src_key, '/' + dest_key)

def _validate_same_s3_paths_enabled(self):
    # Path validation is enabled by either the --validate-same-s3-paths
    # flag or the AWS_CLI_S3_MV_VALIDATE_SAME_S3_PATHS environment
    # variable (parsed as a boolean).
    if self.parameters.get('validate_same_s3_paths'):
        return True
    return ensure_boolean(
        os.environ.get('AWS_CLI_S3_MV_VALIDATE_SAME_S3_PATHS'))

def _should_emit_validate_s3_paths_warning(self):
    # Warn when validation is off but the move could still collapse onto
    # itself: identical keys on both sides and at least one side uses an
    # access-point style path whose real bucket is not visible.
    params = self.parameters
    if self._validate_same_s3_paths_enabled():
        return False
    if not self._same_key(params['src'], params['dest']):
        return False
    return (S3PathResolver.has_underlying_s3_path(params['src']) or
            S3PathResolver.has_underlying_s3_path(params['dest']))

def _emit_validate_s3_paths_warning(self):
    # Best-effort warning on stderr; the mv still proceeds.  Emitted when
    # the src/dest keys match and a path uses an access point ARN/alias
    # but --validate-same-s3-paths was not requested.
    msg = (
        "warning: Provided s3 paths may resolve to same underlying "
        "s3 object(s) and result in deletion instead of being moved. "
        "To resolve and validate underlying s3 paths are not the same, "
        "specify the --validate-same-s3-paths flag or set the "
        "AWS_CLI_S3_MV_VALIDATE_SAME_S3_PATHS environment variable to true. "
        "To resolve s3 outposts access point path, the arn must be "
        "used instead of the alias.\n"
    )
    uni_print(msg, sys.stderr)

def _should_validate_same_underlying_s3_paths(self):
    # Only spend the extra API calls when the user asked for validation
    # and the two keys would actually collide.
    if not self._validate_same_s3_paths_enabled():
        return False
    return self._same_key(self.parameters['src'], self.parameters['dest'])

def _validate_same_underlying_s3_paths(self):
    # Resolve each URI to its underlying bucket path(s) and raise if any
    # source/destination pair refers to the same object.  The source may
    # use its own region (``source_region`` parameter, falling back to
    # the global region); the destination always uses the global region.
    src_region = self.parameters.get(
        'source_region', self._parsed_globals.region)
    resolver = S3PathResolver.from_session(
        self._session, src_region, self._parsed_globals.verify_ssl)
    src_paths = resolver.resolve_underlying_s3_paths(
        self.parameters['src'])
    resolver = S3PathResolver.from_session(
        self._session, self._parsed_globals.region,
        self._parsed_globals.verify_ssl)
    dest_paths = resolver.resolve_underlying_s3_paths(
        self.parameters['dest'])
    for resolved_src in src_paths:
        for resolved_dest in dest_paths:
            self._raise_if_mv_same_paths(resolved_src, resolved_dest)

def _raise_if_mv_same_paths(self, src, dest):
    # NOTE: the comparison uses the (possibly resolved) src/dest
    # arguments, but the error reports the original user-supplied URIs,
    # which are what the user can act on.
    if not self._same_path(src, dest):
        return
    raise ValueError(
        "Cannot mv a file onto itself: {} - {}".format(
            self.parameters['src'], self.parameters['dest']))

def _normalize_s3_trailing_slash(self, paths):
for i, path in enumerate(paths):
if path.startswith('s3://'):
Expand Down
111 changes: 111 additions & 0 deletions awscli/customizations/s3/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -796,3 +796,114 @@ def read(self, amt=None):
return self._fileobj.read()
else:
return self._fileobj.read(amt)


class S3PathResolver:
    """Resolves access-point style S3 URIs to their underlying bucket paths.

    S3 access points, S3 on Outposts access points, and Multi-Region
    Access Points (MRAPs) all alias an underlying bucket.  ``s3 mv`` uses
    this resolver to detect when a source and destination that look
    different actually refer to the same object.
    """
    # Access point ARN; ":" or "/" may separate "accesspoint" from the
    # name, e.g. arn:aws:s3:us-west-2:123456789012:accesspoint/my-ap
    _S3_ACCESSPOINT_ARN_TO_ACCOUNT_NAME_REGEX = re.compile(
        r'^arn:aws.*:s3:[a-z0-9\-]+:(?P<account>[0-9]{12}):accesspoint[:/]'
        r'(?P<name>[a-z0-9\-]{3,50})$'
    )
    # S3 on Outposts access point ARN (contains the outpost id).
    _S3_OUTPOST_ACCESSPOINT_ARN_TO_ACCOUNT_REGEX = re.compile(
        r'^arn:aws.*:s3-outposts:[a-z0-9\-]+:(?P<account>[0-9]{12}):outpost/'
        r'op-[a-zA-Z0-9]+/accesspoint[:/][a-z0-9\-]{3,50}$'
    )
    # Multi-Region Access Point ARN: regionless, alias ends in ".mrap".
    _S3_MRAP_ARN_TO_ACCOUNT_ALIAS_REGEX = re.compile(
        r'^arn:aws:s3::(?P<account>[0-9]{12}):accesspoint[:/]'
        r'(?P<alias>[a-zA-Z0-9]+\.mrap)$'
    )

    def __init__(self, s3control_client, sts_client):
        # s3control resolves access points to bucket names; sts supplies
        # the caller's account id when only an access point alias (which
        # carries no account) is given.
        self._s3control_client = s3control_client
        self._sts_client = sts_client

    @classmethod
    def has_underlying_s3_path(cls, path):
        """Return True if the bucket component of ``path`` is an access
        point ARN or alias that resolves to a different underlying bucket.

        Fix: the first parameter of a ``@classmethod`` is the class and
        should be named ``cls``, not ``self``.
        """
        bucket, _ = split_s3_bucket_key(path)
        return bool(
            cls._S3_ACCESSPOINT_ARN_TO_ACCOUNT_NAME_REGEX.match(bucket) or
            cls._S3_OUTPOST_ACCESSPOINT_ARN_TO_ACCOUNT_REGEX.match(bucket) or
            cls._S3_MRAP_ARN_TO_ACCOUNT_ALIAS_REGEX.match(bucket) or
            # Access point aliases end in "-s3alias"; S3 on Outposts
            # access point aliases end in "--op-s3".
            bucket.endswith('-s3alias') or bucket.endswith('--op-s3'))

    @classmethod
    def from_session(cls, session, region, verify_ssl):
        """Alternate constructor: build the s3control/sts clients from a
        botocore session for the given region and SSL-verification flag."""
        s3control_client = session.create_client(
            's3control',
            region_name=region,
            verify=verify_ssl,
        )
        sts_client = session.create_client(
            'sts',
            verify=verify_ssl,
        )
        return cls(s3control_client, sts_client)

    def resolve_underlying_s3_paths(self, path):
        """Return a list of ``s3://bucket/key`` paths underlying ``path``.

        A plain bucket path resolves to itself (single-element list).  An
        MRAP may resolve to multiple paths, one per regional bucket.

        :raises ValueError: for S3 on Outposts access point *aliases*,
            which cannot be resolved without the ARN.
        """
        bucket, key = split_s3_bucket_key(path)
        match = self._S3_ACCESSPOINT_ARN_TO_ACCOUNT_NAME_REGEX.match(bucket)
        if match:
            return self._resolve_accesspoint_arn(
                match.group('account'), match.group('name'), key
            )
        match = self._S3_OUTPOST_ACCESSPOINT_ARN_TO_ACCOUNT_REGEX.match(bucket)
        if match:
            # For outposts, the full ARN (not just a name) is the
            # access point identifier s3control expects.
            return self._resolve_accesspoint_arn(
                match.group('account'), bucket, key
            )
        match = self._S3_MRAP_ARN_TO_ACCOUNT_ALIAS_REGEX.match(bucket)
        if match:
            return self._resolve_mrap_alias(
                match.group('account'), match.group('alias'), key
            )
        if bucket.endswith('-s3alias'):
            return self._resolve_accesspoint_alias(bucket, key)
        if bucket.endswith('--op-s3'):
            raise ValueError(
                "Can't resolve underlying bucket name of s3 outposts "
                "access point alias. Use arn instead to resolve the "
                "bucket name and validate the mv command."
            )
        return [path]

    def _resolve_accesspoint_arn(self, account, name, key):
        # Resolve an access point (or outposts access point) ARN to its
        # single underlying bucket.
        bucket = self._get_access_point_bucket(account, name)
        return [f"s3://{bucket}/{key}"]

    def _resolve_accesspoint_alias(self, alias, key):
        # Aliases carry no account id, so look up the caller's account
        # via STS first.
        account = self._get_account_id()
        bucket = self._get_access_point_bucket(account, alias)
        return [f"s3://{bucket}/{key}"]

    def _resolve_mrap_alias(self, account, alias, key):
        # An MRAP fronts one bucket per region; return all of them.
        buckets = self._get_mrap_buckets(account, alias)
        return [f"s3://{bucket}/{key}" for bucket in buckets]

    def _get_access_point_bucket(self, account, name):
        # s3control GetAccessPoint returns the backing bucket name.
        return self._s3control_client.get_access_point(
            AccountId=account,
            Name=name
        )['Bucket']

    def _get_account_id(self):
        return self._sts_client.get_caller_identity()['Account']

    def _get_mrap_buckets(self, account, alias):
        # ListMultiRegionAccessPoints is paginated; scan pages until the
        # alias is found or pages run out.
        next_token = None
        while True:
            args = {"AccountId": account}
            if next_token:
                args['NextToken'] = next_token
            response = self._s3control_client.list_multi_region_access_points(
                **args
            )
            for access_point in response['AccessPoints']:
                if access_point['Alias'] == alias:
                    return [
                        region["Bucket"] for region in access_point["Regions"]
                    ]
            next_token = response.get('NextToken')
            if not next_token:
                raise ValueError(
                    "Couldn't find multi-region access point "
                    f"with alias {alias} in account {account}"
                )
13 changes: 13 additions & 0 deletions awscli/examples/s3/mv/_description.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Moves a local file or S3 object to another location locally or in S3.

.. WARNING::
If you are using any type of access point ARNs or access point aliases
in your S3 URIs, you must take extra care to make sure that your source
and destination S3 URIs resolve to different underlying buckets. If the
source and destination buckets are the same, the source file or object
can be moved onto itself, which can result in accidental deletion of
your source file or object.

To verify that the source and destination buckets are not the same,
use the ``--validate-same-s3-paths`` parameter, or set the environment
variable ``AWS_CLI_S3_MV_VALIDATE_SAME_S3_PATHS`` to ``true``.
Loading

0 comments on commit 4e8924a

Please sign in to comment.