YAML signing (Azure#38855)
* first draft : YAML signing

* YAML signing

* component ops adding prepare for sign

* resolving comments

* fixing pylint and black
kshitij-microsoft authored Jan 2, 2025
1 parent cf28b06 commit 596b2c0
Showing 5 changed files with 208 additions and 15 deletions.
58 changes: 54 additions & 4 deletions sdk/ml/azure-ai-ml/azure/ai/ml/_utils/_asset_utils.py
@@ -7,6 +7,7 @@
import hashlib
import logging
import os
import json
import uuid
import warnings
from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -15,7 +16,17 @@
from os import PathLike
from pathlib import Path
from platform import system
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union, cast
from typing import (
TYPE_CHECKING,
Any,
Dict,
Iterable,
List,
Optional,
Tuple,
Union,
cast,
)

from colorama import Fore
from tqdm import TqdmWarning, tqdm
@@ -56,7 +67,11 @@
from azure.ai.ml._restclient.v2023_04_01.models import PendingUploadRequestDto
from azure.ai.ml._utils._pathspec import GitWildMatchPattern, normalize_file
from azure.ai.ml._utils.utils import convert_windows_path_to_unix, retry, snake_to_camel
from azure.ai.ml.constants._common import MAX_AUTOINCREMENT_ATTEMPTS, DefaultOpenEncoding, OrderString
from azure.ai.ml.constants._common import (
MAX_AUTOINCREMENT_ATTEMPTS,
DefaultOpenEncoding,
OrderString,
)
from azure.ai.ml.entities._assets.asset import Asset
from azure.ai.ml.exceptions import (
AssetPathException,
@@ -247,6 +262,33 @@ def _get_file_hash(filename: Union[str, os.PathLike], _hash: hash_type) -> hash_type:
return _hash


def delete_two_catalog_files(path):
    """
    Delete the "catalog.json" and "catalog.json.sig" files at `path`, if they exist.

    :param path: Path to the folder for signing
    :type path: Union[Path, str]
    :return: None
    """
    # catalog.json
    file_path_json = os.path.join(path, "catalog.json")
    if os.path.exists(file_path_json):
        module_logger.warning("%s already exists. Deleting it", file_path_json)
        os.remove(file_path_json)
    # catalog.json.sig
    file_path_json_sig = os.path.join(path, "catalog.json.sig")
    if os.path.exists(file_path_json_sig):
        module_logger.warning("%s already exists. Deleting it", file_path_json_sig)
        os.remove(file_path_json_sig)


def create_catalog_files(path, json_stub):
    """
    Write `json_stub` to both "catalog.json" and "catalog.json.sig" under `path`.

    :param path: Path to the folder for signing
    :type path: Union[Path, str]
    :param json_stub: Catalog dictionary to serialize into both files
    :type json_stub: Dict
    :return: None
    """
    with open(os.path.join(path, "catalog.json"), "w", encoding=DefaultOpenEncoding.WRITE) as json_file:
        json.dump(json_stub, json_file)
    with open(os.path.join(path, "catalog.json.sig"), "w", encoding=DefaultOpenEncoding.WRITE) as sig_file:
        json.dump(json_stub, sig_file)


def _get_dir_hash(directory: Union[str, os.PathLike], _hash: hash_type, ignore_file: IgnoreFile) -> hash_type:
dir_contents = Path(directory).iterdir()
sorted_contents = sorted(dir_contents, key=lambda path: str(path).lower())
@@ -349,7 +391,10 @@ def get_content_hash(path: Union[str, os.PathLike], ignore_file: IgnoreFile = IgnoreFile()


def get_upload_files_from_folder(
path: Union[str, os.PathLike], *, prefix: str = "", ignore_file: IgnoreFile = IgnoreFile()
path: Union[str, os.PathLike],
*,
prefix: str = "",
ignore_file: IgnoreFile = IgnoreFile(),
) -> List[str]:
path = Path(path)
upload_paths = []
@@ -432,7 +477,12 @@ def traverse_directory(  # pylint: disable=unused-argument
result = []
for origin_file_path in origin_file_paths:
relative_path = origin_file_path.relative_to(root)
result.append((_resolve_path(origin_file_path).as_posix(), Path(prefix).joinpath(relative_path).as_posix()))
result.append(
(
_resolve_path(origin_file_path).as_posix(),
Path(prefix).joinpath(relative_path).as_posix(),
)
)
return result


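For orientation, here is a minimal standalone sketch of the catalog stub these two helpers manage together. It is a simplification, not the SDK code path: it only walks the top level of a folder and applies no ignore-file filtering.

# Standalone sketch of the catalog stub (assumptions: top-level files only,
# no .amlignore/.gitignore handling, UTF-8 output).
import hashlib
import json
import os


def _sha256_hex_upper(file_path):
    """Hash a file in chunks and return the uppercase hex digest."""
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        for chunk in iter(lambda: stream.read(8192), b""):
            digest.update(chunk)
    return digest.hexdigest().upper()


def write_catalog_stub(folder):
    """Write catalog.json and an identical catalog.json.sig for files in `folder`."""
    catalog = {"HashAlgorithm": "SHA256", "CatalogItems": {}}
    for name in sorted(os.listdir(folder), key=str.lower):
        full_path = os.path.join(folder, name)
        if os.path.isfile(full_path) and name not in ("catalog.json", "catalog.json.sig"):
            catalog["CatalogItems"][name] = _sha256_hex_upper(full_path)
    for stub_name in ("catalog.json", "catalog.json.sig"):
        with open(os.path.join(folder, stub_name), "w", encoding="utf-8") as stream:
            json.dump(catalog, stream)
    return catalog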
85 changes: 74 additions & 11 deletions sdk/ml/azure-ai-ml/azure/ai/ml/operations/_component_operations.py
@@ -4,31 +4,48 @@

# pylint: disable=protected-access,too-many-lines
import time
import collections
import types
from functools import partial
from inspect import Parameter, signature
from os import PathLike
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast
import hashlib

from azure.ai.ml._restclient.v2021_10_01_dataplanepreview import (
AzureMachineLearningWorkspaces as ServiceClient102021Dataplane,
)
from azure.ai.ml._restclient.v2024_01_01_preview import AzureMachineLearningWorkspaces as ServiceClient012024
from azure.ai.ml._restclient.v2024_01_01_preview.models import ComponentVersion, ListViewType
from azure.ai.ml._restclient.v2024_01_01_preview import (
AzureMachineLearningWorkspaces as ServiceClient012024,
)
from azure.ai.ml._restclient.v2024_01_01_preview.models import (
ComponentVersion,
ListViewType,
)
from azure.ai.ml._scope_dependent_operations import (
OperationConfig,
OperationsContainer,
OperationScope,
_ScopeDependentOperations,
)
from azure.ai.ml._telemetry import ActivityType, monitor_with_activity, monitor_with_telemetry_mixin
from azure.ai.ml._telemetry import (
ActivityType,
monitor_with_activity,
monitor_with_telemetry_mixin,
)
from azure.ai.ml._utils._asset_utils import (
_archive_or_restore,
_create_or_update_autoincrement,
_get_file_hash,
_get_latest,
_get_next_version_from_container,
_resolve_label_to_asset,
get_ignore_file,
get_upload_files_from_folder,
IgnoreFile,
delete_two_catalog_files,
create_catalog_files,
)
from azure.ai.ml._utils._azureml_polling import AzureMLPolling
from azure.ai.ml._utils._endpoint_utils import polling_wait
@@ -42,7 +59,12 @@
LROConfigurations,
)
from azure.ai.ml.entities import Component, ValidationResult
from azure.ai.ml.exceptions import ComponentException, ErrorCategory, ErrorTarget, ValidationException
from azure.ai.ml.exceptions import (
ComponentException,
ErrorCategory,
ErrorTarget,
ValidationException,
)
from azure.core.exceptions import HttpResponseError, ResourceNotFoundError

from .._utils._cache_utils import CachedNodeResolver
@@ -282,7 +304,8 @@ def _localize_code(self, component: Component, base_dir: Path) -> None:

target_code_value = "./code"
self._code_operations.download(
**extract_name_and_version(code), download_path=base_dir.joinpath(target_code_value)
**extract_name_and_version(code),
download_path=base_dir.joinpath(target_code_value),
)

setattr(component, component._get_code_field_name(), target_code_value)
@@ -311,7 +334,13 @@ def _localize_environment(self, component: Component, base_dir: Path) -> None:

@experimental
@monitor_with_telemetry_mixin(ops_logger, "Component.Download", ActivityType.PUBLICAPI)
def download(self, name: str, download_path: Union[PathLike, str] = ".", *, version: Optional[str] = None) -> None:
def download(
self,
name: str,
download_path: Union[PathLike, str] = ".",
*,
version: Optional[str] = None,
) -> None:
"""Download the specified component and its dependencies to local. Local component can be used to create
the component in another workspace or for offline development.
@@ -491,7 +520,11 @@ def _reset_version_if_no_change(self, component: Component, current_name: str, c
return current_version, rest_component_resource

def _create_or_update_component_version(
self, component: Component, name: str, version: Optional[str], rest_component_resource: Any
self,
component: Component,
name: str,
version: Optional[str],
rest_component_resource: Any,
) -> Any:
try:
if self._registry_name:
Expand Down Expand Up @@ -652,6 +685,28 @@ def create_or_update(
)
return component

@experimental
def prepare_for_sign(self, component: Component) -> None:
    """Create "catalog.json" and "catalog.json.sig" stubs in the component's code folder,
    mapping every file that would be uploaded to its uppercase SHA256 hex digest.

    :param component: The component whose code folder is prepared for signing.
    :type component: Component
    """
    ignore_file = IgnoreFile()

    if isinstance(component, ComponentCodeMixin):
        with component._build_code() as code:
            delete_two_catalog_files(code.path)
            ignore_file = get_ignore_file(code.path) if code._ignore_file is None else code._ignore_file
            file_list = get_upload_files_from_folder(code.path, ignore_file=ignore_file)
            json_stub: Dict[str, Any] = {"HashAlgorithm": "SHA256", "CatalogItems": {}}

            for file_path, file_name in sorted(file_list, key=lambda x: str(x[1]).lower()):
                file_hash = _get_file_hash(file_path, hashlib.sha256()).hexdigest().upper()
                json_stub["CatalogItems"][file_name] = file_hash

            json_stub["CatalogItems"] = collections.OrderedDict(sorted(json_stub["CatalogItems"].items()))
            create_catalog_files(code.path, json_stub)

@monitor_with_telemetry_mixin(ops_logger, "Component.Archive", ActivityType.PUBLICAPI)
def archive(
self,
@@ -860,7 +915,9 @@ def _resolve_binding_on_supported_fields_for_node(cls, node: BaseNode) -> None:
:param node: The node
:type node: BaseNode
"""
from azure.ai.ml.entities._job.pipeline._attr_dict import try_get_non_arbitrary_attr
from azure.ai.ml.entities._job.pipeline._attr_dict import (
try_get_non_arbitrary_attr,
)
from azure.ai.ml.entities._job.pipeline._io import PipelineInput

# compute binding to pipeline input is supported on node.
@@ -968,7 +1025,9 @@ def _try_resolve_compute_for_node(cls, node: BaseNode, _: str, resolver: _AssetR

@classmethod
def _divide_nodes_to_resolve_into_layers(
cls, component: PipelineComponent, extra_operations: List[Callable[[BaseNode, str], Any]]
cls,
component: PipelineComponent,
extra_operations: List[Callable[[BaseNode, str], Any]],
) -> List:
"""Traverse the pipeline component and divide nodes to resolve into layers. Note that all leaf nodes will be
put in the last layer.
@@ -1029,7 +1088,8 @@ def _divide_nodes_to_resolve_into_layers(
def _get_workspace_key(self) -> str:
try:
workspace_rest = self._workspace_operations._operation.get(
resource_group_name=self._resource_group_name, workspace_name=self._workspace_name
resource_group_name=self._resource_group_name,
workspace_name=self._workspace_name,
)
return str(workspace_rest.workspace_id)
except HttpResponseError:
@@ -1099,7 +1159,10 @@ def _resolve_dependencies_for_pipeline_component_jobs(
extra_operations=[
# no need to do this as we now keep the original component name for anonymous components
# self._set_default_display_name_for_anonymous_component_in_node,
partial(self._try_resolve_node_level_task_for_parallel_node, resolver=resolver),
partial(
self._try_resolve_node_level_task_for_parallel_node,
resolver=resolver,
),
partial(self._try_resolve_environment_for_component, resolver=resolver),
partial(self._try_resolve_compute_for_node, resolver=resolver),
# should we resolve code here after we do extra operations concurrently?
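For reference, the stub that the new prepare_for_sign method leaves in the component's code folder has this shape. The file names and digest values below are hypothetical placeholders, not output from this commit.

# Hypothetical contents of catalog.json (catalog.json.sig is identical before
# signing). Keys are upload-relative paths sorted case-insensitively; values
# are uppercase SHA256 hex digests.
expected_catalog = {
    "HashAlgorithm": "SHA256",
    "CatalogItems": {
        "train.py": "<UPPERCASE-SHA256-HEX-DIGEST>",
        "utils/io.py": "<UPPERCASE-SHA256-HEX-DIGEST>",
    },
}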
29 changes: 29 additions & 0 deletions sdk/ml/azure-ai-ml/samples/hello.py
@@ -0,0 +1,29 @@
import argparse
import os
from datetime import datetime

parser = argparse.ArgumentParser()
parser.add_argument("--componentB_input", type=str)
parser.add_argument("--componentB_output", type=str)

print("Hello Python World...\nI'm componentB :-)")

args = parser.parse_args()

print("componentB_input path: %s" % args.componentB_input)
print("componentB_output path: %s" % args.componentB_output)

print("files in input path: ")
arr = os.listdir(args.componentB_input)
print(arr)

for filename in arr:
    print("reading file: %s ..." % filename)
    with open(os.path.join(args.componentB_input, filename), "r") as handle:
        print(handle.read())

cur_time_str = datetime.now().strftime("%b-%d-%Y-%H-%M-%S")

print("Writing file: %s" % os.path.join(args.componentB_output, "file-" + cur_time_str + ".txt"))
with open(os.path.join(args.componentB_output, "file-" + cur_time_str + ".txt"), "wt") as text_file:
    print(f"Logging date time: {cur_time_str}", file=text_file)
20 changes: 20 additions & 0 deletions sdk/ml/azure-ai-ml/samples/job.yaml
@@ -0,0 +1,20 @@
$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
code: ../src
command: >-
python main.py train_check --config ${{inputs.data}}/model.yaml --train ${{inputs.data}}/train.csv --sanity-check ${{inputs.data}}/sanity_check.csv --min-accuracy 0.99 --min-precision 0.95 --min-recall 0.95 --model-dir ${{outputs.model}}
inputs:
data:
path: .
mode: download
outputs:
model:
type: uri_folder
environment:
image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04
conda_file: ../src/environment.yml
environment_variables:
AZUREML_COMMON_RUNTIME_USE_SBOM_CAPABILITY: "true"
compute: azureml:gpu-t4-spot-vpn
display_name: Compete
experiment_name: sensei-compete
description: Sensei Compete Model
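If you want to submit this sample job from Python, one possible route is load_job plus jobs.create_or_update; this is a sketch, assuming an authenticated MLClient and that the path points at the YAML above.

# Sketch: load the sample YAML and submit it. Assumes `ml_client` is an
# authenticated MLClient and the path points at the job.yaml above.
from azure.ai.ml import load_job

job = load_job("sdk/ml/azure-ai-ml/samples/job.yaml")
# submitted_job = ml_client.jobs.create_or_update(job)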
31 changes: 31 additions & 0 deletions sdk/ml/azure-ai-ml/samples/ml_samples_test_prepForSign.py
@@ -0,0 +1,31 @@
from azure.identity import DefaultAzureCredential

from azure.ai.ml import MLClient
from azure.ai.ml.entities._load_functions import load_component

subscription_id = "2d385bf4-0756-4a76-aa95-28bf9ed3b625"
resource_group = "sdkv2-20240925-rg"
workspace_name = "sdkv2-20240925-ws"


credential = DefaultAzureCredential()

print(credential)
ml_client = MLClient(
    credential=credential,
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace_name,
)

component = load_component(
    "C:\\Projects\\azure-sdk-for-python\\sdk\\ml\\azure-ai-ml\\azure\\ai\\ml\\YAMLsigning\\sum1.yaml"
)
ml_client.components.prepare_for_sign(component)
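A possible follow-up check after the sample runs (not part of this commit; the folder path is a placeholder):

# Hypothetical verification: read back the generated stub from the component's
# code folder and confirm each entry looks like an uppercase SHA256 digest.
import json
from pathlib import Path

code_dir = Path("<component-code-folder>")  # placeholder
catalog = json.loads((code_dir / "catalog.json").read_text())
assert catalog["HashAlgorithm"] == "SHA256"
for file_name, digest in catalog["CatalogItems"].items():
    assert len(digest) == 64 and digest == digest.upper()
    print(file_name, digest)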
