
Commit

merge master
zpoint committed Jan 1, 2025
2 parents 1a878bd + 7c33403 commit 6fd05bb
Showing 26 changed files with 1,714 additions and 300 deletions.
48 changes: 24 additions & 24 deletions docs/source/getting-started/tutorial.rst
@@ -2,19 +2,20 @@

Tutorial: AI Training
======================
This example uses SkyPilot to train a Transformer-based language model from HuggingFace.
This example uses SkyPilot to train a GPT-like model (inspired by Karpathy's `minGPT <https://github.com/karpathy/minGPT>`_) with Distributed Data Parallel (DDP) in PyTorch.

First, define a :ref:`task YAML <yaml-spec>` with the resource requirements, the setup commands,
We define a :ref:`task YAML <yaml-spec>` with the resource requirements, the setup commands,
and the commands to run:

.. code-block:: yaml
# dnn.yaml
# train.yaml
name: huggingface
name: minGPT-ddp
resources:
accelerators: V100:4
cpus: 4+
accelerators: L4:4 # Or A100:8, H100:8
# Optional: upload a working directory to remote ~/sky_workdir.
# Commands in "setup" and "run" will be executed under it.
@@ -30,38 +31,37 @@
# ~/.netrc: ~/.netrc
setup: |
set -e # Exit if any command failed.
git clone https://github.com/huggingface/transformers/ || true
cd transformers
pip install .
cd examples/pytorch/text-classification
pip install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
git clone --depth 1 https://github.com/pytorch/examples || true
cd examples
git filter-branch --prune-empty --subdirectory-filter distributed/minGPT-ddp
# SkyPilot's default image on AWS/GCP has CUDA 11.6 (Azure 11.5).
uv pip install -r requirements.txt "numpy<2" "torch==1.12.1+cu113" --extra-index-url https://download.pytorch.org/whl/cu113
run: |
set -e # Exit if any command failed.
cd transformers/examples/pytorch/text-classification
python run_glue.py \
--model_name_or_path bert-base-cased \
--dataset_name imdb \
--do_train \
--max_seq_length 128 \
--per_device_train_batch_size 32 \
--learning_rate 2e-5 \
--max_steps 50 \
--output_dir /tmp/imdb/ --overwrite_output_dir \
--fp16
cd examples/mingpt
export LOGLEVEL=INFO
echo "Starting minGPT-ddp training"
torchrun \
--nproc_per_node=$SKYPILOT_NUM_GPUS_PER_NODE \
main.py
.. tip::

In the YAML, the ``workdir`` and ``file_mounts`` fields are commented out. To
learn about how to use them to mount local dirs/files or object store buckets
(S3, GCS, R2) into your cluster, see :ref:`sync-code-artifacts`.

.. tip::

The ``SKYPILOT_NUM_GPUS_PER_NODE`` environment variable is automatically set by SkyPilot to the number of GPUs per node. See :ref:`env-vars` for more.

Then, launch training:

.. code-block:: console
$ sky launch -c lm-cluster dnn.yaml
$ sky launch -c mingpt train.yaml
This will provision the cheapest cluster with the required resources, execute the setup
commands, then execute the run commands.
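
For context, once the job is launched you can manage it with SkyPilot's standard CLI; a minimal sketch (the cluster name ``mingpt`` matches the ``-c`` flag above):

.. code-block:: console

   $ sky logs mingpt    # Stream the run's output.
   $ sky status         # Show cluster state.
   $ sky down mingpt    # Terminate the cluster when finished.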
4 changes: 4 additions & 0 deletions docs/source/reference/config.rst
@@ -24,6 +24,10 @@ Available fields and semantics:
#
# Ref: https://docs.skypilot.co/en/latest/examples/managed-jobs.html#customizing-job-controller-resources
jobs:
  # Bucket to store managed jobs mount files and tmp files. Bucket must already exist.
  # Optional. If not set, SkyPilot will create a new bucket for each managed job launch.
  # Supports s3://, gs://, https://<azure_storage_account>.blob.core.windows.net/<container>, r2://, cos://<region>/<bucket>
  bucket: s3://my-bucket/
  controller:
    resources:  # same spec as 'resources' in a task YAML
      cloud: gcp
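
For context, the same field accepts any of the URI schemes listed above; a minimal sketch pointing at a GCS bucket instead (the bucket name is an assumption, not from the diff):

.. code-block:: yaml

   jobs:
     bucket: gs://my-jobs-bucket/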
48 changes: 25 additions & 23 deletions docs/source/running-jobs/distributed-jobs.rst
@@ -6,39 +6,40 @@ Distributed Multi-Node Jobs
SkyPilot supports multi-node cluster
provisioning and distributed execution on many nodes.

For example, here is a simple PyTorch Distributed training example:
For example, here is a simple task that trains a GPT-like model (inspired by Karpathy's `minGPT <https://github.com/karpathy/minGPT>`_) across 2 nodes with Distributed Data Parallel (DDP) in PyTorch:

.. code-block:: yaml
:emphasize-lines: 6-6,21-21,23-26
:emphasize-lines: 6,19,23-24,26
name: resnet-distributed-app
name: minGPT-ddp
resources:
accelerators: A100:8
resources:
accelerators: A100:8
num_nodes: 2
num_nodes: 2
setup: |
pip3 install --upgrade pip
git clone https://github.com/michaelzhiluo/pytorch-distributed-resnet
cd pytorch-distributed-resnet
# SkyPilot's default image on AWS/GCP has CUDA 11.6 (Azure 11.5).
pip3 install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
mkdir -p data && mkdir -p saved_models && cd data && \
wget -c --quiet https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
tar -xvzf cifar-10-python.tar.gz
setup: |
git clone --depth 1 https://github.com/pytorch/examples || true
cd examples
git filter-branch --prune-empty --subdirectory-filter distributed/minGPT-ddp
# SkyPilot's default image on AWS/GCP has CUDA 11.6 (Azure 11.5).
uv pip install -r requirements.txt "numpy<2" "torch==1.12.1+cu113" --extra-index-url https://download.pytorch.org/whl/cu113
run: |
cd pytorch-distributed-resnet
run: |
cd examples/mingpt
export LOGLEVEL=INFO
MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
echo "Starting distributed training, head node: $MASTER_ADDR"
MASTER_ADDR=`echo "$SKYPILOT_NODE_IPS" | head -n1`
torchrun \
torchrun \
--nnodes=$SKYPILOT_NUM_NODES \
--master_addr=$MASTER_ADDR \
--nproc_per_node=$SKYPILOT_NUM_GPUS_PER_NODE \
--node_rank=$SKYPILOT_NODE_RANK \
--master_port=12375 \
resnet_ddp.py --num_epochs 20
--master_addr=$MASTER_ADDR \
--node_rank=${SKYPILOT_NODE_RANK} \
--master_port=8008 \
main.py
In the above,

@@ -55,6 +56,7 @@ In the above,

ulimit -n 65535

You can find more `distributed training examples <https://github.com/skypilot-org/skypilot/tree/master/examples/distributed-pytorch>`_ (including `using the rdzv backend for PyTorch <https://github.com/skypilot-org/skypilot/blob/master/examples/distributed-pytorch/train-rdzv.yaml>`_) in our `GitHub repository <https://github.com/skypilot-org/skypilot/tree/master/examples>`_.
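
For reference, a rendezvous-based launch might look like the following sketch; the ``--rdzv_*`` flags are standard ``torchrun`` options, and the endpoint port and job id here are arbitrary choices rather than values taken from the linked example:

.. code-block:: bash

   MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
   torchrun \
     --nnodes=$SKYPILOT_NUM_NODES \
     --nproc_per_node=$SKYPILOT_NUM_GPUS_PER_NODE \
     --rdzv_backend=c10d \
     --rdzv_endpoint=$MASTER_ADDR:29500 \
     --rdzv_id=1 \
     main.py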

Environment variables
-----------------------------------------
19 changes: 0 additions & 19 deletions examples/local_docker/docker_in_docker.yaml

This file was deleted.

22 changes: 0 additions & 22 deletions examples/local_docker/ping.py

This file was deleted.

19 changes: 0 additions & 19 deletions examples/local_docker/ping.yaml

This file was deleted.

35 changes: 35 additions & 0 deletions examples/oci/dataset-mount.yaml
@@ -0,0 +1,35 @@
name: cpu-task1

resources:
  cloud: oci
  region: us-sanjose-1
  cpus: 2
  disk_size: 256
  disk_tier: medium
  use_spot: False

file_mounts:
  # Mount an existing oci bucket
  /datasets-storage:
    source: oci://skybucket
    mode: MOUNT  # Either MOUNT or COPY. Optional.

# Working directory (optional) containing the project codebase.
# Its contents are synced to ~/sky_workdir/ on the cluster.
workdir: .

num_nodes: 1

# Typical use: pip install -r requirements.txt
# Invoked under the workdir (i.e., can use its files).
setup: |
  echo "*** Running setup for the task. ***"

# Typical use: make use of resources, such as running training.
# Invoked under the workdir (i.e., can use its files).
run: |
  echo "*** Running the task on OCI ***"
  timestamp=$(date +%s)
  ls -lthr /datasets-storage
  echo "hi" >> /datasets-storage/foo.txt
  ls -lthr /datasets-storage
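
For context, a minimal sketch of launching this example from a SkyPilot checkout (the cluster name ``oci-demo`` is an arbitrary choice):

.. code-block:: console

   $ sky launch -c oci-demo examples/oci/dataset-mount.yaml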
47 changes: 47 additions & 0 deletions examples/oci/dataset-upload-and-mount.yaml
@@ -0,0 +1,47 @@
name: cpu-task1

resources:
  cloud: oci
  region: us-sanjose-1
  cpus: 2
  disk_size: 256
  disk_tier: medium
  use_spot: False

file_mounts:
  /datasets-storage:
    name: skybucket  # Name of storage, optional when source is bucket URI
    source: ['./examples/oci']  # Source path, can be local or bucket URL. Optional, do not specify to create an empty bucket.
    store: oci  # E.g. 'oci', 's3', 'gcs', etc.; default: None. Optional.
    persistent: True  # Defaults to True; can be set to false. Optional.
    mode: MOUNT  # Either MOUNT or COPY. Optional.

  /datasets-storage2:
    name: skybucket2  # Name of storage, optional when source is bucket URI
    source: './examples/oci'  # Source path, can be local or bucket URL. Optional, do not specify to create an empty bucket.
    store: oci  # E.g. 'oci', 's3', 'gcs', etc.; default: None. Optional.
    persistent: True  # Defaults to True; can be set to false. Optional.
    mode: MOUNT  # Either MOUNT or COPY. Optional.

# Working directory (optional) containing the project codebase.
# Its contents are synced to ~/sky_workdir/ on the cluster.
workdir: .

num_nodes: 1

# Typical use: pip install -r requirements.txt
# Invoked under the workdir (i.e., can use its files).
setup: |
  echo "*** Running setup for the task. ***"

# Typical use: make use of resources, such as running training.
# Invoked under the workdir (i.e., can use its files).
run: |
  echo "*** Running the task on OCI ***"
  ls -lthr /datasets-storage
  echo "hi" >> /datasets-storage/foo.txt
  ls -lthr /datasets-storage
  ls -lthr /datasets-storage2
  echo "hi" >> /datasets-storage2/foo2.txt
  ls -lthr /datasets-storage2
26 changes: 26 additions & 0 deletions examples/oci/oci-mounts.yaml
@@ -0,0 +1,26 @@
resources:
  cloud: oci

file_mounts:
  ~/tmpfile: ~/tmpfile
  ~/a/b/c/tmpfile: ~/tmpfile
  /tmp/workdir: ~/tmp-workdir

  /mydir:
    name: skybucket
    source: ['~/tmp-workdir']
    store: oci
    mode: MOUNT

setup: |
  echo "*** Setup ***"

run: |
  echo "*** Run ***"
  ls -lthr ~/tmpfile
  ls -lthr ~/a/b/c
  echo hi >> /tmp/workdir/new_file
  ls -lthr /tmp/workdir
  ls -lthr /mydir
33 changes: 32 additions & 1 deletion sky/adaptors/oci.py
@@ -1,9 +1,11 @@
"""Oracle OCI cloud adaptor"""

import functools
import logging
import os

from sky.adaptors import common
from sky.clouds.utils import oci_utils

# Suppress OCI circuit breaker logging before lazy import, because
# oci modules print additional messages during imports, i.e., the
@@ -30,10 +32,16 @@ def get_config_file() -> str:

def get_oci_config(region=None, profile='DEFAULT'):
    conf_file_path = get_config_file()
    if not profile or profile == 'DEFAULT':
        config_profile = oci_utils.oci_config.get_profile()
    else:
        config_profile = profile

    oci_config = oci.config.from_file(file_location=conf_file_path,
                                      profile_name=profile)
                                      profile_name=config_profile)
    if region is not None:
        oci_config['region'] = region

    return oci_config


@@ -54,6 +62,29 @@ def get_identity_client(region=None, profile='DEFAULT'):
    return oci.identity.IdentityClient(get_oci_config(region, profile))


def get_object_storage_client(region=None, profile='DEFAULT'):
    return oci.object_storage.ObjectStorageClient(
        get_oci_config(region, profile))


def service_exception():
    """OCI service exception."""
    return oci.exceptions.ServiceError


def with_oci_env(f):

    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        # pylint: disable=line-too-long
        enter_env_cmds = [
            'conda info --envs | grep "sky-oci-cli-env" || conda create -n sky-oci-cli-env python=3.10 -y',
            '. $(conda info --base 2> /dev/null)/etc/profile.d/conda.sh > /dev/null 2>&1 || true',
            'conda activate sky-oci-cli-env', 'pip install oci-cli',
            'export OCI_CLI_SUPPRESS_FILE_PERMISSIONS_WARNING=True'
        ]
        operation_cmd = [f(*args, **kwargs)]
        leave_env_cmds = ['conda deactivate']
        return ' && '.join(enter_env_cmds + operation_cmd + leave_env_cmds)

    return wrapper
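
For context, a hypothetical usage sketch of ``with_oci_env`` (the decorated function and bucket name are illustrative, not part of this diff): the decorator turns a function that builds a raw OCI CLI command into one that returns the full ``&&``-chained shell line, including the conda environment bootstrap and teardown.

.. code-block:: python

   @with_oci_env
   def list_objects_cmd(bucket: str) -> str:
       # Raw OCI CLI invocation; with_oci_env prepends the
       # 'sky-oci-cli-env' conda setup ('pip install oci-cli', etc.)
       # and appends 'conda deactivate'.
       return f'oci os object list --bucket-name {bucket}'

   # list_objects_cmd('skybucket') returns a single ' && '-joined shell
   # command string, ready to run on the remote node.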
