Skip to content

Commit

Permalink
deploy: bfbea22
Browse files (browse the repository at this point in the history)
  • Loading branch information
wangzheng422 committed Jan 7, 2025
1 parent c51b475 commit cee96c0
Showing 1 changed file with 35 additions and 10 deletions.
45 changes: 35 additions & 10 deletions ocp4/4.16/files/ray.notebook/2.5_run.llama.factory.ipynb
Original file line number | Diff line number | Diff line change
Expand Up @@ -16,7 +16,11 @@
"outputs": [],
"source": [
"# Import pieces from codeflare-sdk\n",
"from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication"
"from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication\n",
"\n",
"# import os\n",
"\n",
"# %env RAY_TLS_VERIFY=0"
]
},
{
Expand Down Expand Up @@ -75,9 +79,10 @@
" worker_cpu_limits=1,\n",
" worker_memory_requests=4,\n",
" worker_memory_limits=6,\n",
" image=\"quay.io/wangzheng422/qimgs:llama-factory-ray-20250103-v01\", # Optional Field \n",
" image=\"quay.io/wangzheng422/qimgs:llama-factory-ray-20250106-v06\", # Optional Field \n",
" write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
" # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
" verify_tls=False,\n",
"))"
]
},
Expand Down Expand Up @@ -158,6 +163,9 @@
"assert ray_cluster_uri, \"Ray cluster needs to be started and set before proceeding\"\n",
"\n",
"import ray\n",
"import os\n",
"\n",
"%env RAY_TLS_VERIFY=0\n",
"\n",
"# reset the ray context in case there's already one. \n",
"ray.shutdown()\n",
Expand All @@ -168,7 +176,7 @@
"runtime_env = {}\n",
"# NOTE: This will work for in-cluster notebook servers (RHODS/ODH), but not for local machines\n",
"# To see how to connect from your laptop, go to demo-notebooks/additional-demos/local_interactive.ipynb\n",
"ray.init(address=ray_cluster_uri, runtime_env=runtime_env)\n",
"ray.init(address=ray_cluster_uri, runtime_env=runtime_env, ignore_reinit_error=True, _ssl_verify=False)\n",
"\n",
"print(\"Ray cluster is up and running: \", ray.is_initialized())"
]
Expand Down Expand Up @@ -204,14 +212,28 @@
" return \"IP address not found\"\n",
" except subprocess.CalledProcessError as e:\n",
" return f\"Error getting IP address: {e}\"\n",
" def execute_short_command(self, ip_address, nnodes, node_rank):\n",
" command = f'source /opt/py_env/bin/activate; cd /app; llamafactory-cli train wzh/tinyllama_lora_sft.yaml'\n",
" return self._run_command_in_host_env(command)\n",
"\n",
" def execute_command(self, ip_address, nnodes, node_rank):\n",
" command = f'FORCE_TORCHRUN=1 NNODES={nnodes} NODE_RANK={node_rank} MASTER_ADDR={ip_address} MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml'\n",
" command = f'source /opt/py_env/bin/activate; cd /app; FORCE_TORCHRUN=1 NNODES={nnodes} NODE_RANK={node_rank} MASTER_ADDR={ip_address} MASTER_PORT=29500 NPROC_PER_NODE=1 llamafactory-cli train wzh/tinyllama_lora_sft.yaml'\n",
" try:\n",
" os.system(command)\n",
" return \"Command executed successfully\"\n",
" except Exception as e:\n",
" return f\"Error executing command: {e}\""
" return f\"Error executing command: {e}\"\n",
"\n",
" def _run_command_in_host_env(self, command):\n",
" try:\n",
" # Run the command in a new shell\n",
" result = subprocess.run(command, shell=True, capture_output=True, text=True)\n",
" if result.returncode == 0:\n",
" return result.stdout # Command executed successfully\n",
" else:\n",
" return f\"Error executing command: {result.stderr}\"\n",
" except Exception as e:\n",
" return f\"Unexpected error: {e}\"\n"
]
},
{
Expand Down Expand Up @@ -244,12 +266,15 @@
"node_rank1 = 0 # Rank for actor 1\n",
"node_rank2 = 1 # Rank for actor 2\n",
"\n",
"# Example of executing command with actor 1's IP\n",
"result1 = ray.get(actor1.execute_command.remote(ip1, nnodes, node_rank1))\n",
"print(f\"Actor 1 command result: {result1}\")\n",
"# Call the remote functions to execute commands\n",
"result1_future = actor1.execute_command.remote(ip1, nnodes, node_rank1)\n",
"result2_future = actor2.execute_command.remote(ip2, nnodes, node_rank2)\n",
"\n",
"# Example of executing command with actor 2's IP\n",
"result2 = ray.get(actor2.execute_command.remote(ip2, nnodes, node_rank2))\n",
"# Retrieve the results (will wait for both to complete)\n",
"result1 = ray.get(result1_future)\n",
"result2 = ray.get(result2_future)\n",
"\n",
"print(f\"Actor 1 command result: {result1}\")\n",
"print(f\"Actor 2 command result: {result2}\")"
]
},
Expand Down

0 comments on commit cee96c0

Please sign in to comment.