deploy: bfbea22

wangzheng422 · Jan 7, 2025 · cee96c0 · cee96c0
1 parent c51b475
commit cee96c0
Showing 1 changed file with 35 additions and 10 deletions.
diff --git a/ocp4/4.16/files/ray.notebook/2.5_run.llama.factory.ipynb b/ocp4/4.16/files/ray.notebook/2.5_run.llama.factory.ipynb
@@ -16,7 +16,11 @@
    "outputs": [],
    "source": [
     "# Import pieces from codeflare-sdk\n",
-    "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication"
+    "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication\n",
+    "\n",
+    "# import os\n",
+    "\n",
+    "# %env RAY_TLS_VERIFY=0"
    ]
   },
   {
@@ -75,9 +79,10 @@
     "    worker_cpu_limits=1,\n",
     "    worker_memory_requests=4,\n",
     "    worker_memory_limits=6,\n",
-    "    image=\"quay.io/wangzheng422/qimgs:llama-factory-ray-20250103-v01\", # Optional Field \n",
+    "    image=\"quay.io/wangzheng422/qimgs:llama-factory-ray-20250106-v06\", # Optional Field \n",
     "    write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
     "    # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
+    "    verify_tls=False,\n",
     "))"
    ]
   },
@@ -158,6 +163,9 @@
     "assert ray_cluster_uri, \"Ray cluster needs to be started and set before proceeding\"\n",
     "\n",
     "import ray\n",
+    "import os\n",
+    "\n",
+    "%env RAY_TLS_VERIFY=0\n",
     "\n",
     "# reset the ray context in case there's already one. \n",
     "ray.shutdown()\n",
@@ -168,7 +176,7 @@
     "runtime_env = {}\n",
     "# NOTE: This will work for in-cluster notebook servers (RHODS/ODH), but not for local machines\n",
     "# To see how to connect from your laptop, go to demo-notebooks/additional-demos/local_interactive.ipynb\n",
-    "ray.init(address=ray_cluster_uri, runtime_env=runtime_env)\n",
+    "ray.init(address=ray_cluster_uri, runtime_env=runtime_env, ignore_reinit_error=True, _ssl_verify=False)\n",
     "\n",
     "print(\"Ray cluster is up and running: \", ray.is_initialized())"
    ]
@@ -204,14 +212,28 @@
     "                return \"IP address not found\"\n",
     "        except subprocess.CalledProcessError as e:\n",
     "            return f\"Error getting IP address: {e}\"\n",
+    "    def execute_short_command(self, ip_address, nnodes, node_rank):\n",
+    "        command = f'source /opt/py_env/bin/activate; cd /app; llamafactory-cli train wzh/tinyllama_lora_sft.yaml'\n",
+    "        return self._run_command_in_host_env(command)\n",
     "\n",
     "    def execute_command(self, ip_address, nnodes, node_rank):\n",
-    "        command = f'FORCE_TORCHRUN=1 NNODES={nnodes} NODE_RANK={node_rank} MASTER_ADDR={ip_address} MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml'\n",
+    "        command = f'source /opt/py_env/bin/activate; cd /app; FORCE_TORCHRUN=1 NNODES={nnodes} NODE_RANK={node_rank} MASTER_ADDR={ip_address} MASTER_PORT=29500 NPROC_PER_NODE=1 llamafactory-cli train wzh/tinyllama_lora_sft.yaml'\n",
     "        try:\n",
     "            os.system(command)\n",
     "            return \"Command executed successfully\"\n",
     "        except Exception as e:\n",
-    "            return f\"Error executing command: {e}\""
+    "            return f\"Error executing command: {e}\"\n",
+    "\n",
+    "    def _run_command_in_host_env(self, command):\n",
+    "        try:\n",
+    "            # Run the command in a new shell\n",
+    "            result = subprocess.run(command, shell=True, capture_output=True, text=True)\n",
+    "            if result.returncode == 0:\n",
+    "                return result.stdout  # Command executed successfully\n",
+    "            else:\n",
+    "                return f\"Error executing command: {result.stderr}\"\n",
+    "        except Exception as e:\n",
+    "            return f\"Unexpected error: {e}\"\n"
    ]
   },
   {
@@ -244,12 +266,15 @@
     "node_rank1 = 0  # Rank for actor 1\n",
     "node_rank2 = 1  # Rank for actor 2\n",
     "\n",
-    "# Example of executing command with actor 1's IP\n",
-    "result1 = ray.get(actor1.execute_command.remote(ip1, nnodes, node_rank1))\n",
-    "print(f\"Actor 1 command result: {result1}\")\n",
+    "# Call the remote functions to execute commands\n",
+    "result1_future = actor1.execute_command.remote(ip1, nnodes, node_rank1)\n",
+    "result2_future = actor2.execute_command.remote(ip2, nnodes, node_rank2)\n",
     "\n",
-    "# Example of executing command with actor 2's IP\n",
-    "result2 = ray.get(actor2.execute_command.remote(ip2, nnodes, node_rank2))\n",
+    "# Retrieve the results (will wait for both to complete)\n",
+    "result1 = ray.get(result1_future)\n",
+    "result2 = ray.get(result2_future)\n",
+    "\n",
+    "print(f\"Actor 1 command result: {result1}\")\n",
     "print(f\"Actor 2 command result: {result2}\")"
    ]
   },