diff --git a/source/examples/xgboost-azure-mnmg-daskcloudprovider/notebook.ipynb b/source/examples/xgboost-azure-mnmg-daskcloudprovider/notebook.ipynb index 9ed080b9..9b92e48b 100644 --- a/source/examples/xgboost-azure-mnmg-daskcloudprovider/notebook.ipynb +++ b/source/examples/xgboost-azure-mnmg-daskcloudprovider/notebook.ipynb @@ -248,7 +248,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -259,8 +259,8 @@ "vm_size = \"Standard_NC12s_v3\" # or choose a different GPU enabled VM type\n", "\n", "docker_image = (\n", - " # \"nvcr.io/nvidia/rapidsai/rapidsai-core:23.06-cuda11.8-runtime-ubuntu22.04-py3.10\"\n", " \"rapidsai/base:23.08-cuda12.0-py3.10\"\n", + " # nvcr.io/nvidia/rapidsai/base:23.08-cuda12.0-py3.10\n", ")\n", "docker_args = \"--shm-size=256m\"\n", "worker_class = \"dask_cuda.CUDAWorker\"" @@ -289,7 +289,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -314,7 +314,7 @@ " 'version': '23.03.0'}}" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -345,7 +345,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -362,19 +362,19 @@ " \"privacyPolicyLink\": \"https://www.nvidia.com/en-us/about-nvidia/privacy-policy/\",\n", " \"product\": \"ngc_azure_17_11\",\n", " \"publisher\": \"nvidia\",\n", - " \"retrieveDatetime\": \"2023-08-29T18:09:42.2592493Z\",\n", - " \"signature\": \"RS4B4LMK6XWI6GNY56XH34EJDIUOSPVDCSLSAPLNIQTEPUFPG37UKLMGALXEJOHX2JHPOGZQLMIZWZ74PW7X55SS43MGT7CP3RCBFRI\",\n", + " \"retrieveDatetime\": \"2023-09-21T19:29:04.8722893Z\",\n", + " \"signature\": \"TNRP4JBL7RC3YQ232RKEGQ7HVTAYFFXWVMVCOQJ3TGCCYSQY4IVQBN7WMYHRW7NUOGSSDQOPJCYHV3EFCTWDR5JLCR4WLRMPMBLQBCY\",\n", " \"systemData\": {\n", - " \"createdAt\": \"2023-08-29T18:09:44.451002+00:00\",\n", + " \"createdAt\": \"2023-09-21T19:29:08.170175+00:00\",\n", " \"createdBy\": \"fc4f4a6b-4041-4b1c-8249-854d68edcf62\",\n", " \"createdByType\": \"ManagedIdentity\",\n", - " \"lastModifiedAt\": \"2023-08-29T18:09:44.451002+00:00\",\n", + " \"lastModifiedAt\": \"2023-09-21T19:29:08.170175+00:00\",\n", " \"lastModifiedBy\": \"fc4f4a6b-4041-4b1c-8249-854d68edcf62\",\n", " \"lastModifiedByType\": \"ManagedIdentity\"\n", " },\n", " \"type\": \"Microsoft.MarketplaceOrdering/offertypes\"\n", "}\n", - "\u001b[32mCommand ran in 6.000 seconds (init: 0.143, invoke: 5.856)\u001b[0m\n" + "\u001b[32mCommand ran in 7.309 seconds (init: 0.149, invoke: 7.160)\u001b[0m\n" ] } ], @@ -399,6 +399,7 @@ { "cell_type": "markdown", "metadata": { + "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ @@ -454,7 +455,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, "source": [ "#### d. Write packer configuration to a configuration file\n", "We now need to provide `packer` with a build file with platform related and cloud-init configurations. `packer` will use this to create the customized VM. \n", @@ -696,7 +699,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -708,8 +711,8 @@ "Network interface ready\n", "Using Marketplace VM image with a Plan\n", "Creating VM\n", - "Created VM dask-400ac9c4-scheduler\n", - "Waiting for scheduler to run at 20.3.228.17:8786\n", + "Created VM dask-6792b68c-scheduler\n", + "Waiting for scheduler to run at 20.59.13.142:8786\n", "Scheduler is running\n" ] }, @@ -733,10 +736,10 @@ "Network interface ready\n", "Using Marketplace VM image with a Plan\n", "Creating VM\n", - "Created VM dask-400ac9c4-worker-0cd31592\n", - "Created VM dask-400ac9c4-worker-1838a39d\n", - "CPU times: user 1.73 s, sys: 322 ms, total: 2.05 s\n", - "Wall time: 7min 8s\n" + "Created VM dask-6792b68c-worker-ff4082b3\n", + "Created VM dask-6792b68c-worker-b20eb368\n", + "CPU times: user 1.36 s, sys: 304 ms, total: 1.66 s\n", + "Wall time: 7min 24s\n" ] } ], @@ -763,7 +766,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -773,7 +776,7 @@ "
\n", "
\n", "

Client

\n", - "

Client-33fe62ae-469a-11ee-a5fc-80e82cd32958

\n", + "

Client-aa78dad8-58c9-11ee-b15c-80e82cd32958

\n", " \n", "\n", " \n", @@ -786,7 +789,7 @@ " \n", " \n", " \n", " \n", " \n", @@ -804,11 +807,11 @@ " \n", "
\n", "

AzureVMCluster

\n", - "

10225ce5

\n", + "

d597a65d

\n", "
\n", - " Dashboard: http://20.3.228.17:8787/status\n", + " Dashboard: http://20.59.13.142:8787/status\n", "
\n", " \n", " \n", "
\n", - " Dashboard: http://20.3.228.17:8787/status\n", + " Dashboard: http://20.59.13.142:8787/status\n", " \n", " Workers: 4\n", @@ -835,11 +838,11 @@ "
\n", "
\n", "

Scheduler

\n", - "

Scheduler-6aaec2fe-6997-48c2-89d8-faadcb29bdba

\n", + "

Scheduler-83fa74d7-cddf-4b6a-839b-4b9dabfce39d

\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", - " Comm: tls://10.5.0.5:8786\n", + " Comm: tls://10.5.0.14:8786\n", " \n", " Workers: 4\n", @@ -847,7 +850,7 @@ "
\n", - " Dashboard: http://10.5.0.5:8787/status\n", + " Dashboard: http://10.5.0.14:8787/status\n", " \n", " Total threads: 4\n", @@ -855,7 +858,7 @@ "
\n", - " Started: 15 minutes ago\n", + " Started: 2 hours ago\n", " \n", " Total memory: 440.42 GiB\n", @@ -876,12 +879,12 @@ "
\n", "
\n", " \n", - "

Worker: dask-400ac9c4-worker-0cd31592-0

\n", + "

Worker: dask-6792b68c-worker-b20eb368-0

\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", @@ -930,12 +933,12 @@ "
\n", "
\n", " \n", - "

Worker: dask-400ac9c4-worker-0cd31592-1

\n", + "

Worker: dask-6792b68c-worker-b20eb368-1

\n", "
\n", "
\n", - " Comm: tls://10.5.0.7:36717\n", + " Comm: tls://10.5.0.15:41455\n", " \n", " Total threads: 1\n", @@ -889,7 +892,7 @@ "
\n", - " Dashboard: http://10.5.0.7:32773/status\n", + " Dashboard: http://10.5.0.15:44363/status\n", " \n", " Memory: 110.11 GiB\n", @@ -897,13 +900,13 @@ "
\n", - " Nanny: tls://10.5.0.7:43275\n", + " Nanny: tls://10.5.0.15:46303\n", "
\n", - " Local directory: /tmp/dask-scratch-space/worker-qwqxw8uf\n", + " Local directory: /tmp/dask-scratch-space/worker-lno8ooka\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", @@ -984,12 +987,12 @@ "
\n", "
\n", " \n", - "

Worker: dask-400ac9c4-worker-1838a39d-0

\n", + "

Worker: dask-6792b68c-worker-ff4082b3-0

\n", "
\n", "
\n", - " Comm: tls://10.5.0.7:33055\n", + " Comm: tls://10.5.0.15:41227\n", " \n", " Total threads: 1\n", @@ -943,7 +946,7 @@ "
\n", - " Dashboard: http://10.5.0.7:34773/status\n", + " Dashboard: http://10.5.0.15:35713/status\n", " \n", " Memory: 110.11 GiB\n", @@ -951,13 +954,13 @@ "
\n", - " Nanny: tls://10.5.0.7:40845\n", + " Nanny: tls://10.5.0.15:44833\n", "
\n", - " Local directory: /tmp/dask-scratch-space/worker-vr1_9itb\n", + " Local directory: /tmp/dask-scratch-space/worker-u5334h1g\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", @@ -1038,12 +1041,12 @@ "
\n", "
\n", " \n", - "

Worker: dask-400ac9c4-worker-1838a39d-1

\n", + "

Worker: dask-6792b68c-worker-ff4082b3-1

\n", "
\n", "
\n", - " Comm: tls://10.5.0.6:38307\n", + " Comm: tls://10.5.0.5:46495\n", " \n", " Total threads: 1\n", @@ -997,7 +1000,7 @@ "
\n", - " Dashboard: http://10.5.0.6:37659/status\n", + " Dashboard: http://10.5.0.5:42275/status\n", " \n", " Memory: 110.11 GiB\n", @@ -1005,13 +1008,13 @@ "
\n", - " Nanny: tls://10.5.0.6:43501\n", + " Nanny: tls://10.5.0.5:41239\n", "
\n", - " Local directory: /tmp/dask-scratch-space/worker-v7juc99g\n", + " Local directory: /tmp/dask-scratch-space/worker-cl96gwc0\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", @@ -1101,333 +1104,58 @@ "" ], "text/plain": [ - "" + "" ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "client = Client(cluster)\n", - "client" + "client = Client(cluster)" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 23, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'a69bd546-cbb2-47bb-b7d3-28b196a4'" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# client.cluster.workers[0].admin_password" + ] + }, + { + "cell_type": "code", + "execution_count": 42, "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "
\n", - "
\n", - "
\n", - "

AzureVMCluster

\n", - "

10225ce5

\n", - "
\n", - " Comm: tls://10.5.0.6:40869\n", + " Comm: tls://10.5.0.5:33017\n", " \n", " Total threads: 1\n", @@ -1051,7 +1054,7 @@ "
\n", - " Dashboard: http://10.5.0.6:45593/status\n", + " Dashboard: http://10.5.0.5:37589/status\n", " \n", " Memory: 110.11 GiB\n", @@ -1059,13 +1062,13 @@ "
\n", - " Nanny: tls://10.5.0.6:36861\n", + " Nanny: tls://10.5.0.5:41029\n", "
\n", - " Local directory: /tmp/dask-scratch-space/worker-ql9rg3bd\n", + " Local directory: /tmp/dask-scratch-space/worker-xr62hpnn\n", "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " Dashboard: http://20.3.228.17:8787/status\n", - " \n", - " Workers: 4\n", - "
\n", - " Total threads: 4\n", - " \n", - " Total memory: 440.42 GiB\n", - "
\n", - "\n", - "
\n", - " \n", - "

Scheduler Info

\n", - "
\n", - "\n", - "
\n", - "
\n", - "
\n", - "
\n", - "

Scheduler

\n", - "

Scheduler-6aaec2fe-6997-48c2-89d8-faadcb29bdba

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " Comm: tls://10.5.0.5:8786\n", - " \n", - " Workers: 4\n", - "
\n", - " Dashboard: http://10.5.0.5:8787/status\n", - " \n", - " Total threads: 4\n", - "
\n", - " Started: 15 minutes ago\n", - " \n", - " Total memory: 440.42 GiB\n", - "
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "

Workers

\n", - "
\n", - "\n", - " \n", - "
\n", - "
\n", - "
\n", - "
\n", - " \n", - "

Worker: dask-400ac9c4-worker-0cd31592-0

\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - "\n", - "
\n", - " Comm: tls://10.5.0.7:36717\n", - " \n", - " Total threads: 1\n", - "
\n", - " Dashboard: http://10.5.0.7:32773/status\n", - " \n", - " Memory: 110.11 GiB\n", - "
\n", - " Nanny: tls://10.5.0.7:43275\n", - "
\n", - " Local directory: /tmp/dask-scratch-space/worker-qwqxw8uf\n", - "
\n", - " GPU: Tesla V100-PCIE-16GB\n", - " \n", - " GPU memory: 16.00 GiB\n", - "
\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "
\n", - "
\n", - " \n", - "

Worker: dask-400ac9c4-worker-0cd31592-1

\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - "\n", - "
\n", - " Comm: tls://10.5.0.7:33055\n", - " \n", - " Total threads: 1\n", - "
\n", - " Dashboard: http://10.5.0.7:34773/status\n", - " \n", - " Memory: 110.11 GiB\n", - "
\n", - " Nanny: tls://10.5.0.7:40845\n", - "
\n", - " Local directory: /tmp/dask-scratch-space/worker-vr1_9itb\n", - "
\n", - " GPU: Tesla V100-PCIE-16GB\n", - " \n", - " GPU memory: 16.00 GiB\n", - "
\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "
\n", - "
\n", - " \n", - "

Worker: dask-400ac9c4-worker-1838a39d-0

\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - "\n", - "
\n", - " Comm: tls://10.5.0.6:38307\n", - " \n", - " Total threads: 1\n", - "
\n", - " Dashboard: http://10.5.0.6:37659/status\n", - " \n", - " Memory: 110.11 GiB\n", - "
\n", - " Nanny: tls://10.5.0.6:43501\n", - "
\n", - " Local directory: /tmp/dask-scratch-space/worker-v7juc99g\n", - "
\n", - " GPU: Tesla V100-PCIE-16GB\n", - " \n", - " GPU memory: 16.00 GiB\n", - "
\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "
\n", - "
\n", - " \n", - "

Worker: dask-400ac9c4-worker-1838a39d-1

\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - "\n", - "
\n", - " Comm: tls://10.5.0.6:40869\n", - " \n", - " Total threads: 1\n", - "
\n", - " Dashboard: http://10.5.0.6:45593/status\n", - " \n", - " Memory: 110.11 GiB\n", - "
\n", - " Nanny: tls://10.5.0.6:36861\n", - "
\n", - " Local directory: /tmp/dask-scratch-space/worker-ql9rg3bd\n", - "
\n", - " GPU: Tesla V100-PCIE-16GB\n", - " \n", - " GPU memory: 16.00 GiB\n", - "
\n", - "
\n", - "
\n", - "
\n", - " \n", - "\n", - "
\n", - "
\n", - "\n", - "
\n", - "
\n", - "" - ], "text/plain": [ - "AzureVMCluster(10225ce5, 'tls://20.3.228.17:8786', workers=4, threads=4, memory=440.42 GiB)" + "'1f38ed84-5435-46b5-91e3-10690a60'" ] }, + "execution_count": 42, "metadata": {}, - "output_type": "display_data" + "output_type": "execute_result" } ], "source": [ - "client.cluster" + "# client.cluster.scheduler.admin_password" ] }, { @@ -1450,15 +1178,15 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3.52 ms, sys: 1.8 ms, total: 5.31 ms\n", - "Wall time: 29.1 ms\n" + "CPU times: user 4.63 ms, sys: 0 ns, total: 4.63 ms\n", + "Wall time: 28.5 ms\n" ] } ], @@ -1476,192 +1204,192 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'address': 'tls://10.5.0.5:8786',\n", - " 'id': 'Scheduler-6aaec2fe-6997-48c2-89d8-faadcb29bdba',\n", + "{'address': 'tls://10.5.0.14:8786',\n", + " 'id': 'Scheduler-83fa74d7-cddf-4b6a-839b-4b9dabfce39d',\n", " 'services': {'dashboard': 8787},\n", - " 'started': 1693332941.4093616,\n", + " 'started': 1695324933.1167624,\n", " 'type': 'Scheduler',\n", - " 'workers': {'tls://10.5.0.6:38307': {'gpu': {'memory-total': 17179869184,\n", + " 'workers': {'tls://10.5.0.15:41227': {'gpu': {'memory-total': 17179869184,\n", + " 'name': 'Tesla V100-PCIE-16GB'},\n", + " 'host': '10.5.0.15',\n", + " 'id': 'dask-6792b68c-worker-b20eb368-1',\n", + " 'last_seen': 1695333445.6055431,\n", + " 'local_directory': '/tmp/dask-scratch-space/worker-u5334h1g',\n", + " 'memory_limit': 118225672192,\n", + " 'metrics': {'bandwidth': {'total': 100000000,\n", + " 'types': {},\n", + " 'workers': {}},\n", + " 'cpu': 2.0,\n", + " 'digests_total_since_heartbeat': {'latency': 0.003626108169555664,\n", + " 'tick-duration': 0.5000865459442139},\n", + " 'event_loop_interval': 0.020003232955932617,\n", + " 'gpu': {'memory-used': 598867968,\n", + " 'utilization': 0},\n", + " 'gpu_memory_used': 598867968,\n", + " 'gpu_utilization': 0,\n", + " 'host_disk_io': {'read_bps': 0.0,\n", + " 'write_bps': 0.0},\n", + " 'host_net_io': {'read_bps': 1090.916619434013,\n", + " 'write_bps': 3772.2538049293344},\n", + " 'managed_bytes': 0,\n", + " 'memory': 630988800,\n", + " 'num_fds': 86,\n", + " 'rmm': {'rmm-total': 0,\n", + " 'rmm-used': 0},\n", + " 'spilled_bytes': {'disk': 0,\n", + " 'memory': 0},\n", + " 'task_counts': {},\n", + " 'time': 1695333445.1217656,\n", + " 'transfer': {'incoming_bytes': 0,\n", + " 'incoming_count': 0,\n", + " 'incoming_count_total': 0,\n", + " 'outgoing_bytes': 0,\n", + " 'outgoing_count': 0,\n", + " 'outgoing_count_total': 0}},\n", + " 'name': 'dask-6792b68c-worker-b20eb368-1',\n", + " 'nanny': 'tls://10.5.0.15:44833',\n", + " 'nthreads': 1,\n", + " 'resources': {},\n", + " 'services': {'dashboard': 35713},\n", + " 'status': 'running',\n", + " 'type': 'Worker'},\n", + " 'tls://10.5.0.15:41455': {'gpu': {'memory-total': 17179869184,\n", + " 'name': 'Tesla V100-PCIE-16GB'},\n", + " 'host': '10.5.0.15',\n", + " 'id': 'dask-6792b68c-worker-b20eb368-0',\n", + " 'last_seen': 1695333445.604299,\n", + " 'local_directory': '/tmp/dask-scratch-space/worker-lno8ooka',\n", + " 'memory_limit': 118225672192,\n", + " 'metrics': {'bandwidth': {'total': 100000000,\n", + " 'types': {},\n", + " 'workers': {}},\n", + " 'cpu': 2.0,\n", + " 'digests_total_since_heartbeat': {'latency': 0.004132270812988281,\n", + " 'tick-duration': 0.5016992092132568},\n", + " 'event_loop_interval': 0.020002989768981932,\n", + " 'gpu': {'memory-used': 598867968,\n", + " 'utilization': 0},\n", + " 'gpu_memory_used': 598867968,\n", + " 'gpu_utilization': 0,\n", + " 'host_disk_io': {'read_bps': 0.0,\n", + " 'write_bps': 0.0},\n", + " 'host_net_io': {'read_bps': 1090.9530211103709,\n", + " 'write_bps': 3772.379677392638},\n", + " 'managed_bytes': 0,\n", + " 'memory': 633405440,\n", + " 'num_fds': 86,\n", + " 'rmm': {'rmm-total': 0,\n", + " 'rmm-used': 0},\n", + " 'spilled_bytes': {'disk': 0,\n", + " 'memory': 0},\n", + " 'task_counts': {},\n", + " 'time': 1695333445.1203494,\n", + " 'transfer': {'incoming_bytes': 0,\n", + " 'incoming_count': 0,\n", + " 'incoming_count_total': 0,\n", + " 'outgoing_bytes': 0,\n", + " 'outgoing_count': 0,\n", + " 'outgoing_count_total': 0}},\n", + " 'name': 'dask-6792b68c-worker-b20eb368-0',\n", + " 'nanny': 'tls://10.5.0.15:46303',\n", + " 'nthreads': 1,\n", + " 'resources': {},\n", + " 'services': {'dashboard': 44363},\n", + " 'status': 'running',\n", + " 'type': 'Worker'},\n", + " 'tls://10.5.0.5:33017': {'gpu': {'memory-total': 17179869184,\n", " 'name': 'Tesla V100-PCIE-16GB'},\n", - " 'host': '10.5.0.6',\n", - " 'id': 'dask-400ac9c4-worker-1838a39d-0',\n", - " 'last_seen': 1693334115.716486,\n", - " 'local_directory': '/tmp/dask-scratch-space/worker-v7juc99g',\n", - " 'memory_limit': 118225672192,\n", - " 'metrics': {'bandwidth': {'total': 100000000,\n", - " 'types': {},\n", - " 'workers': {}},\n", - " 'cpu': 2.0,\n", - " 'digests_total_since_heartbeat': {'latency': 0.00422215461730957,\n", - " 'tick-duration': 0.5002996921539307},\n", - " 'event_loop_interval': 0.02001443862915039,\n", - " 'gpu': {'memory-used': 598867968,\n", - " 'utilization': 0},\n", - " 'gpu_memory_used': 598867968,\n", - " 'gpu_utilization': 0,\n", - " 'host_disk_io': {'read_bps': 0.0,\n", - " 'write_bps': 4285579.749180802},\n", - " 'host_net_io': {'read_bps': 7896.1441979567935,\n", - " 'write_bps': 13579.687564157768},\n", - " 'managed_bytes': 0,\n", - " 'memory': 625389568,\n", - " 'num_fds': 86,\n", - " 'rmm': {'rmm-total': 0,\n", - " 'rmm-used': 0},\n", - " 'spilled_bytes': {'disk': 0,\n", - " 'memory': 0},\n", - " 'task_counts': {},\n", - " 'time': 1693334115.2101967,\n", - " 'transfer': {'incoming_bytes': 0,\n", - " 'incoming_count': 0,\n", - " 'incoming_count_total': 0,\n", - " 'outgoing_bytes': 0,\n", - " 'outgoing_count': 0,\n", - " 'outgoing_count_total': 0}},\n", - " 'name': 'dask-400ac9c4-worker-1838a39d-0',\n", - " 'nanny': 'tls://10.5.0.6:43501',\n", - " 'nthreads': 1,\n", - " 'resources': {},\n", - " 'services': {'dashboard': 37659},\n", - " 'status': 'running',\n", - " 'type': 'Worker'},\n", - " 'tls://10.5.0.6:40869': {'gpu': {'memory-total': 17179869184,\n", - " 'name': 'Tesla V100-PCIE-16GB'},\n", - " 'host': '10.5.0.6',\n", - " 'id': 'dask-400ac9c4-worker-1838a39d-1',\n", - " 'last_seen': 1693334115.7172642,\n", - " 'local_directory': '/tmp/dask-scratch-space/worker-ql9rg3bd',\n", - " 'memory_limit': 118225672192,\n", - " 'metrics': {'bandwidth': {'total': 100000000,\n", - " 'types': {},\n", - " 'workers': {}},\n", - " 'cpu': 0.0,\n", - " 'digests_total_since_heartbeat': {'latency': 0.0043697357177734375,\n", - " 'tick-duration': 0.49964427947998047},\n", - " 'event_loop_interval': 0.020003252029418946,\n", - " 'gpu': {'memory-used': 598867968,\n", - " 'utilization': 0},\n", - " 'gpu_memory_used': 598867968,\n", - " 'gpu_utilization': 0,\n", - " 'host_disk_io': {'read_bps': 0.0,\n", - " 'write_bps': 4282769.5748925805},\n", - " 'host_net_io': {'read_bps': 7890.966475758198,\n", - " 'write_bps': 13570.782983898314},\n", - " 'managed_bytes': 0,\n", - " 'memory': 622383104,\n", - " 'num_fds': 86,\n", - " 'rmm': {'rmm-total': 0,\n", - " 'rmm-used': 0},\n", - " 'spilled_bytes': {'disk': 0,\n", - " 'memory': 0},\n", - " 'task_counts': {},\n", - " 'time': 1693334115.2123637,\n", - " 'transfer': {'incoming_bytes': 0,\n", - " 'incoming_count': 0,\n", - " 'incoming_count_total': 0,\n", - " 'outgoing_bytes': 0,\n", - " 'outgoing_count': 0,\n", - " 'outgoing_count_total': 0}},\n", - " 'name': 'dask-400ac9c4-worker-1838a39d-1',\n", - " 'nanny': 'tls://10.5.0.6:36861',\n", - " 'nthreads': 1,\n", - " 'resources': {},\n", - " 'services': {'dashboard': 45593},\n", - " 'status': 'running',\n", - " 'type': 'Worker'},\n", - " 'tls://10.5.0.7:33055': {'gpu': {'memory-total': 17179869184,\n", - " 'name': 'Tesla V100-PCIE-16GB'},\n", - " 'host': '10.5.0.7',\n", - " 'id': 'dask-400ac9c4-worker-0cd31592-1',\n", - " 'last_seen': 1693334115.5610104,\n", - " 'local_directory': '/tmp/dask-scratch-space/worker-vr1_9itb',\n", + " 'host': '10.5.0.5',\n", + " 'id': 'dask-6792b68c-worker-ff4082b3-1',\n", + " 'last_seen': 1695333445.6695802,\n", + " 'local_directory': '/tmp/dask-scratch-space/worker-xr62hpnn',\n", " 'memory_limit': 118225670144,\n", " 'metrics': {'bandwidth': {'total': 100000000,\n", " 'types': {},\n", " 'workers': {}},\n", " 'cpu': 2.0,\n", - " 'digests_total_since_heartbeat': {'latency': 0.003664255142211914,\n", - " 'tick-duration': 0.5014660358428955},\n", - " 'event_loop_interval': 0.019995880126953126,\n", + " 'digests_total_since_heartbeat': {'latency': 0.004124879837036133,\n", + " 'tick-duration': 0.5008127689361572},\n", + " 'event_loop_interval': 0.019995789527893066,\n", " 'gpu': {'memory-used': 598867968,\n", " 'utilization': 0},\n", " 'gpu_memory_used': 598867968,\n", " 'gpu_utilization': 0,\n", " 'host_disk_io': {'read_bps': 0.0,\n", - " 'write_bps': 8389245.85113992},\n", - " 'host_net_io': {'read_bps': 612.0465351221121,\n", - " 'write_bps': 4902.372737203585},\n", + " 'write_bps': 0.0},\n", + " 'host_net_io': {'read_bps': 612.1778082669234,\n", + " 'write_bps': 3340.970391522098},\n", " 'managed_bytes': 0,\n", - " 'memory': 624271360,\n", + " 'memory': 636633088,\n", " 'num_fds': 86,\n", " 'rmm': {'rmm-total': 0,\n", " 'rmm-used': 0},\n", " 'spilled_bytes': {'disk': 0,\n", " 'memory': 0},\n", " 'task_counts': {},\n", - " 'time': 1693334115.0549479,\n", + " 'time': 1695333445.160522,\n", " 'transfer': {'incoming_bytes': 0,\n", " 'incoming_count': 0,\n", " 'incoming_count_total': 0,\n", " 'outgoing_bytes': 0,\n", " 'outgoing_count': 0,\n", " 'outgoing_count_total': 0}},\n", - " 'name': 'dask-400ac9c4-worker-0cd31592-1',\n", - " 'nanny': 'tls://10.5.0.7:40845',\n", + " 'name': 'dask-6792b68c-worker-ff4082b3-1',\n", + " 'nanny': 'tls://10.5.0.5:41029',\n", " 'nthreads': 1,\n", " 'resources': {},\n", - " 'services': {'dashboard': 34773},\n", + " 'services': {'dashboard': 37589},\n", " 'status': 'running',\n", " 'type': 'Worker'},\n", - " 'tls://10.5.0.7:36717': {'gpu': {'memory-total': 17179869184,\n", + " 'tls://10.5.0.5:46495': {'gpu': {'memory-total': 17179869184,\n", " 'name': 'Tesla V100-PCIE-16GB'},\n", - " 'host': '10.5.0.7',\n", - " 'id': 'dask-400ac9c4-worker-0cd31592-0',\n", - " 'last_seen': 1693334115.5593781,\n", - " 'local_directory': '/tmp/dask-scratch-space/worker-qwqxw8uf',\n", + " 'host': '10.5.0.5',\n", + " 'id': 'dask-6792b68c-worker-ff4082b3-0',\n", + " 'last_seen': 1695333445.7062016,\n", + " 'local_directory': '/tmp/dask-scratch-space/worker-cl96gwc0',\n", " 'memory_limit': 118225670144,\n", " 'metrics': {'bandwidth': {'total': 100000000,\n", " 'types': {},\n", " 'workers': {}},\n", - " 'cpu': 0.0,\n", - " 'digests_total_since_heartbeat': {'latency': 0.004190206527709961,\n", - " 'tick-duration': 0.5009698867797852},\n", - " 'event_loop_interval': 0.01999223232269287,\n", + " 'cpu': 2.0,\n", + " 'digests_total_since_heartbeat': {'latency': 0.003979921340942383,\n", + " 'tick-duration': 0.49993157386779785},\n", + " 'event_loop_interval': 0.02000217914581299,\n", " 'gpu': {'memory-used': 598867968,\n", " 'utilization': 0},\n", " 'gpu_memory_used': 598867968,\n", " 'gpu_utilization': 0,\n", " 'host_disk_io': {'read_bps': 0.0,\n", - " 'write_bps': 8401951.8118285},\n", - " 'host_net_io': {'read_bps': 612.9735122727205,\n", - " 'write_bps': 3345.312959135436},\n", + " 'write_bps': 0.0},\n", + " 'host_net_io': {'read_bps': 612.4327413008988,\n", + " 'write_bps': 3342.3616927205912},\n", " 'managed_bytes': 0,\n", - " 'memory': 625373184,\n", + " 'memory': 637931520,\n", " 'num_fds': 86,\n", " 'rmm': {'rmm-total': 0,\n", " 'rmm-used': 0},\n", " 'spilled_bytes': {'disk': 0,\n", " 'memory': 0},\n", " 'task_counts': {},\n", - " 'time': 1693334115.0521584,\n", + " 'time': 1695333445.1984127,\n", " 'transfer': {'incoming_bytes': 0,\n", " 'incoming_count': 0,\n", " 'incoming_count_total': 0,\n", " 'outgoing_bytes': 0,\n", " 'outgoing_count': 0,\n", " 'outgoing_count_total': 0}},\n", - " 'name': 'dask-400ac9c4-worker-0cd31592-0',\n", - " 'nanny': 'tls://10.5.0.7:43275',\n", + " 'name': 'dask-6792b68c-worker-ff4082b3-0',\n", + " 'nanny': 'tls://10.5.0.5:41239',\n", " 'nthreads': 1,\n", " 'resources': {},\n", - " 'services': {'dashboard': 32773},\n", + " 'services': {'dashboard': 42275},\n", " 'status': 'running',\n", " 'type': 'Worker'}}}\n" ] @@ -1704,26 +1432,24 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'tls://10.5.0.6:38307': {'status': 'OK'},\n", - " 'tls://10.5.0.6:40869': {'status': 'OK'},\n", - " 'tls://10.5.0.7:33055': {'status': 'OK'},\n", - " 'tls://10.5.0.7:36717': {'status': 'OK'}}" + "{'tls://10.5.0.15:41227': {'status': 'OK'},\n", + " 'tls://10.5.0.15:41455': {'status': 'OK'},\n", + " 'tls://10.5.0.5:33017': {'status': 'OK'},\n", + " 'tls://10.5.0.5:46495': {'status': 'OK'}}" ] }, - "execution_count": 17, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from dask.distributed import PipInstall\n", - "\n", "client.register_worker_plugin(PipInstall(packages=[\"adlfs\"]))" ] }, @@ -1764,7 +1490,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -1866,7 +1592,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 45, "metadata": { "editable": true, "slideshow": { @@ -2015,7 +1741,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, "source": [ "### Step 3.c: Get the split data and persist across workers " ] @@ -2029,7 +1757,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 46, "metadata": {}, "outputs": [ { @@ -2041,40 +1769,10 @@ ] }, { - "ename": "RuntimeError", - "evalue": "An error occurred while calling the read_parquet method registered to the pandas backend.\nOriginal Message: Error during deserialization of the task graph. This frequently occurs if the Scheduler and Client have different environments. For more information, see https://docs.dask.org/en/stable/deployment-considerations.html#consistent-software-environments\n", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/distributed/scheduler.py:4346\u001b[0m, in \u001b[0;36mupdate_graph\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/distributed/protocol/serialize.py:432\u001b[0m, in \u001b[0;36mdeserialize\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/distributed/protocol/serialize.py:98\u001b[0m, in \u001b[0;36mpickle_loads\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/distributed/protocol/pickle.py:94\u001b[0m, in \u001b[0;36mloads\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'adlfs'", - "\nThe above exception was the direct cause of the following exception:\n", - "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", - "File \u001b[0;32m~/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/dask/backends.py:136\u001b[0m, in \u001b[0;36mCreationDispatch.register_inplace..decorator..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 135\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 136\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 137\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n", - "File \u001b[0;32m~/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/dask/dataframe/io/parquet/core.py:543\u001b[0m, in \u001b[0;36mread_parquet\u001b[0;34m(path, columns, filters, categories, index, storage_options, engine, use_nullable_dtypes, dtype_backend, calculate_divisions, ignore_metadata_file, metadata_task_size, split_row_groups, blocksize, aggregate_files, parquet_file_extension, filesystem, **kwargs)\u001b[0m\n\u001b[1;32m 541\u001b[0m blocksize \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m--> 543\u001b[0m read_metadata_result \u001b[38;5;241m=\u001b[39m \u001b[43mengine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_metadata\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 544\u001b[0m \u001b[43m \u001b[49m\u001b[43mfs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 545\u001b[0m \u001b[43m \u001b[49m\u001b[43mpaths\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 546\u001b[0m \u001b[43m \u001b[49m\u001b[43mcategories\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategories\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 547\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 548\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_nullable_dtypes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_nullable_dtypes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 549\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtype_backend\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype_backend\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 550\u001b[0m \u001b[43m \u001b[49m\u001b[43mgather_statistics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcalculate_divisions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 551\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 552\u001b[0m \u001b[43m \u001b[49m\u001b[43msplit_row_groups\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msplit_row_groups\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 553\u001b[0m \u001b[43m \u001b[49m\u001b[43mblocksize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mblocksize\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 554\u001b[0m \u001b[43m \u001b[49m\u001b[43maggregate_files\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maggregate_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 555\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_metadata_file\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_metadata_file\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 556\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetadata_task_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetadata_task_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 557\u001b[0m \u001b[43m \u001b[49m\u001b[43mparquet_file_extension\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparquet_file_extension\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 558\u001b[0m \u001b[43m \u001b[49m\u001b[43mdataset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdataset_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 559\u001b[0m \u001b[43m \u001b[49m\u001b[43mread\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mread_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 560\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mother_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 561\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 563\u001b[0m \u001b[38;5;66;03m# In the future, we may want to give the engine the\u001b[39;00m\n\u001b[1;32m 564\u001b[0m \u001b[38;5;66;03m# option to return a dedicated element for `common_kwargs`.\u001b[39;00m\n\u001b[1;32m 565\u001b[0m \u001b[38;5;66;03m# However, to avoid breaking the API, we just embed this\u001b[39;00m\n\u001b[1;32m 566\u001b[0m \u001b[38;5;66;03m# data in the first element of `parts` for now.\u001b[39;00m\n\u001b[1;32m 567\u001b[0m \u001b[38;5;66;03m# The logic below is inteded to handle backward and forward\u001b[39;00m\n\u001b[1;32m 568\u001b[0m \u001b[38;5;66;03m# compatibility with a user-defined engine.\u001b[39;00m\n", - "File \u001b[0;32m~/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/dask/dataframe/io/parquet/arrow.py:554\u001b[0m, in \u001b[0;36mArrowDatasetEngine.read_metadata\u001b[0;34m(cls, fs, paths, categories, index, use_nullable_dtypes, dtype_backend, gather_statistics, filters, split_row_groups, blocksize, aggregate_files, ignore_metadata_file, metadata_task_size, parquet_file_extension, **kwargs)\u001b[0m\n\u001b[1;32m 553\u001b[0m \u001b[38;5;66;03m# Stage 3: Generate parts and stats\u001b[39;00m\n\u001b[0;32m--> 554\u001b[0m parts, stats, common_kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_construct_collection_plan\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset_info\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 556\u001b[0m \u001b[38;5;66;03m# Add `common_kwargs` and `aggregation_depth` to the first\u001b[39;00m\n\u001b[1;32m 557\u001b[0m \u001b[38;5;66;03m# element of `parts`. We can return as a separate element\u001b[39;00m\n\u001b[1;32m 558\u001b[0m \u001b[38;5;66;03m# in the future, but should avoid breaking the API for now.\u001b[39;00m\n", - "File \u001b[0;32m~/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/dask/dataframe/io/parquet/arrow.py:1483\u001b[0m, in \u001b[0;36mArrowDatasetEngine._construct_collection_plan\u001b[0;34m(cls, dataset_info)\u001b[0m\n\u001b[1;32m 1482\u001b[0m gather_parts_dsk[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfinal-\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m name] \u001b[38;5;241m=\u001b[39m (_combine_parts, finalize_list)\n\u001b[0;32m-> 1483\u001b[0m parts, stats \u001b[38;5;241m=\u001b[39m \u001b[43mDelayed\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfinal-\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgather_parts_dsk\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1485\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parts, stats, common_kwargs\n", - "File \u001b[0;32m~/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/dask/base.py:381\u001b[0m, in \u001b[0;36mDaskMethodsMixin.compute\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 358\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Compute this dask collection\u001b[39;00m\n\u001b[1;32m 359\u001b[0m \n\u001b[1;32m 360\u001b[0m \u001b[38;5;124;03mThis turns a lazy Dask collection into its in-memory equivalent.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 379\u001b[0m \u001b[38;5;124;03mdask.compute\u001b[39;00m\n\u001b[1;32m 380\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m--> 381\u001b[0m (result,) \u001b[38;5;241m=\u001b[39m \u001b[43mcompute\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtraverse\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 382\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n", - "File \u001b[0;32m~/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/dask/base.py:666\u001b[0m, in \u001b[0;36mcompute\u001b[0;34m(traverse, optimize_graph, scheduler, get, *args, **kwargs)\u001b[0m\n\u001b[1;32m 664\u001b[0m postcomputes\u001b[38;5;241m.\u001b[39mappend(x\u001b[38;5;241m.\u001b[39m__dask_postcompute__())\n\u001b[0;32m--> 666\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[43mschedule\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdsk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkeys\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 667\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m repack([f(r, \u001b[38;5;241m*\u001b[39ma) \u001b[38;5;28;01mfor\u001b[39;00m r, (f, a) \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(results, postcomputes)])\n", - "File \u001b[0;32m~/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/distributed/client.py:3259\u001b[0m, in \u001b[0;36mClient.get\u001b[0;34m(self, dsk, keys, workers, allow_other_workers, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)\u001b[0m\n\u001b[1;32m 3258\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3259\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgather\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpacked\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43masynchronous\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43masynchronous\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdirect\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdirect\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3260\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n", - "File \u001b[0;32m~/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/distributed/client.py:2384\u001b[0m, in \u001b[0;36mClient.gather\u001b[0;34m(self, futures, errors, direct, asynchronous)\u001b[0m\n\u001b[1;32m 2383\u001b[0m local_worker \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 2384\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msync\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2385\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_gather\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2386\u001b[0m \u001b[43m \u001b[49m\u001b[43mfutures\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2387\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2388\u001b[0m \u001b[43m \u001b[49m\u001b[43mdirect\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdirect\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2389\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocal_worker\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlocal_worker\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2390\u001b[0m \u001b[43m \u001b[49m\u001b[43masynchronous\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43masynchronous\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2391\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/distributed/utils.py:359\u001b[0m, in \u001b[0;36mSyncMethodMixin.sync\u001b[0;34m(self, func, asynchronous, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[1;32m 358\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 359\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43msync\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 360\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mloop\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcallback_timeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcallback_timeout\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[1;32m 361\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/distributed/utils.py:426\u001b[0m, in \u001b[0;36msync\u001b[0;34m(loop, func, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[1;32m 425\u001b[0m typ, exc, tb \u001b[38;5;241m=\u001b[39m error\n\u001b[0;32m--> 426\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exc\u001b[38;5;241m.\u001b[39mwith_traceback(tb)\n\u001b[1;32m 427\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", - "File \u001b[0;32m~/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/distributed/utils.py:399\u001b[0m, in \u001b[0;36msync..f\u001b[0;34m()\u001b[0m\n\u001b[1;32m 398\u001b[0m future \u001b[38;5;241m=\u001b[39m asyncio\u001b[38;5;241m.\u001b[39mensure_future(future)\n\u001b[0;32m--> 399\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01myield\u001b[39;00m future\n\u001b[1;32m 400\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n", - "File \u001b[0;32m~/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/tornado/gen.py:767\u001b[0m, in \u001b[0;36mRunner.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 766\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 767\u001b[0m value \u001b[38;5;241m=\u001b[39m \u001b[43mfuture\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 768\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 769\u001b[0m \u001b[38;5;66;03m# Save the exception for later. It's important that\u001b[39;00m\n\u001b[1;32m 770\u001b[0m \u001b[38;5;66;03m# gen.throw() not be called inside this try/except block\u001b[39;00m\n\u001b[1;32m 771\u001b[0m \u001b[38;5;66;03m# because that makes sys.exc_info behave unexpectedly.\u001b[39;00m\n", - "File \u001b[0;32m~/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/distributed/client.py:2247\u001b[0m, in \u001b[0;36mClient._gather\u001b[0;34m(self, futures, errors, direct, local_worker)\u001b[0m\n\u001b[1;32m 2246\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 2247\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exception\u001b[38;5;241m.\u001b[39mwith_traceback(traceback)\n\u001b[1;32m 2248\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exc\n", - "\u001b[0;31mRuntimeError\u001b[0m: Error during deserialization of the task graph. This frequently occurs if the Scheduler and Client have different environments. For more information, see https://docs.dask.org/en/stable/deployment-considerations.html#consistent-software-environments\n", - "\nThe above exception was the direct cause of the following exception:\n", - "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[20], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m tic \u001b[38;5;241m=\u001b[39m timer()\n\u001b[0;32m----> 2\u001b[0m X_train, y_train, X_infer, y_infer \u001b[38;5;241m=\u001b[39m \u001b[43mtaxi_data_loader\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[43mclient\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43madlsaccount\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mazureopendatastorage\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43madlspath\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43maz://nyctlc/yellow/puYear=2014/puMonth=1*/*.parquet\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43minfer_frac\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m42\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m 9\u001b[0m toc \u001b[38;5;241m=\u001b[39m timer()\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWall clock time taken for ETL and persisting : \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtoc\u001b[38;5;241m-\u001b[39mtic\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m s\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "Cell \u001b[0;32mIn[19], line 95\u001b[0m, in \u001b[0;36mtaxi_data_loader\u001b[0;34m(client, adlsaccount, adlspath, response_dtype, infer_frac, random_state)\u001b[0m\n\u001b[1;32m 93\u001b[0m response_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfareAmount\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 94\u001b[0m storage_options \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maccount_name\u001b[39m\u001b[38;5;124m\"\u001b[39m: adlsaccount}\n\u001b[0;32m---> 95\u001b[0m taxi_data \u001b[38;5;241m=\u001b[39m \u001b[43mdask_cudf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_parquet\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 96\u001b[0m \u001b[43m \u001b[49m\u001b[43madlspath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 97\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 98\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunksize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m25e6\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 99\u001b[0m \u001b[43m \u001b[49m\u001b[43mnpartitions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mworkers\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 100\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 101\u001b[0m taxi_data \u001b[38;5;241m=\u001b[39m clean(taxi_data, must_haves)\n\u001b[1;32m 102\u001b[0m taxi_data \u001b[38;5;241m=\u001b[39m taxi_data\u001b[38;5;241m.\u001b[39mmap_partitions(add_features)\n", - "File \u001b[0;32m~/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/dask_cudf/io/parquet.py:539\u001b[0m, in \u001b[0;36mread_parquet\u001b[0;34m(path, columns, **kwargs)\u001b[0m\n\u001b[1;32m 536\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mread\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m {}\n\u001b[1;32m 537\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mread\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcheck_file_size\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m check_file_size\n\u001b[0;32m--> 539\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mdd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_parquet\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mengine\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mCudfEngine\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/dask/backends.py:138\u001b[0m, in \u001b[0;36mCreationDispatch.register_inplace..decorator..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 136\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 137\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m--> 138\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(e)(\n\u001b[1;32m 139\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAn error occurred while calling the \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfuncname(func)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 140\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmethod registered to the \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbackend\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m backend.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 141\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOriginal Message: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 142\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n", - "\u001b[0;31mRuntimeError\u001b[0m: An error occurred while calling the read_parquet method registered to the pandas backend.\nOriginal Message: Error during deserialization of the task graph. This frequently occurs if the Scheduler and Client have different environments. For more information, see https://docs.dask.org/en/stable/deployment-considerations.html#consistent-software-environments\n" + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall clock time taken for ETL and persisting : 70.74975417199312 s\n" ] } ], @@ -2093,9 +1791,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 47, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "48817562" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "X_train.shape[0].compute()" ] @@ -2109,9 +1818,152 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 48, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexpassengerCounttripDistancestartLonstartLatrateCodeIdendLonendLatdiffh_distanceday_of_weekis_weekend
3004463004461.04.00-73.98495540.7685431.0-74.00878940.7193301324.05.809536e+006.00.0
1638171638173.01.93-74.00817940.7221981.0-73.99298940.739151840.01.395489e+002.00.0
2369582369585.01.10-73.98759540.7753601.0-73.97649440.785755360.01.394768e+005.00.0
73461734611.00.76-73.99469840.7259291.0-73.99469840.725929180.01.005159e-135.00.0
2944642944641.00.60-73.97434240.7481651.0-73.98253640.750767229.01.395336e+006.00.0
\n", + "
" + ], + "text/plain": [ + " index passengerCount tripDistance startLon startLat \\\n", + "300446 300446 1.0 4.00 -73.984955 40.768543 \n", + "163817 163817 3.0 1.93 -74.008179 40.722198 \n", + "236958 236958 5.0 1.10 -73.987595 40.775360 \n", + "73461 73461 1.0 0.76 -73.994698 40.725929 \n", + "294464 294464 1.0 0.60 -73.974342 40.748165 \n", + "\n", + " rateCodeId endLon endLat diff h_distance day_of_week \\\n", + "300446 1.0 -74.008789 40.719330 1324.0 5.809536e+00 6.0 \n", + "163817 1.0 -73.992989 40.739151 840.0 1.395489e+00 2.0 \n", + "236958 1.0 -73.976494 40.785755 360.0 1.394768e+00 5.0 \n", + "73461 1.0 -73.994698 40.725929 180.0 1.005159e-13 5.0 \n", + "294464 1.0 -73.982536 40.750767 229.0 1.395336e+00 6.0 \n", + "\n", + " is_weekend \n", + "300446 0.0 \n", + "163817 0.0 \n", + "236958 0.0 \n", + "73461 0.0 \n", + "294464 0.0 " + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "X_train.head()" ] @@ -2119,7 +1971,6 @@ { "cell_type": "markdown", "metadata": { - "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ @@ -2136,12 +1987,12 @@ "\n", "In this training example, we will use RMSE as the evaluation metric. It is also worth noting that performing HPO will lead to a set of more optimal hyperparameters.\n", "\n", - "Refer to the notebook [HPO-RAPIDS](./HPO-RAPIDS.ipynb) in this repository for how to perform HPO on Azure." + "Refer to the notebook [HPO-RAPIDS](../rapids-azureml-hpo/notebook.ipynb) in this repository for how to perform HPO on Azure." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ @@ -2179,9 +2030,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 50, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall clock time taken for this cell : 11.563872193990392 s\n" + ] + } + ], "source": [ "data_train = xgb.dask.DaskDMatrix(client, X_train, y_train)\n", "tic = timer()\n", @@ -2193,6 +2052,26 @@ "print(f\"Wall clock time taken for this cell : {toc-tic} s\")" ] }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "xgb_gpu_model" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -2202,7 +2081,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ @@ -2213,7 +2092,6 @@ { "cell_type": "markdown", "metadata": { - "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ @@ -2235,9 +2113,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 52, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "DoneAndNotDoneFutures(done=set(), not_done=set())" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "_y_test = y_infer.compute()\n", "wait(_y_test)" @@ -2245,9 +2134,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 53, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall clock time taken for xgb.dask.predict : 1.6368747379892739 s\n" + ] + } + ], "source": [ "d_test = xgb.dask.DaskDMatrix(client, X_infer)\n", "tic = timer()\n", @@ -2273,9 +2170,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 54, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall clock time taken for inplace inference : 3.98977595299948 s\n" + ] + } + ], "source": [ "tic = timer()\n", "y_pred = xgb.dask.inplace_predict(client, xgb_gpu_model, X_infer)\n", @@ -2287,7 +2192,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 55, "metadata": { "editable": true, "slideshow": { @@ -2295,7 +2200,17 @@ }, "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Calculating MSE\n", + "Workflow Complete - RMSE: 2.2926068\n", + "Wall clock time taken for this cell : 0.8989280160021735 s\n" + ] + } + ], "source": [ "tic = timer()\n", "print(\"Calculating MSE\")\n", @@ -2321,7 +2236,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ @@ -2342,19 +2257,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 68, "metadata": {}, "outputs": [], "source": [ - "workers = client.has_what().keys()\n", - "print(workers)\n", - "n_workers = len(workers)\n", + "n_workers = len(client.scheduler_info()[\"workers\"])\n", "n_partitions = n_workers" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "n_partitions" + ] + }, + { + "cell_type": "code", + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ @@ -2376,39 +2309,53 @@ "\n", "\n", "def workerModelInit(model_file):\n", - " # this function will run in each worker and initialize the worker\n", + " # This function will run in each worker and initialize the worker with the model\n", " import os\n", + " from forest.inference import ForestInference\n", "\n", " worker = get_worker()\n", - " worker.data[\"fil_model\"] = ForestInference.load(\n", - " filename=os.path.join(worker.local_directory, model_file), model_type=\"xgboost\"\n", - " )\n", - " worker.data[\"fil_model\"]\n", + " local_model_file = os.path.join(worker.local_directory, model_file)\n", + "\n", + " if os.path.exists(local_model_file):\n", + " worker.data[\"fil_model\"] = ForestInference.load(\n", + " filename=local_model_file, model_type=\"xgboost\"\n", + " )\n", + " else:\n", + " raise FileNotFoundError(f\"Model file {local_model_file} not found on worker.\")\n", "\n", "\n", "def predict(input_df):\n", - " # this function will run in each worker and predict\n", + " # This function will run in each worker and predict\n", " worker = get_worker()\n", - " return worker.data[\"fil_model\"].predict(input_df)\n", + " fil_model = worker.data.get(\"fil_model\")\n", + "\n", + " if fil_model is not None:\n", + " return fil_model.predict(input_df)\n", + " else:\n", + " raise ValueError(\"Model not found in worker.\")\n", "\n", "\n", "def persistModelonWorkers(client, zip_file_name, model_file_name):\n", " import zipfile\n", + " import os\n", "\n", - " zf = zipfile.ZipFile(zip_file_name, mode=\"w\")\n", - " zf.write(f\"./{model_file_name}\")\n", - " zf.close()\n", - " # check to see if local directory present in workers\n", - " # if not present make it\n", - " fut = client.submit(checkOrMakeLocalDir)\n", + " # Create a ZIP file containing the model file\n", + " with zipfile.ZipFile(zip_file_name, mode=\"w\") as zf:\n", + " zf.write(model_file_name)\n", + "\n", + " # Upload the ZIP file to the Dask workers\n", + " fut = client.upload_file(zip_file_name)\n", " wait(fut)\n", - " # upload the zip file in workers\n", - " fut = client.upload_file(f\"./{zip_file_name}\")\n", + "\n", + " # Check to see if the local directory is present on workers and make it if not\n", + " fut = client.submit(checkOrMakeLocalDir)\n", " wait(fut)\n", - " # unzip file in the workers\n", + "\n", + " # Unzip the uploaded file on the workers\n", " fut = client.submit(unzipFile, zip_file_name)\n", " wait(fut)\n", - " # load model using FIL in workers\n", + "\n", + " # Load the model using FIL in workers\n", " fut = client.submit(workerModelInit, model_file_name)\n", " wait(fut)" ] @@ -2422,9 +2369,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 76, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 56.4 ms, sys: 11.8 ms, total: 68.2 ms\n", + "Wall time: 441 ms\n" + ] + } + ], "source": [ "%%time\n", "persistModelonWorkers(client, \"zipfile_write.zip\", \"trained-model_nyctaxi.xgb\")" @@ -2439,23 +2395,51 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 86, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall clock time taken for this cell: 0.1948759630031418 s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/skirui/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/dask/dataframe/core.py:7047: FutureWarning: Meta is not valid, `map_partitions` and `map_overlap` expects output to be a pandas object. Try passing a pandas object as meta or a dict or tuple representing the (name, dtype) of the columns. In the future the meta you passed will not work.\n", + " warnings.warn(\n" + ] + } + ], "source": [ + "# Submit the predict function to the Dask cluster\n", + "predict_future = client.submit(predict, X_infer)\n", + "\n", + "# Map the predict_future to partitions of X_infer\n", "tic = timer()\n", - "predictions = X_infer.map_partitions(predict, meta=\"float\") # this is like MPI reduce\n", + "predictions = X_infer.map_partitions(lambda df: predict_future, meta=\"float\")\n", "y_pred = predictions.compute()\n", "wait(y_pred)\n", "toc = timer()\n", - "print(f\"Wall clock time taken for this cell : {toc-tic} s\")" + "print(f\"Wall clock time taken for this cell: {toc - tic} s\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 87, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "It took 0.1948759630031418 seconds to predict on 5426301 rows using FIL distributedly on each worker\n" + ] + } + ], "source": [ "rows_csv = X_infer.iloc[:, 0].shape[0].compute()\n", "print(\n", @@ -2465,7 +2449,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 88, "metadata": { "editable": true, "slideshow": { @@ -2473,7 +2457,34 @@ }, "tags": [] }, - "outputs": [], + "outputs": [ + { + "ename": "ValueError", + "evalue": "Unsupported dtype object", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[88], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m tic \u001b[38;5;241m=\u001b[39m timer()\n\u001b[0;32m----> 2\u001b[0m score \u001b[38;5;241m=\u001b[39m \u001b[43mmean_squared_error\u001b[49m\u001b[43m(\u001b[49m\u001b[43my_pred\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m_y_test\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m toc \u001b[38;5;241m=\u001b[39m timer()\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFinal - RMSE: \u001b[39m\u001b[38;5;124m\"\u001b[39m, np\u001b[38;5;241m.\u001b[39msqrt(score))\n", + "File \u001b[0;32m~/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/cuml/internals/api_decorators.py:190\u001b[0m, in \u001b[0;36m_make_decorator_function..decorator_function..decorator_closure..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 188\u001b[0m ret \u001b[38;5;241m=\u001b[39m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 189\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 190\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 192\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m cm\u001b[38;5;241m.\u001b[39mprocess_return(ret)\n", + "File \u001b[0;32mregression.pyx:200\u001b[0m, in \u001b[0;36mcuml.metrics.regression.mean_squared_error\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mregression.pyx:111\u001b[0m, in \u001b[0;36mcuml.metrics.regression._prepare_input_reg\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32m~/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/nvtx/nvtx.py:101\u001b[0m, in \u001b[0;36mannotate.__call__..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 100\u001b[0m libnvtx_push_range(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mattributes, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdomain\u001b[38;5;241m.\u001b[39mhandle)\n\u001b[0;32m--> 101\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 102\u001b[0m libnvtx_pop_range(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdomain\u001b[38;5;241m.\u001b[39mhandle)\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n", + "File \u001b[0;32m~/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/cuml/internals/input_utils.py:369\u001b[0m, in \u001b[0;36minput_to_cuml_array\u001b[0;34m(X, order, deepcopy, check_dtype, convert_to_dtype, check_mem_type, convert_to_mem_type, safe_dtype_conversion, check_cols, check_rows, fail_on_order, force_contiguous)\u001b[0m\n\u001b[1;32m 281\u001b[0m \u001b[38;5;129m@nvtx_annotate\u001b[39m(\n\u001b[1;32m 282\u001b[0m message\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcommon.input_utils.input_to_cuml_array\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 283\u001b[0m category\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mutils\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 298\u001b[0m force_contiguous\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 299\u001b[0m ):\n\u001b[1;32m 300\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 301\u001b[0m \u001b[38;5;124;03m Convert input X to CumlArray.\u001b[39;00m\n\u001b[1;32m 302\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 367\u001b[0m \n\u001b[1;32m 368\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 369\u001b[0m arr \u001b[38;5;241m=\u001b[39m \u001b[43mCumlArray\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_input\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 370\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 371\u001b[0m \u001b[43m \u001b[49m\u001b[43morder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43morder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 372\u001b[0m \u001b[43m \u001b[49m\u001b[43mdeepcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdeepcopy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 373\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheck_dtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheck_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 374\u001b[0m \u001b[43m \u001b[49m\u001b[43mconvert_to_dtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert_to_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 375\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheck_mem_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheck_mem_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 376\u001b[0m \u001b[43m \u001b[49m\u001b[43mconvert_to_mem_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert_to_mem_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 377\u001b[0m \u001b[43m \u001b[49m\u001b[43msafe_dtype_conversion\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msafe_dtype_conversion\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 378\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheck_cols\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheck_cols\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 379\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheck_rows\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheck_rows\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 380\u001b[0m \u001b[43m \u001b[49m\u001b[43mfail_on_order\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfail_on_order\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 381\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_contiguous\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce_contiguous\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 382\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 383\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 384\u001b[0m shape \u001b[38;5;241m=\u001b[39m arr\u001b[38;5;241m.\u001b[39m__cuda_array_interface__[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mshape\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n", + "File \u001b[0;32m~/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/cuml/internals/memory_utils.py:87\u001b[0m, in \u001b[0;36mwith_cupy_rmm..cupy_rmm_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 85\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m GPU_ENABLED:\n\u001b[1;32m 86\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m cupy_using_allocator(rmm_cupy_allocator):\n\u001b[0;32m---> 87\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", + "File \u001b[0;32m~/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/nvtx/nvtx.py:101\u001b[0m, in \u001b[0;36mannotate.__call__..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 100\u001b[0m libnvtx_push_range(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mattributes, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdomain\u001b[38;5;241m.\u001b[39mhandle)\n\u001b[0;32m--> 101\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 102\u001b[0m libnvtx_pop_range(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdomain\u001b[38;5;241m.\u001b[39mhandle)\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n", + "File \u001b[0;32m~/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/cuml/internals/array.py:1117\u001b[0m, in \u001b[0;36mCumlArray.from_input\u001b[0;34m(cls, X, order, deepcopy, check_dtype, convert_to_dtype, check_mem_type, convert_to_mem_type, safe_dtype_conversion, check_cols, check_rows, fail_on_order, force_contiguous)\u001b[0m\n\u001b[1;32m 1109\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 1110\u001b[0m (X \u001b[38;5;241m<\u001b[39m target_dtype_range\u001b[38;5;241m.\u001b[39mmin) \u001b[38;5;241m|\u001b[39m (X \u001b[38;5;241m>\u001b[39m target_dtype_range\u001b[38;5;241m.\u001b[39mmax)\n\u001b[1;32m 1111\u001b[0m )\u001b[38;5;241m.\u001b[39many():\n\u001b[1;32m 1112\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[1;32m 1113\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mData type conversion on values outside\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1114\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m representable range of target dtype\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1115\u001b[0m )\n\u001b[1;32m 1116\u001b[0m arr \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcls\u001b[39m(\n\u001b[0;32m-> 1117\u001b[0m \u001b[43marr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto_output\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1118\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_dtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert_to_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1119\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_mem_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert_to_mem_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1120\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m,\n\u001b[1;32m 1121\u001b[0m order\u001b[38;5;241m=\u001b[39mrequested_order,\n\u001b[1;32m 1122\u001b[0m index\u001b[38;5;241m=\u001b[39mindex,\n\u001b[1;32m 1123\u001b[0m validate\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 1124\u001b[0m )\n\u001b[1;32m 1126\u001b[0m make_copy \u001b[38;5;241m=\u001b[39m force_contiguous \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m arr\u001b[38;5;241m.\u001b[39mis_contiguous\n\u001b[1;32m 1128\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 1129\u001b[0m \u001b[38;5;129;01mnot\u001b[39;00m fail_on_order \u001b[38;5;129;01mand\u001b[39;00m order \u001b[38;5;241m!=\u001b[39m arr\u001b[38;5;241m.\u001b[39morder \u001b[38;5;129;01mand\u001b[39;00m order \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mK\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1130\u001b[0m ) \u001b[38;5;129;01mor\u001b[39;00m make_copy:\n", + "File \u001b[0;32m~/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/cuml/internals/memory_utils.py:87\u001b[0m, in \u001b[0;36mwith_cupy_rmm..cupy_rmm_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 85\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m GPU_ENABLED:\n\u001b[1;32m 86\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m cupy_using_allocator(rmm_cupy_allocator):\n\u001b[0;32m---> 87\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", + "File \u001b[0;32m~/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/nvtx/nvtx.py:101\u001b[0m, in \u001b[0;36mannotate.__call__..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 100\u001b[0m libnvtx_push_range(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mattributes, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdomain\u001b[38;5;241m.\u001b[39mhandle)\n\u001b[0;32m--> 101\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 102\u001b[0m libnvtx_pop_range(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdomain\u001b[38;5;241m.\u001b[39mhandle)\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n", + "File \u001b[0;32m~/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/cuml/internals/array.py:625\u001b[0m, in \u001b[0;36mCumlArray.to_output\u001b[0;34m(self, output_type, output_dtype, output_mem_type)\u001b[0m\n\u001b[1;32m 618\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m np\u001b[38;5;241m.\u001b[39masarray(\n\u001b[1;32m 619\u001b[0m \u001b[38;5;28mself\u001b[39m, dtype\u001b[38;5;241m=\u001b[39moutput_dtype, order\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39morder\n\u001b[1;32m 620\u001b[0m )\n\u001b[1;32m 621\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m cp\u001b[38;5;241m.\u001b[39masnumpy(\n\u001b[1;32m 622\u001b[0m cp\u001b[38;5;241m.\u001b[39masarray(\u001b[38;5;28mself\u001b[39m, dtype\u001b[38;5;241m=\u001b[39moutput_dtype, order\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39morder),\n\u001b[1;32m 623\u001b[0m order\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39morder,\n\u001b[1;32m 624\u001b[0m )\n\u001b[0;32m--> 625\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43moutput_mem_type\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mxpy\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43masarray\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 626\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_dtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43morder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43morder\u001b[49m\n\u001b[1;32m 627\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 629\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m output_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnumba\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 630\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m cuda\u001b[38;5;241m.\u001b[39mas_cuda_array(\n\u001b[1;32m 631\u001b[0m cp\u001b[38;5;241m.\u001b[39masarray(\u001b[38;5;28mself\u001b[39m, dtype\u001b[38;5;241m=\u001b[39moutput_dtype, order\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39morder)\n\u001b[1;32m 632\u001b[0m )\n", + "File \u001b[0;32m~/anaconda3/envs/rapids-23.08/lib/python3.10/site-packages/cupy/_creation/from_data.py:75\u001b[0m, in \u001b[0;36masarray\u001b[0;34m(a, dtype, order)\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21masarray\u001b[39m(a, dtype\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, order\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m 50\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Converts an object to array.\u001b[39;00m\n\u001b[1;32m 51\u001b[0m \n\u001b[1;32m 52\u001b[0m \u001b[38;5;124;03m This is equivalent to ``array(a, dtype, copy=False, order=order)``.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 73\u001b[0m \n\u001b[1;32m 74\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 75\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_core\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marray\u001b[49m\u001b[43m(\u001b[49m\u001b[43ma\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43morder\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32mcupy/_core/core.pyx:2376\u001b[0m, in \u001b[0;36mcupy._core.core.array\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mcupy/_core/core.pyx:2400\u001b[0m, in \u001b[0;36mcupy._core.core.array\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mcupy/_core/core.pyx:2527\u001b[0m, in \u001b[0;36mcupy._core.core._array_default\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: Unsupported dtype object" + ] + } + ], "source": [ "tic = timer()\n", "score = mean_squared_error(y_pred, _y_test)\n", @@ -2481,6 +2492,57 @@ "print(\"Final - RMSE: \", np.sqrt(score))" ] }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(dtype('O'), dtype('float32'))" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_pred.dtype, _y_test.dtype" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0