diff --git a/starter/ProfilerReports/benchmark/profiler-report.html b/starter/ProfilerReports/benchmark/profiler-report.html new file mode 100644 index 00000000..cdc457ba --- /dev/null +++ b/starter/ProfilerReports/benchmark/profiler-report.html @@ -0,0 +1,15030 @@ + + + + + + profiler-report + + + + + + + + + + + + + + + + +
+
+
+
+
+
+
+

+ SageMaker Debugger Profiling Report + + ¶ + +

+

+ SageMaker Debugger auto-generated this report. You can generate similar reports on all supported training jobs. The report provides a summary of the training job, system resource usage statistics, framework metrics, a rules summary, and a detailed analysis from each rule. The graphs and tables are interactive. +

+

+ + Legal disclaimer: + + This report and any recommendations are provided for informational purposes only and are not definitive. You are responsible for making your own independent assessment of the information. +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+ In [4]: +
+
+
+
+
# Parameters
+processing_job_arn = "arn:aws:sagemaker:us-east-1:598348623909:processing-job/pytorch-training-2023-04-1-profilerreport-5f46c0a2"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

+ Training job summary + + ¶ + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+

+ System usage statistics + + ¶ + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+

+ Framework metrics summary + + ¶ + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+

+ Rules summary + + ¶ + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

+ The following table shows a profiling summary of the Debugger built-in rules. +The table is sorted by the rules that triggered the most frequently. During your training job, the LowGPUUtilization rule +was the most frequently triggered. It processed 1751 datapoints and was triggered 14 times. +

+
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + Description + + Recommendation + + Number of times rule triggered + + Number of datapoints + + Rule parameters +
+ LowGPUUtilization + + Checks if the GPU utilization is low or fluctuating. This can happen due to bottlenecks, blocking calls for synchronizations, or a small batch size. + + Check if there are bottlenecks, minimize blocking calls, change distributed training strategy, or increase the batch size. + + 14 + + 1751 + + threshold_p95:70 +
+ threshold_p5:10 +
+ window:500 +
+ patience:1000 +
+ BatchSize + + Checks if GPUs are underutilized because the batch size is too small. To detect this problem, the rule analyzes the average GPU memory footprint, the CPU and the GPU utilization. + + The batch size is too small, and GPUs are underutilized. Consider running on a smaller instance type or increasing the batch size. + + 14 + + 1750 + + cpu_threshold_p95:70 +
+ gpu_threshold_p95:70 +
+ gpu_memory_threshold_p95:70 +
+ patience:1000 +
+ window:500 +
+ Dataloader + + Checks how many data loaders are running in parallel and whether the total number is equal to the number of available CPU cores. The rule triggers if the number is much smaller or larger than the number of available cores. If too small, it might lead to low GPU utilization. If too large, it might impact other compute-intensive operations on the CPU. + + Change the number of data loader processes. + + 1 + + 8373 + + min_threshold:70 +
+ max_threshold:200 +
+ MaxInitializationTime + + Checks if the time spent on initialization exceeds a threshold percent of the total training time. The rule waits until the first step of the training loop starts. The initialization can take longer if downloading the entire dataset from Amazon S3 in File mode. The default threshold is 20 minutes. + + Initialization takes too long. If using File mode, consider switching to Pipe mode if you are using the TensorFlow framework. + + 0 + + 0 + + threshold:20 +
+ GPUMemoryIncrease + + Measures the average GPU memory footprint and triggers if there is a large increase. + + Choose a larger instance type with more memory if footprint is close to maximum available memory. + + 0 + + 1751 + + increase:5 +
+ patience:1000 +
+ window:10 +
+ CPUBottleneck + + Checks if the CPU utilization is high and the GPU utilization is low. It might indicate CPU bottlenecks, where the GPUs are waiting for data to arrive from the CPUs. The rule evaluates the CPU and GPU utilization rates, and triggers the issue if the time spent on the CPU bottlenecks exceeds a threshold percent of the total training time. The default threshold is 50 percent. + + Consider increasing the number of data loaders or applying data pre-fetching. + + 0 + + 3514 + + threshold:50 +
+ cpu_threshold:90 +
+ gpu_threshold:10 +
+ patience:1000 +
+ LoadBalancing + + Detects workload balancing issues across GPUs. Workload imbalance can occur in training jobs with data parallelism. The gradients are accumulated on a primary GPU, and this GPU might be overused relative to the other GPUs, which reduces the efficiency of data parallelization. + + Choose a different distributed training strategy or a different distributed training framework. + + 0 + + 1751 + + threshold:0.2 +
+ patience:1000 +
+ IOBottleneck + + Checks if the data I/O wait time is high and the GPU utilization is low. It might indicate IO bottlenecks where GPU is waiting for data to arrive from storage. The rule evaluates the I/O and GPU utilization rates and triggers the issue if the time spent on the IO bottlenecks exceeds a threshold percent of the total training time. The default threshold is 50 percent. + + Pre-fetch data or choose different file formats, such as binary formats that improve I/O performance. + + 0 + + 3514 + + threshold:50 +
+ io_threshold:50 +
+ gpu_threshold:10 +
+ patience:1000 +
+ StepOutlier + + Detects outliers in step duration. The step duration for forward and backward pass should be roughly the same throughout the training. If there are significant outliers, it may indicate a system stall or bottleneck issues. + + Check if there are any bottlenecks (CPU, I/O) correlated to the step outliers. + + 0 + + 0 + + threshold:3 +
+ mode:None +
+ n_outliers:10 +
+ stddev:3 +
+
+
+
+
+
+
+
+
+
+
+
+
+

+ Analyzing the training loop + + ¶ + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

+ Step duration analysis + + ¶ + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+

+ GPU utilization analysis + + ¶ + +

+
+
+
+
+
+
+

+ + Usage per GPU + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+

+ + GPU utilization of gpu0 on node algo-1: + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+

+ + GPU utilization of gpu0 on node algo-2: + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+

+ + Workload balancing + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+

+ Dataloading analysis + + ¶ + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+

+ Batch size + + ¶ + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+

+ CPU bottlenecks + + ¶ + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+

+ I/O bottlenecks + + ¶ + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+

+ GPU memory + + ¶ + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+ +
+ + diff --git a/starter/ProfilerReports/benchmark/profiler-report.ipynb b/starter/ProfilerReports/benchmark/profiler-report.ipynb new file mode 100644 index 00000000..219aee6c --- /dev/null +++ b/starter/ProfilerReports/benchmark/profiler-report.ipynb @@ -0,0 +1,4055 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.018366, + "end_time": "2023-04-10T21:36:45.853355", + "exception": false, + "start_time": "2023-04-10T21:36:45.834989", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# SageMaker Debugger Profiling Report\n", + "\n", + "SageMaker Debugger auto generated this report. You can generate similar reports on all supported training jobs. The report provides summary of training job, system resource usage statistics, framework metrics, rules summary, and detailed analysis from each rule. The graphs and tables are interactive. \n", + "\n", + "**Legal disclaimer:** This report and any recommendations are provided for informational purposes only and are not definitive. 
You are responsible for making your own independent assessment of the information.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:45.896306Z", + "iopub.status.busy": "2023-04-10T21:36:45.895449Z", + "iopub.status.idle": "2023-04-10T21:36:46.472822Z", + "shell.execute_reply": "2023-04-10T21:36:46.473229Z" + }, + "papermill": { + "duration": 0.602697, + "end_time": "2023-04-10T21:36:46.473456", + "exception": false, + "start_time": "2023-04-10T21:36:45.870759", + "status": "completed" + }, + "tags": [ + "hide-output", + "hide-input" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-04-10 21:36:46.465 ip-10-0-197-104.ec2.internal:407 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: /opt/ml/processing/input/profiler/signals/ProfilerReport\n" + ] + } + ], + "source": [ + "import json\n", + "import pandas as pd\n", + "import glob\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import datetime\n", + "from smdebug.profiler.utils import us_since_epoch_to_human_readable_time, ns_since_epoch_to_human_readable_time\n", + "from smdebug.core.utils import setup_profiler_report\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:46.514268Z", + "iopub.status.busy": "2023-04-10T21:36:46.513764Z", + "iopub.status.idle": "2023-04-10T21:36:46.710158Z", + "shell.execute_reply": "2023-04-10T21:36:46.709718Z" + }, + "papermill": { + "duration": 0.219068, + "end_time": "2023-04-10T21:36:46.710270", + "exception": false, + "start_time": "2023-04-10T21:36:46.491202", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + "(function(root) {\n", + " function now() {\n", + " return new Date();\n", + " }\n", + "\n", + " var force = true;\n", + "\n", + " if (typeof 
root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n", + " root._bokeh_onload_callbacks = [];\n", + " root._bokeh_is_loading = undefined;\n", + " }\n", + "\n", + " var JS_MIME_TYPE = 'application/javascript';\n", + " var HTML_MIME_TYPE = 'text/html';\n", + " var EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n", + " var CLASS_NAME = 'output_bokeh rendered_html';\n", + "\n", + " /**\n", + " * Render data to the DOM node\n", + " */\n", + " function render(props, node) {\n", + " var script = document.createElement(\"script\");\n", + " node.appendChild(script);\n", + " }\n", + "\n", + " /**\n", + " * Handle when an output is cleared or removed\n", + " */\n", + " function handleClearOutput(event, handle) {\n", + " var cell = handle.cell;\n", + "\n", + " var id = cell.output_area._bokeh_element_id;\n", + " var server_id = cell.output_area._bokeh_server_id;\n", + " // Clean up Bokeh references\n", + " if (id != null && id in Bokeh.index) {\n", + " Bokeh.index[id].model.document.clear();\n", + " delete Bokeh.index[id];\n", + " }\n", + "\n", + " if (server_id !== undefined) {\n", + " // Clean up Bokeh references\n", + " var cmd = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n", + " cell.notebook.kernel.execute(cmd, {\n", + " iopub: {\n", + " output: function(msg) {\n", + " var id = msg.content.text.trim();\n", + " if (id in Bokeh.index) {\n", + " Bokeh.index[id].model.document.clear();\n", + " delete Bokeh.index[id];\n", + " }\n", + " }\n", + " }\n", + " });\n", + " // Destroy server and session\n", + " var cmd = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n", + " cell.notebook.kernel.execute(cmd);\n", + " }\n", + " }\n", + "\n", + " /**\n", + " * Handle when a new output is added\n", + " */\n", + " function handleAddOutput(event, handle) {\n", + " var output_area = handle.output_area;\n", + " var output = 
handle.output;\n", + "\n", + " // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n", + " if ((output.output_type != \"display_data\") || (!output.data.hasOwnProperty(EXEC_MIME_TYPE))) {\n", + " return\n", + " }\n", + "\n", + " var toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n", + "\n", + " if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n", + " toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n", + " // store reference to embed id on output_area\n", + " output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n", + " }\n", + " if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n", + " var bk_div = document.createElement(\"div\");\n", + " bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n", + " var script_attrs = bk_div.children[0].attributes;\n", + " for (var i = 0; i < script_attrs.length; i++) {\n", + " toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n", + " toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n", + " }\n", + " // store reference to server id on output_area\n", + " output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n", + " }\n", + " }\n", + "\n", + " function register_renderer(events, OutputArea) {\n", + "\n", + " function append_mime(data, metadata, element) {\n", + " // create a DOM node to render to\n", + " var toinsert = this.create_output_subarea(\n", + " metadata,\n", + " CLASS_NAME,\n", + " EXEC_MIME_TYPE\n", + " );\n", + " this.keyboard_manager.register_events(toinsert);\n", + " // Render to node\n", + " var props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n", + " render(props, toinsert[toinsert.length - 1]);\n", + " element.append(toinsert);\n", + " return toinsert\n", + " }\n", + "\n", + " /* Handle when an output is cleared or removed */\n", + " events.on('clear_output.CodeCell', 
handleClearOutput);\n", + " events.on('delete.Cell', handleClearOutput);\n", + "\n", + " /* Handle when a new output is added */\n", + " events.on('output_added.OutputArea', handleAddOutput);\n", + "\n", + " /**\n", + " * Register the mime type and append_mime function with output_area\n", + " */\n", + " OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n", + " /* Is output safe? */\n", + " safe: true,\n", + " /* Index of renderer in `output_area.display_order` */\n", + " index: 0\n", + " });\n", + " }\n", + "\n", + " // register the mime type if in Jupyter Notebook environment and previously unregistered\n", + " if (root.Jupyter !== undefined) {\n", + " var events = require('base/js/events');\n", + " var OutputArea = require('notebook/js/outputarea').OutputArea;\n", + "\n", + " if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n", + " register_renderer(events, OutputArea);\n", + " }\n", + " }\n", + "\n", + " \n", + " if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n", + " root._bokeh_timeout = Date.now() + 5000;\n", + " root._bokeh_failed_load = false;\n", + " }\n", + "\n", + " var NB_LOAD_WARNING = {'data': {'text/html':\n", + " \"
\\n\"+\n", + " \"

\\n\"+\n", + " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", + " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", + " \"

\\n\"+\n", + " \"\\n\"+\n", + " \"\\n\"+\n", + " \"from bokeh.resources import INLINE\\n\"+\n", + " \"output_notebook(resources=INLINE)\\n\"+\n", + " \"\\n\"+\n", + " \"
\"}};\n", + "\n", + " function display_loaded() {\n", + " var el = document.getElementById(null);\n", + " if (el != null) {\n", + " el.textContent = \"BokehJS is loading...\";\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " if (el != null) {\n", + " el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n", + " }\n", + " } else if (Date.now() < root._bokeh_timeout) {\n", + " setTimeout(display_loaded, 100)\n", + " }\n", + " }\n", + "\n", + "\n", + " function run_callbacks() {\n", + " try {\n", + " root._bokeh_onload_callbacks.forEach(function(callback) {\n", + " if (callback != null)\n", + " callback();\n", + " });\n", + " } finally {\n", + " delete root._bokeh_onload_callbacks\n", + " }\n", + " console.debug(\"Bokeh: all callbacks have finished\");\n", + " }\n", + "\n", + " function load_libs(css_urls, js_urls, callback) {\n", + " if (css_urls == null) css_urls = [];\n", + " if (js_urls == null) js_urls = [];\n", + "\n", + " root._bokeh_onload_callbacks.push(callback);\n", + " if (root._bokeh_is_loading > 0) {\n", + " console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", + " return null;\n", + " }\n", + " if (js_urls == null || js_urls.length === 0) {\n", + " run_callbacks();\n", + " return null;\n", + " }\n", + " console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", + " root._bokeh_is_loading = css_urls.length + js_urls.length;\n", + "\n", + " function on_load() {\n", + " root._bokeh_is_loading--;\n", + " if (root._bokeh_is_loading === 0) {\n", + " console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n", + " run_callbacks()\n", + " }\n", + " }\n", + "\n", + " function on_error() {\n", + " console.error(\"failed to load \" + url);\n", + " }\n", + "\n", + " for (var i = 0; i < css_urls.length; i++) {\n", + " var url = css_urls[i];\n", + " const element = document.createElement(\"link\");\n", + " element.onload = on_load;\n", + " element.onerror = 
on_error;\n", + " element.rel = \"stylesheet\";\n", + " element.type = \"text/css\";\n", + " element.href = url;\n", + " console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n", + " document.body.appendChild(element);\n", + " }\n", + "\n", + " const hashes = {\"https://cdn.bokeh.org/bokeh/release/bokeh-2.2.3.min.js\": \"T2yuo9Oe71Cz/I4X9Ac5+gpEa5a8PpJCDlqKYO0CfAuEszu1JrXLl8YugMqYe3sM\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.2.3.min.js\": \"98GDGJ0kOMCUMUePhksaQ/GYgB3+NH9h996V88sh3aOiUNX3N+fLXAtry6xctSZ6\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.2.3.min.js\": \"89bArO+nlbP3sgakeHjCo1JYxYR5wufVgA3IbUvDY+K7w4zyxJqssu7wVnfeKCq8\"};\n", + "\n", + " for (var i = 0; i < js_urls.length; i++) {\n", + " var url = js_urls[i];\n", + " var element = document.createElement('script');\n", + " element.onload = on_load;\n", + " element.onerror = on_error;\n", + " element.async = false;\n", + " element.src = url;\n", + " if (url in hashes) {\n", + " element.crossOrigin = \"anonymous\";\n", + " element.integrity = \"sha384-\" + hashes[url];\n", + " }\n", + " console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", + " document.head.appendChild(element);\n", + " }\n", + " };\n", + "\n", + " function inject_raw_css(css) {\n", + " const element = document.createElement(\"style\");\n", + " element.appendChild(document.createTextNode(css));\n", + " document.body.appendChild(element);\n", + " }\n", + "\n", + " \n", + " var js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-2.2.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.2.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.2.3.min.js\"];\n", + " var css_urls = [];\n", + " \n", + "\n", + " var inline_js = [\n", + " function(Bokeh) {\n", + " Bokeh.set_log_level(\"info\");\n", + " },\n", + " function(Bokeh) {\n", + " \n", + " \n", + " }\n", + " ];\n", + "\n", + " function run_inline_js() {\n", + " \n", + " if (root.Bokeh !== 
undefined || force === true) {\n", + " \n", + " for (var i = 0; i < inline_js.length; i++) {\n", + " inline_js[i].call(root, root.Bokeh);\n", + " }\n", + " } else if (Date.now() < root._bokeh_timeout) {\n", + " setTimeout(run_inline_js, 100);\n", + " } else if (!root._bokeh_failed_load) {\n", + " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", + " root._bokeh_failed_load = true;\n", + " } else if (force !== true) {\n", + " var cell = $(document.getElementById(null)).parents('.cell').data().cell;\n", + " cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", + " }\n", + "\n", + " }\n", + "\n", + " if (root._bokeh_is_loading === 0) {\n", + " console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", + " run_inline_js();\n", + " } else {\n", + " load_libs(css_urls, js_urls, function() {\n", + " console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n", + " run_inline_js();\n", + " });\n", + " }\n", + "}(window));" + ], + "application/vnd.bokehjs_load.v0+json": "\n(function(root) {\n function now() {\n return new Date();\n }\n\n var force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\n \n\n \n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n var NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded() {\n var el = document.getElementById(null);\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error() {\n console.error(\"failed to load \" + url);\n }\n\n for (var i = 0; i < css_urls.length; i++) {\n var url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error;\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n const hashes = {\"https://cdn.bokeh.org/bokeh/release/bokeh-2.2.3.min.js\": \"T2yuo9Oe71Cz/I4X9Ac5+gpEa5a8PpJCDlqKYO0CfAuEszu1JrXLl8YugMqYe3sM\", 
\"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.2.3.min.js\": \"98GDGJ0kOMCUMUePhksaQ/GYgB3+NH9h996V88sh3aOiUNX3N+fLXAtry6xctSZ6\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.2.3.min.js\": \"89bArO+nlbP3sgakeHjCo1JYxYR5wufVgA3IbUvDY+K7w4zyxJqssu7wVnfeKCq8\"};\n\n for (var i = 0; i < js_urls.length; i++) {\n var url = js_urls[i];\n var element = document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error;\n element.async = false;\n element.src = url;\n if (url in hashes) {\n element.crossOrigin = \"anonymous\";\n element.integrity = \"sha384-\" + hashes[url];\n }\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n \n var js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-2.2.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.2.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.2.3.min.js\"];\n var css_urls = [];\n \n\n var inline_js = [\n function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\n function(Bokeh) {\n \n \n }\n ];\n\n function run_inline_js() {\n \n if (root.Bokeh !== undefined || force === true) {\n \n for (var i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n var cell = $(document.getElementById(null)).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n 
run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import bokeh\n", + "from bokeh.io import output_notebook, show\n", + "from bokeh.layouts import column, row\n", + "from bokeh.plotting import figure\n", + "from bokeh.models.widgets import DataTable, DateFormatter, TableColumn\n", + "from bokeh.models import ColumnDataSource, PreText\n", + "from math import pi\n", + "from bokeh.transform import cumsum\n", + "import warnings\n", + "from bokeh.models.widgets import Paragraph\n", + "from bokeh.models import Legend\n", + "from bokeh.util.warnings import BokehDeprecationWarning, BokehUserWarning\n", + "warnings.simplefilter('ignore', BokehDeprecationWarning)\n", + "warnings.simplefilter('ignore', BokehUserWarning)\n", + "\n", + "output_notebook(hide_banner=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:46.749887Z", + "iopub.status.busy": "2023-04-10T21:36:46.749364Z", + "iopub.status.idle": "2023-04-10T21:36:46.751166Z", + "shell.execute_reply": "2023-04-10T21:36:46.751540Z" + }, + "papermill": { + "duration": 0.023262, + "end_time": "2023-04-10T21:36:46.751665", + "exception": false, + "start_time": "2023-04-10T21:36:46.728403", + "status": "completed" + }, + "tags": [ + "parameters", + "hide-input", + "hide-output" + ] + }, + "outputs": [], + "source": [ + "processing_job_arn = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "66f3a526", + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:46.790432Z", + "iopub.status.busy": "2023-04-10T21:36:46.789953Z", + "iopub.status.idle": "2023-04-10T21:36:46.791690Z", + "shell.execute_reply": "2023-04-10T21:36:46.792067Z" + }, + "papermill": { + "duration": 0.022733, + 
"end_time": "2023-04-10T21:36:46.792188", + "exception": false, + "start_time": "2023-04-10T21:36:46.769455", + "status": "completed" + }, + "tags": [ + "injected-parameters" + ] + }, + "outputs": [], + "source": [ + "# Parameters\n", + "processing_job_arn = \"arn:aws:sagemaker:us-east-1:598348623909:processing-job/pytorch-training-2023-04-1-profilerreport-5f46c0a2\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:46.831123Z", + "iopub.status.busy": "2023-04-10T21:36:46.830651Z", + "iopub.status.idle": "2023-04-10T21:36:46.832738Z", + "shell.execute_reply": "2023-04-10T21:36:46.832268Z" + }, + "papermill": { + "duration": 0.022822, + "end_time": "2023-04-10T21:36:46.832840", + "exception": false, + "start_time": "2023-04-10T21:36:46.810018", + "status": "completed" + }, + "tags": [ + "hide-input", + "hide-output" + ] + }, + "outputs": [], + "source": [ + "setup_profiler_report(processing_job_arn)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:46.876438Z", + "iopub.status.busy": "2023-04-10T21:36:46.875941Z", + "iopub.status.idle": "2023-04-10T21:36:46.877676Z", + "shell.execute_reply": "2023-04-10T21:36:46.878061Z" + }, + "papermill": { + "duration": 0.027248, + "end_time": "2023-04-10T21:36:46.878186", + "exception": false, + "start_time": "2023-04-10T21:36:46.850938", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "def create_piechart(data_dict, title=None, height=400, width=400, x1=0, x2=0.1, radius=0.4, toolbar_location='right'):\n", + " \n", + " plot = figure(plot_height=height, \n", + " plot_width=width,\n", + " toolbar_location=toolbar_location,\n", + " tools=\"hover,wheel_zoom,reset,pan\", \n", + " tooltips=\"@phase:@value\", \n", + " title=title,\n", + " x_range=(-radius-x1, radius+x2))\n", + "\n", + " data = 
pd.Series(data_dict).reset_index(name='value').rename(columns={'index':'phase'})\n", + " data['angle'] = data['value']/data['value'].sum() * 2*pi\n", + " data['color'] = bokeh.palettes.viridis(len(data_dict))\n", + "\n", + " plot.wedge(x=0, y=0., radius=radius,\n", + " start_angle=cumsum('angle', include_zero=True), \n", + " end_angle=cumsum('angle'),\n", + " line_color=\"white\", \n", + " source=data, \n", + " fill_color='color', \n", + " legend='phase'\n", + " )\n", + " plot.legend.label_text_font_size = \"8pt\"\n", + " plot.legend.location = 'center_right'\n", + " plot.axis.axis_label=None\n", + " plot.axis.visible=False\n", + " plot.grid.grid_line_color = None\n", + " plot.outline_line_color = \"white\"\n", + " \n", + " return plot" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:46.918186Z", + "iopub.status.busy": "2023-04-10T21:36:46.917703Z", + "iopub.status.idle": "2023-04-10T21:36:46.919369Z", + "shell.execute_reply": "2023-04-10T21:36:46.919746Z" + }, + "papermill": { + "duration": 0.02369, + "end_time": "2023-04-10T21:36:46.919865", + "exception": false, + "start_time": "2023-04-10T21:36:46.896175", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "from IPython.display import display, HTML, Markdown, Image\n", + "def pretty_print(df):\n", + " raw_html = df.to_html().replace(\"\\\\n\",\"
\").replace('','')\n", + " return display(HTML(raw_html))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.017969, + "end_time": "2023-04-10T21:36:46.955899", + "exception": false, + "start_time": "2023-04-10T21:36:46.937930", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Training job summary" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:46.996450Z", + "iopub.status.busy": "2023-04-10T21:36:46.995968Z", + "iopub.status.idle": "2023-04-10T21:36:46.997795Z", + "shell.execute_reply": "2023-04-10T21:36:46.998192Z" + }, + "papermill": { + "duration": 0.0241, + "end_time": "2023-04-10T21:36:46.998315", + "exception": false, + "start_time": "2023-04-10T21:36:46.974215", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "def load_report(rule_name):\n", + " try:\n", + " report = json.load(open('/opt/ml/processing/output/rule/profiler-output/profiler-reports/'+rule_name+'.json'))\n", + " return report\n", + " except FileNotFoundError:\n", + " print (rule_name + ' not triggered')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:47.046687Z", + "iopub.status.busy": "2023-04-10T21:36:47.045081Z", + "iopub.status.idle": "2023-04-10T21:36:47.048376Z", + "shell.execute_reply": "2023-04-10T21:36:47.048754Z" + }, + "papermill": { + "duration": 0.032231, + "end_time": "2023-04-10T21:36:47.048880", + "exception": false, + "start_time": "2023-04-10T21:36:47.016649", + "status": "completed" + }, + "tags": [ + "hide-input", + "hide-output" + ] + }, + "outputs": [], + "source": [ + "\n", + "job_statistics = {}\n", + "report = load_report('MaxInitializationTime')\n", + "if report:\n", + " if \"first\" in report['Details'][\"step_num\"] and \"last\" in report['Details'][\"step_num\"]:\n", + " first_step = 
report['Details'][\"step_num\"][\"first\"]\n", + " last_step = report['Details'][\"step_num\"][\"last\"]\n", + " tmp = us_since_epoch_to_human_readable_time(report['Details']['job_start'] * 1000000)\n", + " date = datetime.datetime.strptime(tmp, '%Y-%m-%dT%H:%M:%S:%f')\n", + " day = date.date().strftime(\"%m/%d/%Y\")\n", + " hour = date.time().strftime(\"%H:%M:%S\")\n", + " job_statistics[\"Start time\"] = f\"{hour} {day}\"\n", + " tmp = us_since_epoch_to_human_readable_time(report['Details']['job_end'] * 1000000)\n", + " date = datetime.datetime.strptime(tmp, '%Y-%m-%dT%H:%M:%S:%f')\n", + " day = date.date().strftime(\"%m/%d/%Y\")\n", + " hour = date.time().strftime(\"%H:%M:%S\")\n", + " job_statistics[\"End time\"] = f\"{hour} {day}\"\n", + " job_duration_in_seconds = int(report['Details']['job_end'] - report['Details']['job_start']) \n", + " job_statistics[\"Job duration\"] = f\"{job_duration_in_seconds} seconds\"\n", + " if \"first\" in report['Details'][\"step_num\"] and \"last\" in report['Details'][\"step_num\"]:\n", + " tmp = us_since_epoch_to_human_readable_time(first_step)\n", + " date = datetime.datetime.strptime(tmp, '%Y-%m-%dT%H:%M:%S:%f')\n", + " day = date.date().strftime(\"%m/%d/%Y\")\n", + " hour = date.time().strftime(\"%H:%M:%S\")\n", + " job_statistics[\"Training loop start\"] = f\"{hour} {day}\"\n", + " tmp = us_since_epoch_to_human_readable_time(last_step)\n", + " date = datetime.datetime.strptime(tmp, '%Y-%m-%dT%H:%M:%S:%f')\n", + " day = date.date().strftime(\"%m/%d/%Y\")\n", + " hour = date.time().strftime(\"%H:%M:%S\")\n", + " job_statistics[\"Training loop end\"] = f\"{hour} {day}\"\n", + " training_loop_duration_in_seconds = int((last_step - first_step) / 1000000)\n", + " job_statistics[\"Training loop duration\"] = f\"{training_loop_duration_in_seconds} seconds\"\n", + " initialization_in_seconds = int(first_step/1000000 - report['Details']['job_start'])\n", + " job_statistics[\"Initialization time\"] = f\"{initialization_in_seconds} 
seconds\"\n", + " finalization_in_seconds = int(np.abs(report['Details']['job_end'] - last_step/1000000))\n", + " job_statistics[\"Finalization time\"] = f\"{finalization_in_seconds} seconds\"\n", + " initialization_perc = int(initialization_in_seconds / job_duration_in_seconds * 100)\n", + " job_statistics[\"Initialization\"] = f\"{initialization_perc} %\"\n", + " training_loop_perc = int(training_loop_duration_in_seconds / job_duration_in_seconds * 100)\n", + " job_statistics[\"Training loop\"] = f\"{training_loop_perc} %\"\n", + " finalization_perc = int(finalization_in_seconds / job_duration_in_seconds * 100)\n", + " job_statistics[\"Finalization\"] = f\"{finalization_perc} %\"" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:47.097578Z", + "iopub.status.busy": "2023-04-10T21:36:47.093154Z", + "iopub.status.idle": "2023-04-10T21:36:47.109492Z", + "shell.execute_reply": "2023-04-10T21:36:47.109917Z" + }, + "papermill": { + "duration": 0.042888, + "end_time": "2023-04-10T21:36:47.110049", + "exception": false, + "start_time": "2023-04-10T21:36:47.067161", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"e3441a89-9fd5-4718-8738-5aaf05128577\":{\"roots\":{\"references\":[{\"attributes\":{\"children\":[{\"id\":\"1006\"},{\"id\":\"1007\"}]},\"id\":\"1008\",\"type\":\"Column\"},{\"attributes\":{\"text\":\"The following table gives a summary about the training job. The table includes information about when the training job started and ended, how much time initialization, training loop and finalization took. \\n Your training job started on 04/10/2023 at 21:21:23 and ran for 876 seconds. \\n Your training job started on 04/10/2023 at 21:21:23 and ran for 876 seconds.. No step information was profiled from your training job. The time spent on initialization and finalization cannot be computed.\",\"width\":800},\"id\":\"1006\",\"type\":\"Paragraph\"},{\"attributes\":{\"data\":{\"0\":[\"Start time\",\"End time\",\"Job duration\"],\"1\":[\"21:21:23 04/10/2023\",\"21:35:59 04/10/2023\",\"876 
seconds\"],\"index\":[0,1,2]},\"selected\":{\"id\":\"1014\"},\"selection_policy\":{\"id\":\"1013\"}},\"id\":\"1001\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1009\",\"type\":\"StringFormatter\"},{\"attributes\":{\"editor\":{\"id\":\"1010\"},\"field\":\"0\",\"formatter\":{\"id\":\"1009\"},\"title\":\"\"},\"id\":\"1002\",\"type\":\"TableColumn\"},{\"attributes\":{\"children\":[{\"id\":\"1004\"}]},\"id\":\"1007\",\"type\":\"Row\"},{\"attributes\":{},\"id\":\"1011\",\"type\":\"StringFormatter\"},{\"attributes\":{},\"id\":\"1012\",\"type\":\"StringEditor\"},{\"attributes\":{},\"id\":\"1010\",\"type\":\"StringEditor\"},{\"attributes\":{\"source\":{\"id\":\"1001\"}},\"id\":\"1005\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1013\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"columns\":[{\"id\":\"1002\"},{\"id\":\"1003\"}],\"height\":380,\"source\":{\"id\":\"1001\"},\"view\":{\"id\":\"1005\"},\"width\":450},\"id\":\"1004\",\"type\":\"DataTable\"},{\"attributes\":{\"editor\":{\"id\":\"1012\"},\"field\":\"1\",\"formatter\":{\"id\":\"1011\"},\"title\":\"Job Statistics\"},\"id\":\"1003\",\"type\":\"TableColumn\"},{\"attributes\":{},\"id\":\"1014\",\"type\":\"Selection\"}],\"root_ids\":[\"1008\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"e3441a89-9fd5-4718-8738-5aaf05128577\",\"root_ids\":[\"1008\"],\"roots\":{\"1008\":\"92bf97d7-7f65-4fd8-b5b9-a66b90438466\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is 
missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1008" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "if report:\n", + " text = \"\"\"The following table gives a summary about the training job. The table includes information about when the training job started and ended, how much time initialization, training loop and finalization took.\"\"\"\n", + " if len(job_statistics) > 0:\n", + " df = pd.DataFrame.from_dict(job_statistics, orient='index')\n", + " start_time = us_since_epoch_to_human_readable_time(report['Details']['job_start'] * 1000000)\n", + " date = datetime.datetime.strptime(start_time, '%Y-%m-%dT%H:%M:%S:%f')\n", + " day = date.date().strftime(\"%m/%d/%Y\")\n", + " hour = date.time().strftime(\"%H:%M:%S\")\n", + " duration = job_duration_in_seconds\n", + " text = f\"\"\"{text} \\n Your training job started on {day} at {hour} and ran for {duration} seconds.\"\"\"\n", + "\n", + " #pretty_print(df)\n", + " if \"first\" in report['Details'][\"step_num\"] and \"last\" in report['Details'][\"step_num\"]:\n", + " if finalization_perc < 0:\n", + " job_statistics[\"Finalization\"] = 0\n", + " if training_loop_perc < 0:\n", + " job_statistics[\"Training loop\"] = 0\n", + " if initialization_perc < 0:\n", + " job_statistics[\"Initialization\"] = 0\n", + " else:\n", + " text = f\"\"\"{text} \\n Your training job started on {day} at {hour} and ran for {duration} seconds.\"\"\"\n", + " \n", + " if len(job_statistics) > 0:\n", + " df2 = df.reset_index()\n", + " df2.columns = [\"0\", \"1\"]\n", + " source = ColumnDataSource(data=df2)\n", + " columns = [TableColumn(field='0', title=\"\"),\n", + " TableColumn(field='1', title=\"Job Statistics\"),]\n", + " table = DataTable(source=source, columns=columns, width=450, height=380)\n", + "\n", + " plot = None\n", + "\n", + " if \"Initialization\" in 
job_statistics:\n", + " piechart_data = {}\n", + " piechart_data[\"Initialization\"] = initialization_perc \n", + " piechart_data[\"Training loop\"] = training_loop_perc\n", + " piechart_data[\"Finalization\"] = finalization_perc \n", + "\n", + " plot = create_piechart(piechart_data, \n", + " height=350,\n", + " width=500,\n", + " x1=0.15,\n", + " x2=0.15,\n", + " radius=0.15, \n", + " toolbar_location=None)\n", + "\n", + " if plot != None:\n", + " paragraph = Paragraph(text=f\"\"\"{text}\"\"\", width = 800)\n", + " show(column(paragraph, row(table, plot)))\n", + " else:\n", + " paragraph = Paragraph(text=f\"\"\"{text} No step information was profiled from your training job. The time spent on initialization and finalization cannot be computed.\"\"\" , width = 800)\n", + " show(column(paragraph, row(table)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.018991, + "end_time": "2023-04-10T21:36:47.148429", + "exception": false, + "start_time": "2023-04-10T21:36:47.129438", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## System usage statistics" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:47.189994Z", + "iopub.status.busy": "2023-04-10T21:36:47.189464Z", + "iopub.status.idle": "2023-04-10T21:36:47.191935Z", + "shell.execute_reply": "2023-04-10T21:36:47.191521Z" + }, + "papermill": { + "duration": 0.024692, + "end_time": "2023-04-10T21:36:47.192038", + "exception": false, + "start_time": "2023-04-10T21:36:47.167346", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "report = load_report('OverallSystemUsage')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:47.238842Z", + "iopub.status.busy": "2023-04-10T21:36:47.238317Z", + "iopub.status.idle": "2023-04-10T21:36:47.240236Z", + 
"shell.execute_reply": "2023-04-10T21:36:47.240616Z" + }, + "papermill": { + "duration": 0.02954, + "end_time": "2023-04-10T21:36:47.240744", + "exception": false, + "start_time": "2023-04-10T21:36:47.211204", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "text1 = ''\n", + "if report:\n", + " if \"GPU\" in report[\"Details\"]:\n", + " for node_id in report[\"Details\"][\"GPU\"]:\n", + " gpu_p95 = report[\"Details\"][\"GPU\"][node_id][\"p95\"]\n", + " gpu_p50 = report[\"Details\"][\"GPU\"][node_id][\"p50\"]\n", + " cpu_p95 = report[\"Details\"][\"CPU\"][node_id][\"p95\"]\n", + " cpu_p50 = report[\"Details\"][\"CPU\"][node_id][\"p50\"]\n", + " \n", + " if gpu_p95 < 70 and cpu_p95 < 70:\n", + " text1 = f\"\"\"{text1}The 95th percentile of the total GPU utilization on node {node_id} is only {int(gpu_p95)}%. \n", + " The 95th percentile of the total CPU utilization is only {int(cpu_p95)}%. Node {node_id} is underutilized. \n", + " You may want to consider switching to a smaller instance type.\"\"\"\n", + " elif gpu_p95 < 70 and cpu_p95 > 70:\n", + " text1 = f\"\"\"{text1}The 95th percentile of the total GPU utilization on node {node_id} is only {int(gpu_p95)}%. \n", + " However, the 95th percentile of the total CPU utilization is {int(cpu_p95)}%. GPUs on node {node_id} are underutilized, \n", + " likely because of CPU bottlenecks.\"\"\"\n", + " elif gpu_p50 > 70:\n", + " text1 = f\"\"\"{text1}The median total GPU utilization on node {node_id} is {int(gpu_p50)}%. \n", + " GPUs on node {node_id} are well utilized.\"\"\"\n", + " else:\n", + " text1 = f\"\"\"{text1}The median total GPU utilization on node {node_id} is {int(gpu_p50)}%. 
\n", + " The median total CPU utilization is {int(cpu_p50)}%.\"\"\"\n", + " else:\n", + " for node_id in report[\"Details\"][\"CPU\"]:\n", + " cpu_p95 = report[\"Details\"][\"CPU\"][node_id][\"p95\"]\n", + " if cpu_p95 > 70:\n", + " text1 = f\"\"\"{text1}The 95th percentile of the total CPU utilization on node {node_id} is {int(cpu_p95)}%. CPUs on node {node_id} are well utilized.\"\"\"\n", + " text1 = Paragraph(text=f\"\"\"{text1}\"\"\", width=1100)\n", + " text2 = Paragraph(text=f\"\"\"The following table shows statistics of resource utilization per worker (node), \n", + " such as the total CPU and GPU utilization, and the memory utilization on CPU and GPU. \n", + " The table also includes the total I/O wait time and the total amount of data sent or received in bytes.\n", + " The table shows min and max values as well as p99, p95 and p50 percentiles.\"\"\", width=900)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:47.289834Z", + "iopub.status.busy": "2023-04-10T21:36:47.288303Z", + "iopub.status.idle": "2023-04-10T21:36:47.308134Z", + "shell.execute_reply": "2023-04-10T21:36:47.308514Z" + }, + "papermill": { + "duration": 0.048463, + "end_time": "2023-04-10T21:36:47.308642", + "exception": false, + "start_time": "2023-04-10T21:36:47.260179", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"9991a4c1-c273-472d-a7a5-448d291501fa\":{\"roots\":{\"references\":[{\"attributes\":{\"children\":[{\"id\":\"1045\"},{\"id\":\"1046\"},{\"id\":\"1058\"}]},\"id\":\"1059\",\"type\":\"Column\"},{\"attributes\":{},\"id\":\"1068\",\"type\":\"StringFormatter\"},{\"attributes\":{},\"id\":\"1075\",\"type\":\"StringEditor\"},{\"attributes\":{\"editor\":{\"id\":\"1073\"},\"field\":\"max\",\"formatter\":{\"id\":\"1072\"},\"title\":\"max\"},\"id\":\"1051\",\"type\":\"TableColumn\"},{\"attributes\":{},\"id\":\"1076\",\"type\":\"StringFormatter\"},{\"attributes\":{},\"id\":\"1074\",\"type\":\"StringFormatter\"},{\"attributes\":{},\"id\":\"1079\",\"type\":\"StringEditor\"},{\"attributes\":{\"editor\":{\"id\":\"1069\"},\"field\":\"metric\",\"formatter\":{\"id\":\"1068\"},\"title\":\"metric\"},\"id\":\"1049\",\"type\":\"TableColumn\"},{\"attributes\":{\"editor\":{\"id\":\"1077\"},\"field\":\"p95\",\"formatter\":{\"id\":\"1076\"},\"title\":\"p95\"},\"id\":\"1053\",\"type\":\"TableColumn\"},{\"attributes\":{\"editor\":{\"id\":\"1081\"},\"field\":\"min\",\"formatter\":{\"id\":\"1080\"},\"title\":\"min\"},\"id\":\"1055\",\"type\":\"TableColumn\"},{\"attributes\":{},\"id\":\"1080\",\"type\":\"StringFormatter\"},{\"attributes\":{\"editor\":{\"id\":\"1067\"},\"field\":\"Node\",\"formatter\":{\"id\":\"1066\"},\"title\":\"node\"},\"id\":\"1048\",\"type\":\"TableColumn\"},{\"attributes\":{},\"id\":\"1071\",\"type\":\"StringEditor\"},{\"attributes\":{\"data\":{\"Node\":[\"algo-1\",\"algo-2\",\"algo-1\",\"algo-2\",\"algo-1\",\"algo-2\",\"algo-1\",\"algo-2\",\"algo-1\",\"algo-2\",\"algo-1\",\"algo-2\"],\"index\":[0,1,2,3,4,5,6,7,8,9,10,11],\"level_0\":[0,1,2,3,4,5,6,7,8,9,10,11],\"max\":{\"__ndarray__\":\"ZmZmIk5flkGuR+EnxkSXQQAAAAAAAExAAAAAAAAATEAAAAAAAABZQAAAAAAAAFlASOF6FK6HQEApXI/C9UhA
QAAAAAAAAEVAAAAAAACAREAzMzMzMxNDQFyPwvUovENA\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[12]},\"metric\":[\"Network\",\"Network\",\"GPU\",\"GPU\",\"CPU\",\"CPU\",\"CPU memory\",\"CPU memory\",\"GPU memory\",\"GPU memory\",\"I/O\",\"I/O\"],\"min\":{\"__ndarray__\":\"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABcj8L1KFzfP2ZmZmZmZhJAmpmZmZmZEUDXo3A9CtcRQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[12]},\"p50\":{\"__ndarray__\":\"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAD9AAAAAAAAAP0BI4XoUrodEQHsUrkfhmkRAhetRuB7FPUD2KFyPwrU9QAAAAAAAADZAAAAAAAAANkAAAAAAAAAAAAAAAAAAAAAA\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[12]},\"p95\":{\"__ndarray__\":\"AAAAAAAAAAAAAAAAAAAAAAAAAAAAgEtAAAAAAACAS0DNzMzMzCxTQB+F61G4DlRAUrgeheuRP0CPwvUoXA8/QAAAAAAAgERAAAAAAAAARECuR+F6FK4yQMP1KFyPgjJA\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[12]},\"p99\":{\"__ndarray__\":\"pHA9CtcjSkAAAAAAAAAAAAAAAAAAAExAAAAAAAAATEDNzMzMzCxYQI/C9Shcn1hA9ihcj8I1QEBxPQrXo7A/QAAAAAAAgERAAAAAAACAREB7FK5H4Xo6QArXo3A9ijtA\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[12]},\"unit\":[\"bytes\",\"bytes\",\"percentage\",\"percentage\",\"percentage\",\"percentage\",\"percentage\",\"percentage\",\"percentage\",\"percentage\",\"percentage\",\"percentage\"]},\"selected\":{\"id\":\"1083\"},\"selection_policy\":{\"id\":\"1082\"}},\"id\":\"1047\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1069\",\"type\":\"StringEditor\"},{\"attributes\":{\"editor\":{\"id\":\"1075\"},\"field\":\"p99\",\"formatter\":{\"id\":\"1074\"},\"title\":\"p99\"},\"id\":\"1052\",\"type\":\"TableColumn\"},{\"attributes\":{},\"id\":\"1078\",\"type\":\"StringFormatter\"},{\"attributes\":{},\"id\":\"1077\",\"type\":\"StringEditor\"},{\"attributes\":{\"source\":{\"id\":\"1047\"}},\"id\":\"1057\",\"type\":\"CDSView\"},{\"attributes\":{\"text\":\"The 95th percentile of the total GPU utilization on node algo-1 is only 55%. 
\\n However, the 95th percentile of the total CPU utilization is 76%. GPUs on node algo-1 are underutilized, \\n likely because of CPU bottlenecks.The 95th percentile of the total GPU utilization on node algo-2 is only 55%. \\n However, the 95th percentile of the total CPU utilization is 80%. GPUs on node algo-2 are underutilized, \\n likely because of CPU bottlenecks.\",\"width\":1100},\"id\":\"1045\",\"type\":\"Paragraph\"},{\"attributes\":{},\"id\":\"1083\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1073\",\"type\":\"StringEditor\"},{\"attributes\":{\"children\":[{\"id\":\"1056\"}]},\"id\":\"1058\",\"type\":\"Row\"},{\"attributes\":{},\"id\":\"1082\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"text\":\"The following table shows statistics of resource utilization per worker (node), \\n such as the total CPU and GPU utilization, and the memory utilization on CPU and GPU. \\n The table also includes the total I/O wait time and the total amount of data sent or received in bytes.\\n The table shows min and max values as well as p99, p90 and p50 
percentiles.\",\"width\":900},\"id\":\"1046\",\"type\":\"Paragraph\"},{\"attributes\":{},\"id\":\"1070\",\"type\":\"StringFormatter\"},{\"attributes\":{},\"id\":\"1067\",\"type\":\"StringEditor\"},{\"attributes\":{\"columns\":[{\"id\":\"1048\"},{\"id\":\"1049\"},{\"id\":\"1050\"},{\"id\":\"1051\"},{\"id\":\"1052\"},{\"id\":\"1053\"},{\"id\":\"1054\"},{\"id\":\"1055\"}],\"height\":360,\"source\":{\"id\":\"1047\"},\"view\":{\"id\":\"1057\"},\"width\":800},\"id\":\"1056\",\"type\":\"DataTable\"},{\"attributes\":{},\"id\":\"1081\",\"type\":\"StringEditor\"},{\"attributes\":{\"editor\":{\"id\":\"1079\"},\"field\":\"p50\",\"formatter\":{\"id\":\"1078\"},\"title\":\"p50\"},\"id\":\"1054\",\"type\":\"TableColumn\"},{\"attributes\":{\"editor\":{\"id\":\"1071\"},\"field\":\"unit\",\"formatter\":{\"id\":\"1070\"},\"title\":\"unit\"},\"id\":\"1050\",\"type\":\"TableColumn\"},{\"attributes\":{},\"id\":\"1072\",\"type\":\"StringFormatter\"},{\"attributes\":{},\"id\":\"1066\",\"type\":\"StringFormatter\"}],\"root_ids\":[\"1059\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"9991a4c1-c273-472d-a7a5-448d291501fa\",\"root_ids\":[\"1059\"],\"roots\":{\"1059\":\"dcd991e7-75f3-4a32-b03b-f2c36e7a5d23\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + 
"id": "1059" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "pd.set_option('display.float_format', lambda x: '%.2f' % x)\n", + "rows = [] \n", + "units = {\"CPU\": \"percentage\", \"CPU memory\": \"percentage\", \"GPU\": \"percentage\", \"Network\": \"bytes\", \"GPU memory\": \"percentage\", \"I/O\": \"percentage\"}\n", + "if report:\n", + " for metric in report['Details']:\n", + " for node_id in report['Details'][metric]:\n", + " values = report['Details'][metric][node_id]\n", + " rows.append([node_id, metric, units[metric], values['max'], values['p99'], values['p95'], values['p50'], values['min']])\n", + "\n", + " df = pd.DataFrame(rows) \n", + " df.columns = ['Node', 'metric', 'unit', 'max', 'p99', 'p95', 'p50', 'min']\n", + " df2 = df.reset_index()\n", + " source = ColumnDataSource(data=df2)\n", + " columns = [TableColumn(field='Node', title=\"node\"),\n", + " TableColumn(field='metric', title=\"metric\"),\n", + " TableColumn(field='unit', title=\"unit\"),\n", + " TableColumn(field='max', title=\"max\"),\n", + " TableColumn(field='p99', title=\"p99\"),\n", + " TableColumn(field='p95', title=\"p95\"),\n", + " TableColumn(field='p50', title=\"p50\"),\n", + " TableColumn(field='min', title=\"min\"),]\n", + " table = DataTable(source=source, columns=columns, width=800, height=df2.shape[0]*30)\n", + "\n", + " show(column( text1, text2, row(table)))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:47.364443Z", + "iopub.status.busy": "2023-04-10T21:36:47.363907Z", + "iopub.status.idle": "2023-04-10T21:36:47.399182Z", + "shell.execute_reply": "2023-04-10T21:36:47.399567Z" + }, + "papermill": { + "duration": 0.071045, + "end_time": "2023-04-10T21:36:47.399711", + "exception": false, + "start_time": "2023-04-10T21:36:47.328666", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "## Framework 
metrics summary" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"0b29b1bf-af40-43f6-8e42-8cd06ee83d36\":{\"roots\":{\"references\":[{\"attributes\":{\"children\":[{\"id\":\"1219\"},{\"id\":\"1220\"}]},\"id\":\"1221\",\"type\":\"Column\"},{\"attributes\":{\"data_source\":{\"id\":\"1204\"},\"glyph\":{\"id\":\"1206\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1207\"},\"selection_glyph\":null,\"view\":{\"id\":\"1209\"}},\"id\":\"1208\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"end_angle\":{\"expr\":{\"id\":\"1203\"},\"units\":\"rad\"},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"field\":\"color\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"radius\":{\"units\":\"data\",\"value\":0.3},\"start_angle\":{\"expr\":{\"id\":\"1202\"},\"units\":\"rad\"},\"x\":{\"value\":0},\"y\":{\"value\":0.0}},\"id\":\"1207\",\"type\":\"Wedge\"},{\"attributes\":{\"data\":{\"angle\":{\"__ndarray__\":\"U1ywWk97Fj9o0vTY5CEZQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[2]},\"color\":[\"#440154\",\"#FDE724\"],\"index\":[0,1],\"phase\":[\"DataLoaderIterInitialize\",\"DataLoaderIter\"],\"value\":{\"__ndarray__\":\"9i6wpulcVj9PWRaj6f9YQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[2]}},\"selected\":{\"id\":\"1216\"},\"selection_policy\":{\"id\":\"1215\"}},\"id\":\"1204\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1193\"},{\"id\":\"1194\"},{\"id\":\"1195\"},{\"id\":\"1196\"}]},\"id\":\"1197\",\"type\":\"Toolbar\"},{\"attributes\":{\"field\":\"angle\"},\"id\":\"1203\",\"type\":\"CumSum\"},{\"attributes\":{\"field\":\"angle\",\"include_zero\":true},\"id\":\"1202\",\"type\":\"CumSum\"},{\"attributes\":{\"width\":1100
},\"id\":\"1219\",\"type\":\"Paragraph\"},{\"attributes\":{\"axis_label\":null,\"formatter\":{\"id\":\"1214\"},\"ticker\":{\"id\":\"1186\"},\"visible\":false},\"id\":\"1185\",\"type\":\"LinearAxis\"},{\"attributes\":{\"callback\":null,\"tooltips\":\"@phase:@value\"},\"id\":\"1193\",\"type\":\"HoverTool\"},{\"attributes\":{\"end_angle\":{\"expr\":{\"id\":\"1203\"},\"units\":\"rad\"},\"fill_color\":{\"field\":\"color\"},\"line_color\":{\"value\":\"white\"},\"radius\":{\"units\":\"data\",\"value\":0.3},\"start_angle\":{\"expr\":{\"id\":\"1202\"},\"units\":\"rad\"},\"x\":{\"value\":0},\"y\":{\"value\":0.0}},\"id\":\"1206\",\"type\":\"Wedge\"},{\"attributes\":{},\"id\":\"1194\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"items\":[{\"id\":\"1218\"}],\"label_text_font_size\":\"8pt\",\"location\":\"center_right\"},\"id\":\"1217\",\"type\":\"Legend\"},{\"attributes\":{},\"id\":\"1195\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"1216\",\"type\":\"Selection\"},{\"attributes\":{\"end\":0.8999999999999999,\"start\":-0.5},\"id\":\"1177\",\"type\":\"Range1d\"},{\"attributes\":{},\"id\":\"1212\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"below\":[{\"id\":\"1185\"}],\"center\":[{\"id\":\"1188\"},{\"id\":\"1192\"},{\"id\":\"1217\"}],\"left\":[{\"id\":\"1189\"}],\"outline_line_color\":\"white\",\"plot_height\":350,\"renderers\":[{\"id\":\"1208\"}],\"title\":{\"id\":\"1175\"},\"toolbar\":{\"id\":\"1197\"},\"x_range\":{\"id\":\"1177\"},\"x_scale\":{\"id\":\"1181\"},\"y_range\":{\"id\":\"1179\"},\"y_scale\":{\"id\":\"1183\"}},\"id\":\"1174\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"1196\",\"type\":\"PanTool\"},{\"attributes\":{\"text\":\"General framework 
operations\"},\"id\":\"1175\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"1181\",\"type\":\"LinearScale\"},{\"attributes\":{\"axis\":{\"id\":\"1185\"},\"grid_line_color\":null,\"ticker\":null},\"id\":\"1188\",\"type\":\"Grid\"},{\"attributes\":{\"source\":{\"id\":\"1204\"}},\"id\":\"1209\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1186\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1183\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"1215\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"children\":[{\"id\":\"1174\"}]},\"id\":\"1220\",\"type\":\"Row\"},{\"attributes\":{},\"id\":\"1214\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"label\":{\"field\":\"phase\"},\"renderers\":[{\"id\":\"1208\"}]},\"id\":\"1218\",\"type\":\"LegendItem\"},{\"attributes\":{},\"id\":\"1190\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis\":{\"id\":\"1189\"},\"dimension\":1,\"grid_line_color\":null,\"ticker\":null},\"id\":\"1192\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"1179\",\"type\":\"DataRange1d\"},{\"attributes\":{\"axis_label\":null,\"formatter\":{\"id\":\"1212\"},\"ticker\":{\"id\":\"1190\"},\"visible\":false},\"id\":\"1189\",\"type\":\"LinearAxis\"}],\"root_ids\":[\"1221\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"0b29b1bf-af40-43f6-8e42-8cd06ee83d36\",\"root_ids\":[\"1221\"],\"roots\":{\"1221\":\"482249a8-e335-448c-9979-eb90cea5eee1\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is 
missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1221" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "report = load_report('OverallFrameworkMetrics')\n", + "if report:\n", + " if 'Details' in report:\n", + "\n", + " display(Markdown(f\"\"\"## Framework metrics summary\"\"\"))\n", + " plots = []\n", + " text = ''\n", + " if 'phase' in report['Details']:\n", + " text = f\"\"\"The following two pie charts show the time spent on the TRAIN phase, the EVAL phase, \n", + " and others. The 'others' includes the time spent between steps (after one step has finished and before\n", + " the next step has started). Ideally, most of the training time should be spent on the \n", + " TRAIN and EVAL phases. If TRAIN/EVAL were not specified in the training script, steps will be recorded as \n", + " GLOBAL.\"\"\"\n", + "\n", + " if 'others' in report['Details']['phase']:\n", + " others = float(report['Details']['phase']['others'])\n", + "\n", + " if others > 25:\n", + " text = f\"\"\"{text} Your training job spent quite a significant amount of time ({round(others,2)}%) in phase \"others\".\n", + " You should check what is happening in between the steps.\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['phase'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"The ratio between the time spent on the TRAIN/EVAL phase and others\")\n", + " plots.append(plot)\n", + "\n", + " if 'forward_backward' in report['Details']:\n", + "\n", + " event = max(report['Details']['forward_backward'], key=report['Details']['forward_backward'].get)\n", + " perc = report['Details']['forward_backward'][event]\n", + "\n", + " text = f\"\"\"{text} The pie chart on the right shows a more detailed breakdown. 
\n", + " It shows that {int(perc)}% of the time was spent in event \"{event}\".\"\"\"\n", + "\n", + " if perc > 70:\n", + " text = f\"\"\"There is quite a significant difference between the time spent on forward and backward\n", + " pass.\"\"\"\n", + " else:\n", + " text = f\"\"\"{text} It shows that {int(perc)}% of the training time\n", + " was spent on \"{event}\".\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['forward_backward'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"The ratio between forward and backward pass\") \n", + " plots.append(plot)\n", + "\n", + " if len(plots) > 0:\n", + " paragraph = Paragraph(text=text, width=1100)\n", + " show(column(paragraph, row(plots)))\n", + "\n", + " plots = []\n", + " text=''\n", + " if 'ratio' in report['Details'] and len(report['Details']['ratio']) > 0:\n", + "\n", + " key = list(report['Details']['ratio'].keys())[0]\n", + " ratio = report['Details']['ratio'][key]\n", + "\n", + " text = f\"\"\"The following piechart shows a breakdown of the CPU/GPU operators. 
\n", + " It shows that {int(ratio)}% of training time was spent on executing the \"{key}\" operator.\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['ratio'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"The ratio between the time spent on CPU/GPU operators\")\n", + " plots.append(plot)\n", + "\n", + "\n", + " if 'general' in report['Details']:\n", + " event = max(report['Details']['general'], key=report['Details']['general'].get)\n", + " perc = report['Details']['general'][event]\n", + "\n", + " plot = create_piechart(report['Details']['general'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"General framework operations\")\n", + " plots.append(plot)\n", + "\n", + " if len(plots) > 0:\n", + " paragraph = Paragraph(text=text, width=1100)\n", + " show(column(paragraph, row(plots)))\n", + "\n", + " plots = []\n", + " text = ''\n", + " if 'horovod' in report['Details']:\n", + " display(Markdown(f\"\"\"#### Overview: Horovod metrics\"\"\"))\n", + " event = max(report['Details']['horovod'], key=report['Details']['horovod'].get)\n", + " perc = report['Details']['horovod'][event]\n", + " text = f\"\"\"{text} The following pie chart shows a detailed breakdown of the Horovod metrics profiled\n", + " from your training job. 
The most expensive function was \"{event}\" with {int(perc)}%.\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['horovod'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"Horovod metrics \")\n", + "\n", + " paragraph = Paragraph(text=text, width=1100)\n", + " show(column(paragraph, row(plot)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:47.452274Z", + "iopub.status.busy": "2023-04-10T21:36:47.451566Z", + "iopub.status.idle": "2023-04-10T21:36:47.453705Z", + "shell.execute_reply": "2023-04-10T21:36:47.454102Z" + }, + "papermill": { + "duration": 0.03243, + "end_time": "2023-04-10T21:36:47.454233", + "exception": false, + "start_time": "2023-04-10T21:36:47.421803", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "pd.set_option('display.float_format', lambda x: '%.2f' % x)\n", + "rows = [] \n", + "values = []\n", + "if report:\n", + " if 'CPU_total' in report['Details']:\n", + " display(Markdown(f\"\"\"#### Overview: CPU operators\"\"\"))\n", + " event = max(report['Details']['CPU'], key=report['Details']['CPU'].get)\n", + " perc = report['Details']['CPU'][event]\n", + "\n", + " for function in report['Details']['CPU']:\n", + " percentage = round(report['Details']['CPU'][function],2)\n", + " time = report['Details']['CPU_total'][function] \n", + " rows.append([percentage, time, function])\n", + "\n", + " df = pd.DataFrame(rows) \n", + " df.columns = ['percentage', 'time', 'operator']\n", + "\n", + " df = df.sort_values(by=['percentage'], ascending=False)\n", + " source = ColumnDataSource(data=df)\n", + " columns = [TableColumn(field='percentage', title=\"Percentage\"),\n", + " TableColumn(field='time', title=\"Cumulative time in microseconds\"),\n", + " TableColumn(field='operator', title=\"CPU operator\"),]\n", + "\n", + " table = 
DataTable(source=source, columns=columns, width=550, height=350)\n", + "\n", + " text = Paragraph(text=f\"\"\"The following table shows a list of operators that ran on the CPUs.\n", + " The most expensive operator on the CPUs was \"{event}\" with {int(perc)} %.\"\"\")\n", + "\n", + " plot = create_piechart(report['Details']['CPU'],\n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " )\n", + "\n", + " show(column(text, row(table, plot)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:47.504784Z", + "iopub.status.busy": "2023-04-10T21:36:47.504159Z", + "iopub.status.idle": "2023-04-10T21:36:47.506200Z", + "shell.execute_reply": "2023-04-10T21:36:47.506592Z" + }, + "papermill": { + "duration": 0.031593, + "end_time": "2023-04-10T21:36:47.506718", + "exception": false, + "start_time": "2023-04-10T21:36:47.475125", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "pd.set_option('display.float_format', lambda x: '%.2f' % x)\n", + "rows = [] \n", + "values = []\n", + "if report:\n", + " if 'GPU_total' in report['Details']:\n", + " display(Markdown(f\"\"\"#### Overview: GPU operators\"\"\"))\n", + " event = max(report['Details']['GPU'], key=report['Details']['GPU'].get)\n", + " perc = report['Details']['GPU'][event]\n", + "\n", + " for function in report['Details']['GPU']:\n", + " percentage = round(report['Details']['GPU'][function],2)\n", + " time = report['Details']['GPU_total'][function] \n", + " rows.append([percentage, time, function])\n", + "\n", + " df = pd.DataFrame(rows) \n", + " df.columns = ['percentage', 'time', 'operator']\n", + "\n", + " df = df.sort_values(by=['percentage'], ascending=False)\n", + " source = ColumnDataSource(data=df)\n", + " columns = [TableColumn(field='percentage', title=\"Percentage\"),\n", + " TableColumn(field='time', title=\"Cumulative time in 
microseconds\"),\n", + " TableColumn(field='operator', title=\"GPU operator\"),]\n", + " table = DataTable(source=source, columns=columns, width=450, height=350)\n", + "\n", + " text = Paragraph(text=f\"\"\"The following table shows a list of operators that your training job ran on GPU.\n", + " The most expensive operator on GPU was \"{event}\" with {int(perc)} %\"\"\")\n", + "\n", + " plot = create_piechart(report['Details']['GPU'],\n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " )\n", + "\n", + " show(column(text, row(table, plot)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.021054, + "end_time": "2023-04-10T21:36:47.548917", + "exception": false, + "start_time": "2023-04-10T21:36:47.527863", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Rules summary" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:47.597767Z", + "iopub.status.busy": "2023-04-10T21:36:47.597040Z", + "iopub.status.idle": "2023-04-10T21:36:47.599102Z", + "shell.execute_reply": "2023-04-10T21:36:47.599489Z" + }, + "papermill": { + "duration": 0.029322, + "end_time": "2023-04-10T21:36:47.599620", + "exception": false, + "start_time": "2023-04-10T21:36:47.570298", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "description = {}\n", + "description['CPUBottleneck'] = 'Checks if the CPU utilization is high and the GPU utilization is low. \\\n", + "It might indicate CPU bottlenecks, where the GPUs are waiting for data to arrive \\\n", + "from the CPUs. The rule evaluates the CPU and GPU utilization rates, and triggers the issue \\\n", + "if the time spent on the CPU bottlenecks exceeds a threshold percent of the total training time. 
The default threshold is 50 percent.'\n", + "description['IOBottleneck'] = 'Checks if the data I/O wait time is high and the GPU utilization is low. \\\n", + "It might indicate IO bottlenecks where GPU is waiting for data to arrive from storage. \\\n", + "The rule evaluates the I/O and GPU utilization rates and triggers the issue \\\n", + "if the time spent on the IO bottlenecks exceeds a threshold percent of the total training time. The default threshold is 50 percent.'\n", + "description['Dataloader'] = 'Checks how many data loaders are running in parallel and whether the total number is equal the number \\\n", + "of available CPU cores. The rule triggers if number is much smaller or larger than the number of available cores. \\\n", + "If too small, it might lead to low GPU utilization. If too large, it might impact other compute intensive operations on CPU.'\n", + "description['GPUMemoryIncrease'] = 'Measures the average GPU memory footprint and triggers if there is a large increase.'\n", + "description['BatchSize'] = 'Checks if GPUs are underutilized because the batch size is too small. \\\n", + "To detect this problem, the rule analyzes the average GPU memory footprint, \\\n", + "the CPU and the GPU utilization. '\n", + "description['LowGPUUtilization'] = 'Checks if the GPU utilization is low or fluctuating. \\\n", + "This can happen due to bottlenecks, blocking calls for synchronizations, \\\n", + "or a small batch size.'\n", + "description['MaxInitializationTime'] = 'Checks if the time spent on initialization exceeds a threshold percent of the total training time. \\\n", + "The rule waits until the first step of training loop starts. The initialization can take longer \\\n", + "if downloading the entire dataset from Amazon S3 in File mode. The default threshold is 20 minutes.'\n", + "description['LoadBalancing'] = 'Detects workload balancing issues across GPUs. \\\n", + "Workload imbalance can occur in training jobs with data parallelism. 
\\\n", + "The gradients are accumulated on a primary GPU, and this GPU might be overused \\\n", + "with regard to other GPUs, resulting in reducing the efficiency of data parallelization.'\n", + "description['StepOutlier'] = 'Detects outliers in step duration. The step duration for forward and backward pass should be \\\n", + "roughly the same throughout the training. If there are significant outliers, \\\n", + "it may indicate a system stall or bottleneck issues.'" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:47.646756Z", + "iopub.status.busy": "2023-04-10T21:36:47.646126Z", + "iopub.status.idle": "2023-04-10T21:36:47.648525Z", + "shell.execute_reply": "2023-04-10T21:36:47.648115Z" + }, + "papermill": { + "duration": 0.027979, + "end_time": "2023-04-10T21:36:47.648630", + "exception": false, + "start_time": "2023-04-10T21:36:47.620651", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "recommendation = {}\n", + "recommendation['CPUBottleneck'] = 'Consider increasing the number of data loaders \\\n", + "or applying data pre-fetching.'\n", + "recommendation['IOBottleneck'] = 'Pre-fetch data or choose different file formats, such as binary formats that \\\n", + "improve I/O performance.'\n", + "recommendation['Dataloader'] = 'Change the number of data loader processes.'\n", + "recommendation['GPUMemoryIncrease'] = 'Choose a larger instance type with more memory if footprint is close to maximum available memory.'\n", + "recommendation['BatchSize'] = 'The batch size is too small, and GPUs are underutilized. 
Consider running on a smaller instance type or increasing the batch size.'\n", + "recommendation['LowGPUUtilization'] = 'Check if there are bottlenecks, minimize blocking calls, \\\n", + "change distributed training strategy, or increase the batch size.'\n", + "recommendation['MaxInitializationTime'] = 'Initialization takes too long. \\\n", + "If using File mode, consider switching to Pipe mode in case you are using TensorFlow framework.'\n", + "recommendation['LoadBalancing'] = 'Choose a different distributed training strategy or \\\n", + "a different distributed training framework.'\n", + "recommendation['StepOutlier'] = 'Check if there are any bottlenecks (CPU, I/O) correlated to the step outliers.'" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:47.697707Z", + "iopub.status.busy": "2023-04-10T21:36:47.697187Z", + "iopub.status.idle": "2023-04-10T21:36:47.707998Z", + "shell.execute_reply": "2023-04-10T21:36:47.707591Z" + }, + "papermill": { + "duration": 0.038225, + "end_time": "2023-04-10T21:36:47.708099", + "exception": false, + "start_time": "2023-04-10T21:36:47.669874", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "The following table shows a profiling summary of the Debugger built-in rules. \n", + "The table is sorted by the rules that triggered the most frequently. During your training job, the LowGPUUtilization rule\n", + "was the most frequently triggered. It processed 1751 datapoints and was triggered 14 times." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DescriptionRecommendationNumber of times rule triggeredNumber of datapointsRule parameters
LowGPUUtilizationChecks if the GPU utilization is low or fluctuating. This can happen due to bottlenecks, blocking calls for synchronizations, or a small batch size.Check if there are bottlenecks, minimize blocking calls, change distributed training strategy, or increase the batch size.141751threshold_p95:70
threshold_p5:10
window:500
patience:1000
BatchSizeChecks if GPUs are underutilized because the batch size is too small. To detect this problem, the rule analyzes the average GPU memory footprint, the CPU and the GPU utilization.The batch size is too small, and GPUs are underutilized. Consider running on a smaller instance type or increasing the batch size.141750cpu_threshold_p95:70
gpu_threshold_p95:70
gpu_memory_threshold_p95:70
patience:1000
window:500
DataloaderChecks how many data loaders are running in parallel and whether the total number is equal the number of available CPU cores. The rule triggers if number is much smaller or larger than the number of available cores. If too small, it might lead to low GPU utilization. If too large, it might impact other compute intensive operations on CPU.Change the number of data loader processes.18373min_threshold:70
max_threshold:200
MaxInitializationTimeChecks if the time spent on initialization exceeds a threshold percent of the total training time. The rule waits until the first step of training loop starts. The initialization can take longer if downloading the entire dataset from Amazon S3 in File mode. The default threshold is 20 minutes.Initialization takes too long. If using File mode, consider switching to Pipe mode in case you are using TensorFlow framework.00threshold:20
GPUMemoryIncreaseMeasures the average GPU memory footprint and triggers if there is a large increase.Choose a larger instance type with more memory if footprint is close to maximum available memory.01751increase:5
patience:1000
window:10
CPUBottleneckChecks if the CPU utilization is high and the GPU utilization is low. It might indicate CPU bottlenecks, where the GPUs are waiting for data to arrive from the CPUs. The rule evaluates the CPU and GPU utilization rates, and triggers the issue if the time spent on the CPU bottlenecks exceeds a threshold percent of the total training time. The default threshold is 50 percent.Consider increasing the number of data loaders or applying data pre-fetching.03514threshold:50
cpu_threshold:90
gpu_threshold:10
patience:1000
LoadBalancingDetects workload balancing issues across GPUs. Workload imbalance can occur in training jobs with data parallelism. The gradients are accumulated on a primary GPU, and this GPU might be overused with regard to other GPUs, resulting in reducing the efficiency of data parallelization.Choose a different distributed training strategy or a different distributed training framework.01751threshold:0.2
patience:1000
IOBottleneckChecks if the data I/O wait time is high and the GPU utilization is low. It might indicate IO bottlenecks where GPU is waiting for data to arrive from storage. The rule evaluates the I/O and GPU utilization rates and triggers the issue if the time spent on the IO bottlenecks exceeds a threshold percent of the total training time. The default threshold is 50 percent.Pre-fetch data or choose different file formats, such as binary formats that improve I/O performance.03514threshold:50
io_threshold:50
gpu_threshold:10
patience:1000
StepOutlierDetects outliers in step duration. The step duration for forward and backward pass should be roughly the same throughout the training. If there are significant outliers, it may indicate a system stall or bottleneck issues.Check if there are any bottlenecks (CPU, I/O) correlated to the step outliers.00threshold:3
mode:None
n_outliers:10
stddev:3
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "files = glob.glob('/opt/ml/processing/output/rule/profiler-output/profiler-reports/*json')\n", + "summary = {}\n", + "for i in files:\n", + " rule_name = i.split('/')[-1].replace('.json','')\n", + " if rule_name == \"OverallSystemUsage\" or rule_name == \"OverallFrameworkMetrics\":\n", + " continue\n", + " rule_report = json.load(open(i))\n", + " summary[rule_name] = {}\n", + " summary[rule_name]['Description'] = description[rule_name]\n", + " summary[rule_name]['Recommendation'] = recommendation[rule_name]\n", + " summary[rule_name]['Number of times rule triggered'] = rule_report['RuleTriggered'] \n", + " #summary[rule_name]['Number of violations'] = rule_report['Violations'] \n", + " summary[rule_name]['Number of datapoints'] = rule_report['Datapoints']\n", + " summary[rule_name]['Rule parameters'] = rule_report['RuleParameters']\n", + "\n", + "df = pd.DataFrame.from_dict(summary, orient='index')\n", + "df = df.sort_values(by=['Number of times rule triggered'], ascending=False)\n", + "\n", + "\n", + "display(Markdown(f\"\"\"The following table shows a profiling summary of the Debugger built-in rules. \n", + "The table is sorted by the rules that triggered the most frequently. During your training job, the {df.index[0]} rule\n", + "was the most frequently triggered. 
It processed {df.values[0,3]} datapoints and was triggered {df.values[0,2]} times.\"\"\"))\n", + "\n", + "with pd.option_context('display.colheader_justify','left'): \n", + " pretty_print(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:47.758254Z", + "iopub.status.busy": "2023-04-10T21:36:47.757747Z", + "iopub.status.idle": "2023-04-10T21:36:47.760140Z", + "shell.execute_reply": "2023-04-10T21:36:47.760517Z" + }, + "papermill": { + "duration": 0.030389, + "end_time": "2023-04-10T21:36:47.760640", + "exception": false, + "start_time": "2023-04-10T21:36:47.730251", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "## Analyzing the training loop\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "analyse_phase = \"training\"\n", + "if job_statistics and \"initialization_in_seconds\" in job_statistics:\n", + " if job_statistics[\"initialization_in_seconds\"] > job_statistics[\"training_loop_duration_in_seconds\"]:\n", + " analyse_phase = \"initialization\"\n", + " time = job_statistics[\"initialization_in_seconds\"]\n", + " perc = job_statistics[\"initialization_%\"]\n", + " display(Markdown(f\"\"\"The initialization phase took {int(time)} seconds, which is {int(perc)}%*\n", + " of the total training time. 
Since the training loop has taken the most time, \n", + " we dive deep into the events occurring during this phase\"\"\"))\n", + " display(Markdown(\"\"\"## Analyzing initialization\\n\\n\"\"\"))\n", + " time = job_statistics[\"training_loop_duration_in_seconds\"]\n", + " perc = job_statistics[\"training_loop_%\"]\n", + " display(Markdown(f\"\"\"The training loop lasted for {int(time)} seconds which is {int(perc)}% of the training job time.\n", + " Since the training loop has taken the most time, we dive deep into the events occured during this phase.\"\"\"))\n", + "if analyse_phase == 'training':\n", + " display(Markdown(\"\"\"## Analyzing the training loop\\n\\n\"\"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:47.809858Z", + "iopub.status.busy": "2023-04-10T21:36:47.809280Z", + "iopub.status.idle": "2023-04-10T21:36:47.810979Z", + "shell.execute_reply": "2023-04-10T21:36:47.811369Z" + }, + "papermill": { + "duration": 0.028294, + "end_time": "2023-04-10T21:36:47.811491", + "exception": false, + "start_time": "2023-04-10T21:36:47.783197", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "if analyse_phase == \"initialization\":\n", + " display(Markdown(\"\"\"### MaxInitializationTime\\n\\nThis rule helps to detect if the training initialization is taking too much time. \\nThe rule waits until first step is available. The rule takes the parameter `threshold` that defines how many minutes to wait for the first step to become available. 
Default is 20 minutes.\\nYou can run the rule locally in the following way:\n", + " \"\"\"))\n", + " \n", + " _ = load_report(\"MaxInitializationTime\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:47.881275Z", + "iopub.status.busy": "2023-04-10T21:36:47.859609Z", + "iopub.status.idle": "2023-04-10T21:36:47.891669Z", + "shell.execute_reply": "2023-04-10T21:36:47.892049Z" + }, + "papermill": { + "duration": 0.058317, + "end_time": "2023-04-10T21:36:47.892174", + "exception": false, + "start_time": "2023-04-10T21:36:47.833857", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "### Step duration analysis" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"f81a27ae-4de5-4eaf-8d7e-5f8b64b81710\":{\"roots\":{\"references\":[{\"attributes\":{\"children\":[{\"id\":\"1288\"}]},\"id\":\"1289\",\"type\":\"Column\"},{\"attributes\":{\"text\":\"The StepOutlier rule measures step durations and checks for outliers. The rule \\n returns True if duration is larger than 3 times the standard deviation. The rule \\n also takes the parameter mode, that specifies whether steps from training or validation phase \\n should be checked. In your processing job mode was specified as None. \\n Typically the first step is taking significantly more time and to avoid the \\n rule triggering immediately, one can use n_outliers to specify the number of outliers to ignore. \\n n_outliers was set to 10.\\n The rule analysed 0 datapoints and triggered 0 times.\\n \",\"width\":900},\"id\":\"1288\",\"type\":\"Paragraph\"}],\"root_ids\":[\"1289\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"f81a27ae-4de5-4eaf-8d7e-5f8b64b81710\",\"root_ids\":[\"1289\"],\"roots\":{\"1289\":\"672a69f0-9e5f-42ed-9fc3-e20e7bd1182f\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + 
"metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1289" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "if analyse_phase == \"training\":\n", + " display(Markdown(\"\"\"### Step duration analysis\"\"\"))\n", + " report = load_report('StepOutlier')\n", + " if report:\n", + " parameters = report['RuleParameters']\n", + " params = report['RuleParameters'].split('\\n')\n", + " stddev = params[3].split(':')[1]\n", + " mode = params[1].split(':')[1]\n", + " n_outlier = params[2].split(':')[1]\n", + " triggered = report['RuleTriggered']\n", + " datapoints = report['Datapoints']\n", + "\n", + " text = f\"\"\"The StepOutlier rule measures step durations and checks for outliers. The rule \n", + " returns True if duration is larger than {stddev} times the standard deviation. The rule \n", + " also takes the parameter mode, that specifies whether steps from training or validation phase \n", + " should be checked. In your processing job mode was specified as {mode}. \n", + " Typically the first step is taking significantly more time and to avoid the \n", + " rule triggering immediately, one can use n_outliers to specify the number of outliers to ignore. 
\n", + " n_outliers was set to {n_outlier}.\n", + " The rule analysed {datapoints} datapoints and triggered {triggered} times.\n", + " \"\"\"\n", + "\n", + " paragraph = Paragraph(text=text, width=900)\n", + " show(column(paragraph))\n", + "\n", + " if report and len(report['Details']['step_details']) > 0:\n", + " for node_id in report['Details']['step_details']:\n", + " tmp = report['RuleParameters'].split('threshold:')\n", + " threshold = tmp[1].split('\\n')[0]\n", + " n_outliers = report['Details']['step_details'][node_id]['number_of_outliers']\n", + " mean = report['Details']['step_details'][node_id]['step_stats']['mean']\n", + " stddev = report['Details']['step_details'][node_id]['stddev']\n", + " phase = report['Details']['step_details'][node_id]['phase']\n", + " display(Markdown(f\"\"\"**Step durations on node {node_id}:**\"\"\"))\n", + " display(Markdown(f\"\"\"The following table is a summary of the statistics of step durations measured on node {node_id}.\n", + " The rule has analyzed the step duration from {phase} phase.\n", + " The average step duration on node {node_id} was {round(mean, 2)}s. \n", + " The rule detected {n_outliers} outliers, where step duration was larger than {threshold} times the standard deviation of {stddev}s\n", + " \\n\"\"\"))\n", + " step_stats_df = pd.DataFrame.from_dict(report['Details']['step_details'][node_id]['step_stats'], orient='index').T\n", + " step_stats_df.index = ['Step Durations in [s]']\n", + " pretty_print(step_stats_df)\n", + "\n", + " display(Markdown(f\"\"\"The following histogram shows the step durations measured on the different nodes. 
\n", + " You can turn on or turn off the visualization of histograms by selecting or unselecting the labels in the legend.\"\"\"))\n", + "\n", + " plot = figure(plot_height=450, \n", + " plot_width=850, \n", + " title=f\"\"\"Step durations\"\"\") \n", + "\n", + " colors = bokeh.palettes.viridis(len(report['Details']['step_details']))\n", + "\n", + " for index, node_id in enumerate(report['Details']['step_details']):\n", + " probs = report['Details']['step_details'][node_id]['probs']\n", + " binedges = report['Details']['step_details'][node_id]['binedges']\n", + "\n", + " plot.quad( top=probs,\n", + " bottom=0,\n", + " left=binedges[:-1],\n", + " right=binedges[1:],\n", + " line_color=\"white\",\n", + " fill_color=colors[index],\n", + " fill_alpha=0.7,\n", + " legend=node_id)\n", + "\n", + " plot.add_layout(Legend(), 'right') \n", + " plot.y_range.start = 0\n", + " plot.xaxis.axis_label = f\"\"\"Step durations in [s]\"\"\"\n", + " plot.yaxis.axis_label = \"Occurrences\"\n", + " plot.grid.grid_line_color = \"white\"\n", + " plot.legend.click_policy=\"hide\"\n", + " plot.legend.location = 'center_right'\n", + " show(plot)\n", + "\n", + " if report['RuleTriggered'] > 0:\n", + "\n", + " text=f\"\"\"To get a better understanding of what may have caused those outliers,\n", + " we correlate the timestamps of step outliers with other framework metrics that happened at the same time.\n", + " The left chart shows how much time was spent in the different framework\n", + " metrics aggregated by event phase. The chart on the right shows the histogram of normal step durations (without\n", + " outliers). The following chart shows how much time was spent in the different \n", + " framework metrics when step outliers occurred. 
In this chart framework metrics are not aggregated byphase.\"\"\"\n", + " plots = []\n", + " if 'phase' in report['Details']:\n", + " text = f\"\"\"{text} The chart (in the middle) shows whether step outliers mainly happened during TRAIN or EVAL phase.\n", + " \"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['phase'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"The ratio between the time spent on the TRAIN/EVAL phase\")\n", + " plots.append(plot)\n", + "\n", + " if 'forward_backward' in report['Details'] and len(report['Details']['forward_backward']) > 0:\n", + "\n", + " event = max(report['Details']['forward_backward'], key=report['Details']['forward_backward'].get)\n", + " perc = report['Details']['forward_backward'][event]\n", + "\n", + " text = f\"\"\"{text} The pie chart on the right shows a detailed breakdown. \n", + " It shows that {int(perc)}% of the training time was spent on event \"{event}\".\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['forward_backward'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"The Ratio between forward and backward pass\") \n", + " plots.append(plot)\n", + "\n", + " if len(plots) > 0:\n", + " paragraph = Paragraph(text=text, width=900)\n", + " show(column(paragraph, row(plots)))\n", + "\n", + " plots = []\n", + " text = \"\"\n", + " if 'ratio' in report['Details'] and len(report['Details']['ratio']) > 0:\n", + "\n", + " key = list(report['Details']['ratio'].keys())[0]\n", + " ratio = report['Details']['ratio'][key]\n", + "\n", + " text = f\"\"\"The following pie chart shows a breakdown of the CPU/GPU operators executed during the step outliers. 
\n", + " It shows that {int(ratio)}% of the training time was spent on executing operators in \"{key}\".\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['ratio'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"The ratio between CPU/GPU operators\")\n", + " plots.append(plot)\n", + "\n", + "\n", + " if 'general' in report['Details'] and len(report['Details']['general']) > 0:\n", + "\n", + " event = max(report['Details']['general'], key=report['Details']['general'].get)\n", + " perc = report['Details']['general'][event]\n", + "\n", + " plot = create_piechart(report['Details']['general'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"General metrics recorded in framework \")\n", + " plots.append(plot)\n", + "\n", + " if len(plots) > 0:\n", + " paragraph = Paragraph(text=text, width=900)\n", + " show(column(paragraph, row(plots)))\n", + "\n", + " plots = []\n", + " text = \"\"\n", + " if 'horovod' in report['Details'] and len(report['Details']['horovod']) > 0:\n", + "\n", + " event = max(report['Details']['horovod'], key=report['Details']['horovod'].get)\n", + " perc = report['Details']['horovod'][event]\n", + " text = f\"\"\"The following pie chart shows a detailed breakdown of the Horovod metrics that have been\n", + " recorded when step outliers happened. 
The most expensive function was {event} with {int(perc)}%\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['horovod'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"General metrics recorded in framework \")\n", + "\n", + " paragraph = Paragraph(text=text, width=900)\n", + " show(column(paragraph, row(plot))) " + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:47.957314Z", + "iopub.status.busy": "2023-04-10T21:36:47.953243Z", + "iopub.status.idle": "2023-04-10T21:36:48.083850Z", + "shell.execute_reply": "2023-04-10T21:36:48.084236Z" + }, + "papermill": { + "duration": 0.168081, + "end_time": "2023-04-10T21:36:48.084377", + "exception": false, + "start_time": "2023-04-10T21:36:47.916296", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "### GPU utilization analysis\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Usage per GPU** \n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"e3b12297-d3a6-4ed2-8ff5-98eb574356e4\":{\"roots\":{\"references\":[{\"attributes\":{\"text\":\"The LowGPUUtilization rule checks for a low and fluctuating GPU usage. If the GPU usage is \\n consistently low, it might be caused by bottlenecks or a small batch size. If usage is heavily \\n fluctuating, it can be due to bottlenecks or blocking calls. The rule computed the 95th and 5th \\n percentile of GPU utilization on 500 continuous datapoints and found 14 cases where \\n p95 was above 70% and p5 was below 10%. If p95 is high and p5 is low,\\n it might indicate that the GPU usage is highly fluctuating. If both values are very low, \\n it would mean that the machine is underutilized. During initialization, the GPU usage is likely zero, \\n so the rule skipped the first 1000 data points.\\n The rule analysed 1751 datapoints and triggered 14 times.\",\"width\":800},\"id\":\"1321\",\"type\":\"Paragraph\"}],\"root_ids\":[\"1321\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"e3b12297-d3a6-4ed2-8ff5-98eb574356e4\",\"root_ids\":[\"1321\"],\"roots\":{\"1321\":\"c0d8604f-05db-4313-a790-d2485bb89f08\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" 
+ ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1321" + } + }, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"ae87b283-1c99-4729-8c88-e76031f333d1\":{\"roots\":{\"references\":[{\"attributes\":{\"text\":\"Your training job is underutilizing the instance. You may want to consider\\n to either switch to a smaller instance type or to increase the batch size. \\n The last time that the LowGPUUtilization rule was triggered in your training job was on 04/10/2023 at 21:35:00.\\n The following boxplots are a snapshot from the timestamps. \\n They show the utilization per GPU (without outliers).\\n To get a better understanding of the workloads throughout the whole training,\\n you can check the workload histogram in the next section.\",\"width\":800},\"id\":\"1353\",\"type\":\"Paragraph\"}],\"root_ids\":[\"1353\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"ae87b283-1c99-4729-8c88-e76031f333d1\",\"root_ids\":[\"1353\"],\"roots\":{\"1353\":\"f502bd09-b4a6-42fc-91f3-79b6fdddb4e7\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1353" + } + }, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**GPU 
utilization of gpu0 on node algo-1:**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"35f32287-3ef9-4e7f-8400-0653832cfaf0\":{\"roots\":{\"references\":[{\"attributes\":{\"text\":\" The max utilization of gpu0 on node algo-1 was 56.0% and the 95th percentile was only 55.0%. \\n gpu0 on node algo-1 is underutilized and the 5th percentile was only 0.0% The difference between 5th percentile 0.0% and 95th percentile 55.0% is quite \\n significant, which means that utilization on gpu0 is fluctuating quite a lot.\\n\",\"width\":900},\"id\":\"1443\",\"type\":\"Paragraph\"}],\"root_ids\":[\"1443\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"35f32287-3ef9-4e7f-8400-0653832cfaf0\",\"root_ids\":[\"1443\"],\"roots\":{\"1443\":\"7b89cf4b-18f4-4c8c-bf1b-2655121746b2\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1443" + } + }, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"e4720530-8612-4535-a407-a4daff439305\":{\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"1396\"}],\"center\":[{\"id\":\"1399\"},{\"id\":\"1403\"}],\"left\":[{\"id\":\"1400\"}],\"plot_height\":350,\"plot_width\":1000,\"renderers\":[{\"id\":\"1416\"},{\"id\":\"1421\"},{\"id\":\"1426\"},{\"id\":\"1431\"},{\"id\":\"1436\"},{\"id\":\"1441\"}],\"title\":{\"id\":\"1386\"},\"toolbar\":{\"id\":\"1408\"},\"x_range\":{\"id\":\"1388\"},\"x_scale\":{\"id\":\"1392\"},\"y_range\":{\"id\":\"1390\"},\"y_scale\":{\"id\":\"1394\"}},\"id\":\"1385\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"1394\",\"type\":\"LinearScale\"},{\"attributes\":{\"bottom\":{\"value\":31.0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#440154\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":0.0},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"1430\",\"type\":\"VBar\"},{\"attributes\":{\"end\":17},\"id\":\"1388\",\"type\":\"Range1d\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":0.0}},\"id\":\"1434\",\"type\":\"Rect\"},{\"attributes\":{\"data_source\":{\"id\":\"1428\"},\"glyph\":{\"id\":\"1429\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1430\"},\"selection_glyph\":null,\"view\":{\"id\":\"1432\"}},\"id\":\"1431\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"axis_label\":\"Utilization in 
%\",\"formatter\":{\"id\":\"1510\"},\"ticker\":{\"id\":\"1401\"}},\"id\":\"1400\",\"type\":\"LinearAxis\"},{\"attributes\":{\"source\":{\"id\":\"1423\"}},\"id\":\"1427\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1510\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"source\":{\"id\":\"1413\"}},\"id\":\"1417\",\"type\":\"CDSView\"},{\"attributes\":{\"source\":{\"id\":\"1433\"}},\"id\":\"1437\",\"type\":\"CDSView\"},{\"attributes\":{\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":0.0},\"y1\":{\"value\":0.0}},\"id\":\"1419\",\"type\":\"Segment\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"1521\"},\"selection_policy\":{\"id\":\"1520\"}},\"id\":\"1433\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"data_source\":{\"id\":\"1413\"},\"glyph\":{\"id\":\"1414\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1415\"},\"selection_glyph\":null,\"view\":{\"id\":\"1417\"}},\"id\":\"1416\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"1515\"},\"selection_policy\":{\"id\":\"1514\"}},\"id\":\"1418\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":56.0},\"y1\":{\"value\":44.0}},\"id\":\"1414\",\"type\":\"Segment\"},{\"attributes\":{\"callback\":null},\"id\":\"1404\",\"type\":\"HoverTool\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":0.0}},\"id\":\"1435\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"1401\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1405\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"1514\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data_source\":{\"id\":\"1418\"},\"glyph\":{\"id\":\"1419\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1420\"},\"selection_g
lyph\":null,\"view\":{\"id\":\"1422\"}},\"id\":\"1421\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":56.0},\"y1\":{\"value\":44.0}},\"id\":\"1415\",\"type\":\"Segment\"},{\"attributes\":{},\"id\":\"1406\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"1523\",\"type\":\"Selection\"},{\"attributes\":{\"formatter\":{\"id\":\"1511\"},\"major_label_overrides\":{\"1\":\"gpu0\"},\"major_label_text_font_size\":\"10px\",\"ticker\":{\"id\":\"1475\"}},\"id\":\"1396\",\"type\":\"LinearAxis\"},{\"attributes\":{\"bottom\":{\"value\":44.0},\"fill_color\":{\"value\":\"#FDE725\"},\"top\":{\"value\":31.0},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"1424\",\"type\":\"VBar\"},{\"attributes\":{},\"id\":\"1515\",\"type\":\"Selection\"},{\"attributes\":{\"data_source\":{\"id\":\"1433\"},\"glyph\":{\"id\":\"1434\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1435\"},\"selection_glyph\":null,\"view\":{\"id\":\"1437\"}},\"id\":\"1436\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1512\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"1513\"},\"selection_policy\":{\"id\":\"1512\"}},\"id\":\"1413\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"source\":{\"id\":\"1418\"}},\"id\":\"1422\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1518\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1392\",\"type\":\"LinearScale\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"1517\"},\"selection_policy\":{\"id\":\"1516\"}},\"id\":\"1423\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1511\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1404\"},{\"id\":\"1405\"},{\"id\":\"1406\"},{\"id\":\"1407\"}]},\"id\":\"1408\",\"type\":\"Toolbar\"},{\"attr
ibutes\":{\"data\":{},\"selected\":{\"id\":\"1523\"},\"selection_policy\":{\"id\":\"1522\"}},\"id\":\"1438\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1513\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1390\",\"type\":\"DataRange1d\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":0.0},\"y1\":{\"value\":0.0}},\"id\":\"1420\",\"type\":\"Segment\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":56.0}},\"id\":\"1439\",\"type\":\"Rect\"},{\"attributes\":{\"ticks\":[0,1]},\"id\":\"1475\",\"type\":\"FixedTicker\"},{\"attributes\":{},\"id\":\"1517\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1522\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1407\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"1519\",\"type\":\"Selection\"},{\"attributes\":{\"axis\":{\"id\":\"1400\"},\"dimension\":1,\"grid_line_color\":\"white\",\"grid_line_width\":0,\"ticker\":null},\"id\":\"1403\",\"type\":\"Grid\"},{\"attributes\":{\"bottom\":{\"value\":44.0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#FDE725\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":31.0},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"1425\",\"type\":\"VBar\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":56.0}},\"id\":\"1440\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"1520\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"axis\":{\"id\":\"1396\"},\"grid_line_color\":null,\"grid_line_width\":0,\"ticker\":null},\"id\":\"1399\",\"type\":\"Grid\"},{\"attributes\":{\"source\":{\"id\":\"1438\"}},\"id\":\"1442\",\"type\":\"CDSView\"},{\"attributes\":{\"bottom
\":{\"value\":31.0},\"fill_color\":{\"value\":\"#440154\"},\"top\":{\"value\":0.0},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"1429\",\"type\":\"VBar\"},{\"attributes\":{\"data_source\":{\"id\":\"1423\"},\"glyph\":{\"id\":\"1424\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1425\"},\"selection_glyph\":null,\"view\":{\"id\":\"1427\"}},\"id\":\"1426\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data_source\":{\"id\":\"1438\"},\"glyph\":{\"id\":\"1439\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1440\"},\"selection_glyph\":null,\"view\":{\"id\":\"1442\"}},\"id\":\"1441\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1521\",\"type\":\"Selection\"},{\"attributes\":{\"source\":{\"id\":\"1428\"}},\"id\":\"1432\",\"type\":\"CDSView\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"1519\"},\"selection_policy\":{\"id\":\"1518\"}},\"id\":\"1428\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1516\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"text\":\"Node algo-1\"},\"id\":\"1386\",\"type\":\"Title\"}],\"root_ids\":[\"1385\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"e4720530-8612-4535-a407-a4daff439305\",\"root_ids\":[\"1385\"],\"roots\":{\"1385\":\"234d87cd-0c6c-4556-80ab-63453c62a670\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + 
"application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1385" + } + }, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**GPU utilization of gpu0 on node algo-2:**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"05b5f792-e68d-4203-bfff-9828141c4085\":{\"roots\":{\"references\":[{\"attributes\":{\"text\":\" The max utilization of gpu0 on node algo-2 was 56.0% and the 95th percentile was only 56.0%. \\n gpu0 on node algo-2 is underutilized and the 5th percentile was only 0.0% The difference between 5th percentile 0.0% and 95th percentile 56.0% is quite \\n significant, which means that utilization on gpu0 is fluctuating quite a lot.\\n\",\"width\":900},\"id\":\"1662\",\"type\":\"Paragraph\"}],\"root_ids\":[\"1662\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"05b5f792-e68d-4203-bfff-9828141c4085\",\"root_ids\":[\"1662\"],\"roots\":{\"1662\":\"ebbbd289-c2e5-49b6-ab75-49b024a5691c\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1662" + } + }, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"62a5bc9e-74f0-428a-806d-98336dc5a04e\":{\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"1615\"}],\"center\":[{\"id\":\"1618\"},{\"id\":\"1622\"}],\"left\":[{\"id\":\"1619\"}],\"plot_height\":350,\"plot_width\":1000,\"renderers\":[{\"id\":\"1635\"},{\"id\":\"1640\"},{\"id\":\"1645\"},{\"id\":\"1650\"},{\"id\":\"1655\"},{\"id\":\"1660\"}],\"title\":{\"id\":\"1605\"},\"toolbar\":{\"id\":\"1627\"},\"x_range\":{\"id\":\"1607\"},\"x_scale\":{\"id\":\"1611\"},\"y_range\":{\"id\":\"1609\"},\"y_scale\":{\"id\":\"1613\"}},\"id\":\"1604\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"1613\",\"type\":\"LinearScale\"},{\"attributes\":{\"bottom\":{\"value\":31.0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#440154\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":0.0},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"1649\",\"type\":\"VBar\"},{\"attributes\":{\"data_source\":{\"id\":\"1652\"},\"glyph\":{\"id\":\"1653\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1654\"},\"selection_glyph\":null,\"view\":{\"id\":\"1656\"}},\"id\":\"1655\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"formatter\":{\"id\":\"1762\"},\"major_label_overrides\":{\"1\":\"gpu0\"},\"major_label_text_font_size\":\"10px\",\"ticker\":{\"id\":\"1710\"}},\"id\":\"1615\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"1768\",\"type\":\"Selection\"},{\"attributes\":{\"bottom\":{\"value\":31.0},\"fill_color\":{\"value\":\"#440154\"},\"top\":{\"value\":0.0},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"1648\",\"type\":\"VBar\"},{\"attributes\":{\"source\":{\"id\":\"1652\"}},\"id\":\"1656\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1620\",\"type\":\"BasicTicker\"},{\"attributes\":{\"fill_al
pha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":0.0}},\"id\":\"1654\",\"type\":\"Rect\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"1774\"},\"selection_policy\":{\"id\":\"1773\"}},\"id\":\"1657\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1769\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":56.0}},\"id\":\"1658\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"1764\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1767\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1770\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1761\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1611\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"1771\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"text\":\"Node 
algo-2\"},\"id\":\"1605\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"1609\",\"type\":\"DataRange1d\"},{\"attributes\":{\"source\":{\"id\":\"1642\"}},\"id\":\"1646\",\"type\":\"CDSView\"},{\"attributes\":{\"source\":{\"id\":\"1647\"}},\"id\":\"1651\",\"type\":\"CDSView\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":56.0}},\"id\":\"1659\",\"type\":\"Rect\"},{\"attributes\":{\"source\":{\"id\":\"1657\"}},\"id\":\"1661\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1765\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1772\",\"type\":\"Selection\"},{\"attributes\":{\"source\":{\"id\":\"1632\"}},\"id\":\"1636\",\"type\":\"CDSView\"},{\"attributes\":{\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":0.0},\"y1\":{\"value\":0.0}},\"id\":\"1638\",\"type\":\"Segment\"},{\"attributes\":{},\"id\":\"1624\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"1773\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data_source\":{\"id\":\"1657\"},\"glyph\":{\"id\":\"1658\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1659\"},\"selection_glyph\":null,\"view\":{\"id\":\"1661\"}},\"id\":\"1660\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"axis\":{\"id\":\"1615\"},\"grid_line_color\":null,\"grid_line_width\":0,\"ticker\":null},\"id\":\"1618\",\"type\":\"Grid\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"1766\"},\"selection_policy\":{\"id\":\"1765\"}},\"id\":\"1637\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":56.0},\"y1\":{\"value\":45.0}},\"id\":\"1634\",\"type\":\"Segment\"},{\"attributes\":{},\"id\":\"1626\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"1774\",\"type\":\"Selection\"},{\"attributes\":{\"data_sour
ce\":{\"id\":\"1637\"},\"glyph\":{\"id\":\"1638\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1639\"},\"selection_glyph\":null,\"view\":{\"id\":\"1641\"}},\"id\":\"1640\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data_source\":{\"id\":\"1647\"},\"glyph\":{\"id\":\"1648\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1649\"},\"selection_glyph\":null,\"view\":{\"id\":\"1651\"}},\"id\":\"1650\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"bottom\":{\"value\":45.0},\"fill_color\":{\"value\":\"#FDE725\"},\"top\":{\"value\":31.0},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"1643\",\"type\":\"VBar\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"1772\"},\"selection_policy\":{\"id\":\"1771\"}},\"id\":\"1652\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"end\":17},\"id\":\"1607\",\"type\":\"Range1d\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"1764\"},\"selection_policy\":{\"id\":\"1763\"}},\"id\":\"1632\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"axis\":{\"id\":\"1619\"},\"dimension\":1,\"grid_line_color\":\"white\",\"grid_line_width\":0,\"ticker\":null},\"id\":\"1622\",\"type\":\"Grid\"},{\"attributes\":{\"source\":{\"id\":\"1637\"}},\"id\":\"1641\",\"type\":\"CDSView\"},{\"attributes\":{\"ticks\":[0,1]},\"id\":\"1710\",\"type\":\"FixedTicker\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"1768\"},\"selection_policy\":{\"id\":\"1767\"}},\"id\":\"1642\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"callback\":null},\"id\":\"1623\",\"type\":\"HoverTool\"},{\"attributes\":{},\"id\":\"1766\",\"type\":\"Selection\"},{\"attributes\":{\"data_source\":{\"id\":\"1632\"},\"glyph\":{\"id\":\"1633\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1634\"},\"selection_glyph\":null,\"view\":{\"id\":\"1636\"}},\"id\":\"1635\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"
y0\":{\"value\":0.0},\"y1\":{\"value\":0.0}},\"id\":\"1639\",\"type\":\"Segment\"},{\"attributes\":{\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":56.0},\"y1\":{\"value\":45.0}},\"id\":\"1633\",\"type\":\"Segment\"},{\"attributes\":{},\"id\":\"1763\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1623\"},{\"id\":\"1624\"},{\"id\":\"1625\"},{\"id\":\"1626\"}]},\"id\":\"1627\",\"type\":\"Toolbar\"},{\"attributes\":{\"bottom\":{\"value\":45.0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#FDE725\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":31.0},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"1644\",\"type\":\"VBar\"},{\"attributes\":{},\"id\":\"1625\",\"type\":\"ResetTool\"},{\"attributes\":{\"axis_label\":\"Utilization in %\",\"formatter\":{\"id\":\"1761\"},\"ticker\":{\"id\":\"1620\"}},\"id\":\"1619\",\"type\":\"LinearAxis\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"1770\"},\"selection_policy\":{\"id\":\"1769\"}},\"id\":\"1647\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1762\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":0.0}},\"id\":\"1653\",\"type\":\"Rect\"},{\"attributes\":{\"data_source\":{\"id\":\"1642\"},\"glyph\":{\"id\":\"1643\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1644\"},\"selection_glyph\":null,\"view\":{\"id\":\"1646\"}},\"id\":\"1645\",\"type\":\"GlyphRenderer\"}],\"root_ids\":[\"1604\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"62a5bc9e-74f0-428a-806d-98336dc5a04e\",\"root_ids\":[\"1604\"],\"roots\":{\"1604\":\"b13f6944-5802-4da7-b9b2-64800d1a448d\"}}];\n", + " 
root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1604" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "if analyse_phase == \"training\":\n", + " display(Markdown(\"\"\"### GPU utilization analysis\\n\\n\"\"\"))\n", + " display(Markdown(\"\"\"**Usage per GPU** \\n\\n\"\"\"))\n", + " report = load_report('LowGPUUtilization')\n", + " if report:\n", + " params = report['RuleParameters'].split('\\n')\n", + " threshold_p95 = params[0].split(':')[1]\n", + " threshold_p5 = params[1].split(':')[1]\n", + " window = params[2].split(':')[1]\n", + " patience = params[3].split(':')[1]\n", + " violations = report['Violations']\n", + " triggered = report['RuleTriggered']\n", + " datapoints = report['Datapoints']\n", + " \n", + " text=Paragraph(text=f\"\"\"The LowGPUUtilization rule checks for a low and fluctuating GPU usage. If the GPU usage is \n", + " consistently low, it might be caused by bottlenecks or a small batch size. If usage is heavily \n", + " fluctuating, it can be due to bottlenecks or blocking calls. The rule computed the 95th and 5th \n", + " percentile of GPU utilization on {window} continuous datapoints and found {violations} cases where \n", + " p95 was above {threshold_p95}% and p5 was below {threshold_p5}%. 
If p95 is high and p5 is low,\n", + " it might indicate that the GPU usage is highly fluctuating. If both values are very low, \n", + " it would mean that the machine is underutilized. During initialization, the GPU usage is likely zero, \n", + " so the rule skipped the first {patience} data points.\n", + " The rule analysed {datapoints} datapoints and triggered {triggered} times.\"\"\", width=800)\n", + " show(text)\n", + "\n", + " \n", + " if len(report['Details']) > 0:\n", + " \n", + " timestamp = us_since_epoch_to_human_readable_time(report['Details']['last_timestamp'])\n", + " date = datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S:%f')\n", + " day = date.date().strftime(\"%m/%d/%Y\")\n", + " hour = date.time().strftime(\"%H:%M:%S\")\n", + " text = Paragraph(text=f\"\"\"Your training job is underutilizing the instance. You may want to consider\n", + " to either switch to a smaller instance type or to increase the batch size. \n", + " The last time that the LowGPUUtilization rule was triggered in your training job was on {day} at {hour}.\n", + " The following boxplots are a snapshot from the timestamps. 
\n", + " They show the utilization per GPU (without outliers).\n", + " To get a better understanding of the workloads throughout the whole training,\n", + " you can check the workload histogram in the next section.\"\"\", width=800)\n", + " show(text)\n", + " \n", + " del report['Details']['last_timestamp']\n", + " \n", + " for node_id in report['Details']:\n", + " \n", + " plot = figure(plot_height=350, \n", + " plot_width=1000,\n", + " toolbar_location='right',\n", + " tools=\"hover,wheel_zoom,reset,pan\", \n", + " title=f\"Node {node_id}\",\n", + " x_range=(0,17),\n", + " )\n", + " \n", + " for index, key in enumerate(report['Details'][node_id]):\n", + " display(Markdown(f\"\"\"**GPU utilization of {key} on node {node_id}:**\"\"\"))\n", + " text = \"\"\n", + " gpu_max = report['Details'][node_id][key]['gpu_max']\n", + " p_95 = report['Details'][node_id][key]['gpu_95']\n", + " p_5 = report['Details'][node_id][key]['gpu_5']\n", + " text = f\"\"\"{text} The max utilization of {key} on node {node_id} was {gpu_max}%\"\"\"\n", + " if p_95 < int(threshold_p95): \n", + " text = f\"\"\"{text} and the 95th percentile was only {p_95}%. 
\n", + " {key} on node {node_id} is underutilized\"\"\"\n", + " if p_5 < int(threshold_p5): \n", + " text = f\"\"\"{text} and the 5th percentile was only {p_5}%\"\"\"\n", + " if p_95 - p_5 > 50:\n", + " text = f\"\"\"{text} The difference between 5th percentile {p_5}% and 95th percentile {p_95}% is quite \n", + " significant, which means that utilization on {key} is fluctuating quite a lot.\\n\"\"\"\n", + " \n", + " upper = report['Details'][node_id][key]['upper']\n", + " lower = report['Details'][node_id][key]['lower']\n", + " p75 = report['Details'][node_id][key]['p75']\n", + " p25 = report['Details'][node_id][key]['p25']\n", + " p50 = report['Details'][node_id][key]['p50']\n", + "\n", + " plot.segment(index+1, upper, index+1, p75, line_color=\"black\")\n", + " plot.segment(index+1, lower, index+1, p25, line_color=\"black\")\n", + "\n", + " plot.vbar(index+1, 0.7, p50, p75, fill_color=\"#FDE725\", line_color=\"black\")\n", + " plot.vbar(index+1, 0.7, p25, p50, fill_color=\"#440154\", line_color=\"black\")\n", + "\n", + " plot.rect(index+1, lower, 0.2, 0.01, line_color=\"black\")\n", + " plot.rect(index+1, upper, 0.2, 0.01, line_color=\"black\")\n", + "\n", + " plot.xaxis.major_label_overrides[index+1] = key\n", + " plot.xgrid.grid_line_color = None\n", + " plot.ygrid.grid_line_color = \"white\"\n", + " plot.grid.grid_line_width = 0\n", + "\n", + " plot.xaxis.major_label_text_font_size=\"10px\"\n", + " text=Paragraph(text=f\"\"\"{text}\"\"\", width=900)\n", + " show(text)\n", + " plot.yaxis.axis_label = \"Utilization in %\"\n", + " plot.xaxis.ticker = np.arange(index+2)\n", + " \n", + " show(plot)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:48.153227Z", + "iopub.status.busy": "2023-04-10T21:36:48.152655Z", + "iopub.status.idle": "2023-04-10T21:36:48.243911Z", + "shell.execute_reply": "2023-04-10T21:36:48.244296Z" + }, + "papermill": { + "duration": 0.131534, + "end_time": 
"2023-04-10T21:36:48.244437", + "exception": false, + "start_time": "2023-04-10T21:36:48.112903", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "**Workload balancing**\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"0019c337-1792-42c4-94ba-7af523cb8747\":{\"roots\":{\"references\":[{\"attributes\":{\"text\":\"The LoadBalancing rule helps to detect issues in workload balancing \\n between multiple GPUs. \\n It computes a histogram of GPU utilization values for each GPU and compares then the \\n similarity between histograms. The rule checked if the distance of histograms is larger than the \\n threshold of 0.2.\\n During initialization utilization is likely zero, so the rule skipped the first 1000 data points.\\n \",\"width\":900},\"id\":\"1855\",\"type\":\"Paragraph\"}],\"root_ids\":[\"1855\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"0019c337-1792-42c4-94ba-7af523cb8747\",\"root_ids\":[\"1855\"],\"roots\":{\"1855\":\"b2aa5c6d-ea0a-4052-90b3-5fc5b9105f33\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1855" + } + }, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"9e7d0814-a773-4215-9154-b25ae9d0dfd8\":{\"roots\":{\"references\":[{\"attributes\":{\"children\":[{\"id\":\"1967\"},{\"id\":\"1919\"}]},\"id\":\"1968\",\"type\":\"Column\"},{\"attributes\":{\"source\":{\"id\":\"1952\"}},\"id\":\"1956\",\"type\":\"CDSView\"},{\"attributes\":{\"click_policy\":\"hide\",\"items\":[{\"id\":\"1966\"}]},\"id\":\"1965\",\"type\":\"Legend\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1938\"},{\"id\":\"1939\"},{\"id\":\"1940\"},{\"id\":\"1941\"},{\"id\":\"1942\"},{\"id\":\"1943\"}]},\"id\":\"1945\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"1938\",\"type\":\"PanTool\"},{\"attributes\":{\"label\":{\"value\":\"gpu0\"},\"renderers\":[{\"id\":\"1955\"}]},\"id\":\"1966\",\"type\":\"LegendItem\"},{\"attributes\":{},\"id\":\"1939\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1944\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"data_source\":{\"id\":\"1952\"},\"glyph\":{\"id\":\"1953\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1954\"},\"selection_glyph\":null,\"view\":{\"id\":\"1956\"}},\"id\":\"1955\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"text\":\"The following histogram shows the workload per GPU on node algo-1. 
\\n You can enable/disable the visualization of a workload by clicking on the label in the legend.\\n \"},\"id\":\"1967\",\"type\":\"Paragraph\"},{\"attributes\":{\"overlay\":{\"id\":\"1944\"}},\"id\":\"1940\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"1961\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"start\":0},\"id\":\"1924\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"1959\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1941\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"1942\",\"type\":\"ResetTool\"},{\"attributes\":{\"text\":\"Workloads on node algo-1\"},\"id\":\"1920\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"1926\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"1962\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"end\":100,\"start\":-1},\"id\":\"1922\",\"type\":\"Range1d\"},{\"attributes\":{},\"id\":\"1943\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"1963\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1931\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis_label\":\"Occurrences\",\"formatter\":{\"id\":\"1959\"},\"ticker\":{\"id\":\"1935\"}},\"id\":\"1934\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"1935\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1928\",\"type\":\"LinearScale\"},{\"attributes\":{\"axis_label\":\"Utilization\",\"formatter\":{\"id\":\"1961\"},\"ticker\":{\"id\":\"1931\"}},\"id\":\"1930\",\"type\":\"LinearAxis\"},{\"attributes\":{\"axis\":{\"id\":\"1930\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1933\",\"type\":\"Grid\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#440154\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1954\",\"type\":\"Quad\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.8},\"fill_color\":{\"value\":\"#44015
4\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1953\",\"type\":\"Quad\"},{\"attributes\":{\"axis\":{\"id\":\"1934\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1937\",\"type\":\"Grid\"},{\"attributes\":{\"below\":[{\"id\":\"1930\"}],\"center\":[{\"id\":\"1933\"},{\"id\":\"1937\"},{\"id\":\"1965\"}],\"left\":[{\"id\":\"1934\"}],\"plot_height\":450,\"plot_width\":850,\"renderers\":[{\"id\":\"1955\"}],\"title\":{\"id\":\"1920\"},\"toolbar\":{\"id\":\"1945\"},\"x_range\":{\"id\":\"1922\"},\"x_scale\":{\"id\":\"1926\"},\"y_range\":{\"id\":\"1924\"},\"y_scale\":{\"id\":\"1928\"}},\"id\":\"1919\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[616,2,3,3,0,0,0,1,0,0,0,0,16,27,168,75,61,66,72,67,71,57,61,47,61,77,61,87,54,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"1963\"},\"selection_policy\":{\"id\":\"1962\"}},\"id\":\"1952\",\"type\":\"ColumnDataSource\"}],\"root_ids\":[\"1968\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"9e7d0814-a773-4215-9154-b25ae9d0dfd8\",\"root_ids\":[\"1968\"],\"roots\":{\"1968\":\"e5cdf90d-ea2b-4441-bf73-5d6af17f5a3d\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if 
(attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1968" + } + }, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"b5dc954a-2547-4628-acaa-70df3947acad\":{\"roots\":{\"references\":[{\"attributes\":{\"children\":[{\"id\":\"2128\"},{\"id\":\"2080\"}]},\"id\":\"2129\",\"type\":\"Column\"},{\"attributes\":{\"click_policy\":\"hide\",\"items\":[{\"id\":\"2127\"}]},\"id\":\"2126\",\"type\":\"Legend\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"2105\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2099\"},{\"id\":\"2100\"},{\"id\":\"2101\"},{\"id\":\"2102\"},{\"id\":\"2103\"},{\"id\":\"2104\"}]},\"id\":\"2106\",\"type\":\"Toolbar\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[608,1,4,3,0,0,0,0,0,0,0,0,17,9,171,81,70,78,53,56,59,67,62,69,83,79,50,63,68,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"2124\"},\"selection_policy\":{\"id\":\"2123\"}},\"id\":\"2113\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#440154\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2115\",\"type\":\"Quad\
"},{\"attributes\":{},\"id\":\"2099\",\"type\":\"PanTool\"},{\"attributes\":{\"source\":{\"id\":\"2113\"}},\"id\":\"2117\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"2100\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"2122\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"overlay\":{\"id\":\"2105\"}},\"id\":\"2101\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"2120\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"2102\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"2103\",\"type\":\"ResetTool\"},{\"attributes\":{\"end\":100,\"start\":-1},\"id\":\"2083\",\"type\":\"Range1d\"},{\"attributes\":{},\"id\":\"2123\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data_source\":{\"id\":\"2113\"},\"glyph\":{\"id\":\"2114\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2115\"},\"selection_glyph\":null,\"view\":{\"id\":\"2117\"}},\"id\":\"2116\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"2104\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"2124\",\"type\":\"Selection\"},{\"attributes\":{\"text\":\"The following histogram shows the workload per GPU on node algo-2. 
\\n You can enable/disable the visualization of a workload by clicking on the label in the legend.\\n \"},\"id\":\"2128\",\"type\":\"Paragraph\"},{\"attributes\":{},\"id\":\"2087\",\"type\":\"LinearScale\"},{\"attributes\":{\"start\":0},\"id\":\"2085\",\"type\":\"DataRange1d\"},{\"attributes\":{\"label\":{\"value\":\"gpu0\"},\"renderers\":[{\"id\":\"2116\"}]},\"id\":\"2127\",\"type\":\"LegendItem\"},{\"attributes\":{\"axis\":{\"id\":\"2091\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2094\",\"type\":\"Grid\"},{\"attributes\":{\"below\":[{\"id\":\"2091\"}],\"center\":[{\"id\":\"2094\"},{\"id\":\"2098\"},{\"id\":\"2126\"}],\"left\":[{\"id\":\"2095\"}],\"plot_height\":450,\"plot_width\":850,\"renderers\":[{\"id\":\"2116\"}],\"title\":{\"id\":\"2081\"},\"toolbar\":{\"id\":\"2106\"},\"x_range\":{\"id\":\"2083\"},\"x_scale\":{\"id\":\"2087\"},\"y_range\":{\"id\":\"2085\"},\"y_scale\":{\"id\":\"2089\"}},\"id\":\"2080\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"2096\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis_label\":\"Utilization\",\"formatter\":{\"id\":\"2122\"},\"ticker\":{\"id\":\"2092\"}},\"id\":\"2091\",\"type\":\"LinearAxis\"},{\"attributes\":{\"text\":\"Workloads on node 
algo-2\"},\"id\":\"2081\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"2089\",\"type\":\"LinearScale\"},{\"attributes\":{\"axis_label\":\"Occurrences\",\"formatter\":{\"id\":\"2120\"},\"ticker\":{\"id\":\"2096\"}},\"id\":\"2095\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2092\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis\":{\"id\":\"2095\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2098\",\"type\":\"Grid\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.8},\"fill_color\":{\"value\":\"#440154\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2114\",\"type\":\"Quad\"}],\"root_ids\":[\"2129\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"b5dc954a-2547-4628-acaa-70df3947acad\",\"root_ids\":[\"2129\"],\"roots\":{\"2129\":\"c69b05a7-cc72-4e4e-b640-e47494e3b9d6\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "2129" + } + }, + "output_type": "display_data" + } + ], + "source": [ + " \n", + "if analyse_phase == \"training\": \n", + " display(Markdown(\"\"\"**Workload balancing**\\n\\n\"\"\")) \n", + " report = load_report('LoadBalancing')\n", + " if report:\n", + " params 
= report['RuleParameters'].split('\\n')\n", + " threshold = params[0].split(':')[1]\n", + " patience = params[1].split(':')[1]\n", + " triggered = report['RuleTriggered']\n", + " datapoints = report['Datapoints']\n", + " \n", + " paragraph = Paragraph(text=f\"\"\"The LoadBalancing rule helps to detect issues in workload balancing \n", + " between multiple GPUs. \n", + " It computes a histogram of GPU utilization values for each GPU and compares then the \n", + " similarity between histograms. The rule checked if the distance of histograms is larger than the \n", + " threshold of {threshold}.\n", + " During initialization utilization is likely zero, so the rule skipped the first {patience} data points.\n", + " \"\"\", width=900)\n", + " show(paragraph)\n", + " \n", + " if len(report['Details']) > 0:\n", + " for node_id in report['Details']: \n", + " \n", + " \n", + " text = f\"\"\"The following histogram shows the workload per GPU on node {node_id}. \n", + " You can enable/disable the visualization of a workload by clicking on the label in the legend.\n", + " \"\"\"\n", + " if len(report['Details']) == 1 and len(report['Details'][node_id]['workloads']) == 1:\n", + " text = f\"\"\"{text} Your training job only used one GPU so there is no workload balancing issue.\"\"\"\n", + " \n", + " plot = figure(plot_height=450, \n", + " plot_width=850, \n", + " x_range=(-1,100),\n", + " title=f\"\"\"Workloads on node {node_id}\"\"\")\n", + " \n", + " colors = bokeh.palettes.viridis(len(report['Details'][node_id]['workloads']))\n", + " \n", + " for index, gpu_id2 in enumerate(report['Details'][node_id]['workloads']):\n", + " probs = report['Details'][node_id]['workloads'][gpu_id2]\n", + " plot.quad( top=probs,\n", + " bottom=0,\n", + " left=np.arange(0,98,2),\n", + " right=np.arange(2,100,2),\n", + " line_color=\"white\",\n", + " fill_color=colors[index],\n", + " fill_alpha=0.8,\n", + " legend=gpu_id2 )\n", + "\n", + " plot.y_range.start = 0\n", + " plot.xaxis.axis_label = 
f\"\"\"Utilization\"\"\"\n", + " plot.yaxis.axis_label = \"Occurrences\"\n", + " plot.grid.grid_line_color = \"white\"\n", + " plot.legend.click_policy=\"hide\"\n", + " \n", + " paragraph = Paragraph(text=text)\n", + " show(column(paragraph, plot))\n", + " \n", + " if \"distances\" in report['Details'][node_id]:\n", + " text = f\"\"\"The rule identified workload balancing issues on node {node_id} \n", + " where workloads differed by more than threshold {threshold}. \n", + " \"\"\"\n", + " for index, gpu_id2 in enumerate(report['Details'][node_id]['distances']):\n", + " for gpu_id1 in report['Details'][node_id]['distances'][gpu_id2]:\n", + " distance = round(report['Details'][node_id]['distances'][gpu_id2][gpu_id1], 2)\n", + " text = f\"\"\"{text} The difference of workload between {gpu_id2} and {gpu_id1} is: {distance}.\"\"\"\n", + "\n", + " paragraph = Paragraph(text=f\"\"\"{text}\"\"\", width=900)\n", + " show(column(paragraph))" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:48.319361Z", + "iopub.status.busy": "2023-04-10T21:36:48.318845Z", + "iopub.status.idle": "2023-04-10T21:36:48.394454Z", + "shell.execute_reply": "2023-04-10T21:36:48.394861Z" + }, + "papermill": { + "duration": 0.119449, + "end_time": "2023-04-10T21:36:48.395006", + "exception": false, + "start_time": "2023-04-10T21:36:48.275557", + "status": "completed" + }, + "scrolled": true, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "### Dataloading analysis\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"95c735e8-9ea1-4233-964f-33f976f3273b\":{\"roots\":{\"references\":[{\"attributes\":{\"text\":\"The number of dataloader workers can greatly affect the overall performance \\n of your training job. The rule analyzed the number of dataloading processes that have been running in \\n parallel on the training instance and compares it against the total number of cores. \\n The rule checked if the number of processes is smaller than 70% or larger than \\n 200% the total number of cores. Having too few dataloader workers can slowdown data preprocessing and lead to GPU \\n underutilization. Having too many dataloader workers may hurt the\\n overall performance if you are running other compute intensive tasks on the CPU.\\n The rule analysed 8373 datapoints and triggered 1 times.\",\"width\":900},\"id\":\"2249\",\"type\":\"Paragraph\"}],\"root_ids\":[\"2249\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"95c735e8-9ea1-4233-964f-33f976f3273b\",\"root_ids\":[\"2249\"],\"roots\":{\"2249\":\"6bc97726-6f45-43a5-b9e6-54edbf93675b\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + 
"application/vnd.bokehjs_exec.v0+json": { + "id": "2249" + } + }, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"6cd3122a-2f3c-4144-9141-8b6e107f3638\":{\"roots\":{\"references\":[{\"attributes\":{\"text\":\" Your training instance provided 4 CPU cores, however your training job only \\n ran on average 1 dataloader workers in parallel. We recommend you to increase the number of\\n dataloader workers. Using pinned memory also improves performance because it enables fast data transfer to CUDA-enabled GPUs.\\n The rule detected that your training job was not using pinned memory. \\n In case of using PyTorch Dataloader, you can enable this by setting pin_memory=True.\",\"width\":900},\"id\":\"2329\",\"type\":\"Paragraph\"}],\"root_ids\":[\"2329\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"6cd3122a-2f3c-4144-9141-8b6e107f3638\",\"root_ids\":[\"2329\"],\"roots\":{\"2329\":\"230c16c5-b6c5-4e29-b0b8-1db7ee6085e9\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "2329" + } + }, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"6531d92f-f455-4437-8eec-3be2d5ba7e53\":{\"roots\":{\"references\":[{\"attributes\":{\"children\":[{\"id\":\"2450\"},{\"id\":\"2409\"}]},\"id\":\"2451\",\"type\":\"Column\"},{\"attributes\":{\"data_source\":{\"id\":\"2435\"},\"glyph\":{\"id\":\"2436\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2437\"},\"selection_glyph\":null,\"view\":{\"id\":\"2439\"}},\"id\":\"2438\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"text\":\"The following histogram shows the distribution of dataloading times that have been measured throughout your training job. The median dataloading time was 0.0789s. \\n The 95th percentile was 0.0955s and the 25th percentile was 0.0734s\",\"width\":900},\"id\":\"2450\",\"type\":\"Paragraph\"},{\"attributes\":{},\"id\":\"2446\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"label\":{\"value\":\"Dataloading events\"},\"renderers\":[{\"id\":\"2438\"}]},\"id\":\"2449\",\"type\":\"LegendItem\"},{\"attributes\":{\"text\":\"\"},\"id\":\"2440\",\"type\":\"Title\"},{\"attributes\":{\"callback\":null},\"id\":\"2426\",\"type\":\"HoverTool\"},{\"attributes\":{\"end\":0.291586,\"start\":0.016458},\"id\":\"2410\",\"type\":\"Range1d\"},{\"attributes\":{},\"id\":\"2427\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"2445\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"2428\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"2419\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"2443\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"axis_label\":\"Dataloading in 
[s]\",\"formatter\":{\"id\":\"2445\"},\"ticker\":{\"id\":\"2419\"}},\"id\":\"2418\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2429\",\"type\":\"PanTool\"},{\"attributes\":{\"source\":{\"id\":\"2435\"}},\"id\":\"2439\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"2447\",\"type\":\"Selection\"},{\"attributes\":{\"data\":{\"left\":[0.016458,0.019209280000000002,0.02196056,0.024711840000000002,0.02746312,0.030214400000000002,0.032965680000000004,0.035716960000000006,0.03846824,0.04121952000000001,0.043970800000000004,0.046722080000000006,0.04947336000000001,0.05222464000000001,0.054975920000000005,0.057727200000000006,0.06047848000000001,0.06322976000000001,0.06598104000000002,0.06873232000000001,0.07148360000000001,0.07423488,0.07698616000000001,0.07973744,0.08248872000000002,0.08524000000000001,0.08799128000000002,0.09074256000000001,0.09349384000000001,0.09624512000000002,0.09899640000000001,0.10174768000000002,0.10449896000000002,0.10725024000000001,0.11000152000000002,0.11275280000000001,0.11550408000000002,0.11825536000000002,0.12100664000000001,0.12375792000000002,0.12650920000000002,0.12926048,0.13201176000000003,0.13476304,0.13751432000000002,0.14026560000000002,0.14301688,0.14576816000000004,0.14851944000000003,0.15127072000000003,0.15402200000000002,0.15677328000000001,0.15952456000000004,0.16227584000000003,0.16502712000000003,0.16777840000000002,0.17052968000000002,0.17328096000000004,0.17603224000000003,0.17878352000000003,0.18153480000000002,0.18428608000000002,0.18703736000000004,0.18978864000000004,0.19253992000000003,0.19529120000000003,0.19804248000000002,0.20079376000000004,0.20354504000000004,0.20629632000000003,0.20904760000000003,0.21179888000000002,0.21455016000000005,0.21730144000000004,0.22005272000000003,0.22280400000000003,0.22555528000000002,0.22830656000000005,0.23105784000000004,0.23380912000000004,0.23656040000000003,0.23931168000000003,0.24206296000000005,0.24481424000000004,0.24756552000000004,0.2503168,0.253068080000000
03,0.25581936000000005,0.2585706400000001,0.26132192000000004,0.2640732,0.26682448000000003,0.26957576000000005,0.2723270400000001,0.2750783200000001,0.2778296,0.28058088000000003,0.28333216000000006,0.2860834400000001,0.2888347200000001],\"right\":[0.019209280000000002,0.02196056,0.024711840000000002,0.02746312,0.030214400000000002,0.032965680000000004,0.035716960000000006,0.03846824,0.04121952000000001,0.043970800000000004,0.046722080000000006,0.04947336000000001,0.05222464000000001,0.054975920000000005,0.057727200000000006,0.06047848000000001,0.06322976000000001,0.06598104000000002,0.06873232000000001,0.07148360000000001,0.07423488,0.07698616000000001,0.07973744,0.08248872000000002,0.08524000000000001,0.08799128000000002,0.09074256000000001,0.09349384000000001,0.09624512000000002,0.09899640000000001,0.10174768000000002,0.10449896000000002,0.10725024000000001,0.11000152000000002,0.11275280000000001,0.11550408000000002,0.11825536000000002,0.12100664000000001,0.12375792000000002,0.12650920000000002,0.12926048,0.13201176000000003,0.13476304,0.13751432000000002,0.14026560000000002,0.14301688,0.14576816000000004,0.14851944000000003,0.15127072000000003,0.15402200000000002,0.15677328000000001,0.15952456000000004,0.16227584000000003,0.16502712000000003,0.16777840000000002,0.17052968000000002,0.17328096000000004,0.17603224000000003,0.17878352000000003,0.18153480000000002,0.18428608000000002,0.18703736000000004,0.18978864000000004,0.19253992000000003,0.19529120000000003,0.19804248000000002,0.20079376000000004,0.20354504000000004,0.20629632000000003,0.20904760000000003,0.21179888000000002,0.21455016000000005,0.21730144000000004,0.22005272000000003,0.22280400000000003,0.22555528000000002,0.22830656000000005,0.23105784000000004,0.23380912000000004,0.23656040000000003,0.23931168000000003,0.24206296000000005,0.24481424000000004,0.24756552000000004,0.2503168,0.25306808000000003,0.25581936000000005,0.2585706400000001,0.26132192000000004,0.2640732,0.26682448000000003,0.269575760000
00005,0.2723270400000001,0.2750783200000001,0.2778296,0.28058088000000003,0.28333216000000006,0.2860834400000001,0.2888347200000001,0.291586],\"top\":[13,1,0,0,1,0,0,0,0,0,0,0,0,0,3,15,58,211,441,713,945,1052,1046,898,836,687,526,340,223,129,80,54,31,13,9,10,14,11,2,4,2,0,0,2,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1]},\"selected\":{\"id\":\"2447\"},\"selection_policy\":{\"id\":\"2446\"}},\"id\":\"2435\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"axis_label\":\"Occurrences\",\"formatter\":{\"id\":\"2443\"},\"ticker\":{\"id\":\"2423\"}},\"id\":\"2422\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2423\",\"type\":\"BasicTicker\"},{\"attributes\":{\"start\":0},\"id\":\"2412\",\"type\":\"DataRange1d\"},{\"attributes\":{\"axis\":{\"id\":\"2422\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2425\",\"type\":\"Grid\"},{\"attributes\":{\"axis\":{\"id\":\"2418\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2421\",\"type\":\"Grid\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.8},\"fill_color\":{\"value\":\"#440154\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2436\",\"type\":\"Quad\"},{\"attributes\":{},\"id\":\"2416\",\"type\":\"LinearScale\"},{\"attributes\":{\"click_policy\":\"hide\",\"items\":[{\"id\":\"2449\"}]},\"id\":\"2448\",\"type\":\"Legend\"},{\"attributes\":{},\"id\":\"2414\",\"type\":\"LinearScale\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#440154\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2437\",\"type\":\"Quad\"},{\"attributes\":{\"below\":[{\"id\":\"2418\"}],\"center\":[{\"id\":\"2421\"},{\"id\":\"2425\"},{\"id\":\"2448\"}],\"left\":[{
\"id\":\"2422\"}],\"plot_height\":450,\"plot_width\":850,\"renderers\":[{\"id\":\"2438\"}],\"title\":{\"id\":\"2440\"},\"toolbar\":{\"id\":\"2430\"},\"x_range\":{\"id\":\"2410\"},\"x_scale\":{\"id\":\"2414\"},\"y_range\":{\"id\":\"2412\"},\"y_scale\":{\"id\":\"2416\"}},\"id\":\"2409\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2426\"},{\"id\":\"2427\"},{\"id\":\"2428\"},{\"id\":\"2429\"}]},\"id\":\"2430\",\"type\":\"Toolbar\"}],\"root_ids\":[\"2451\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"6531d92f-f455-4437-8eec-3be2d5ba7e53\",\"root_ids\":[\"2451\"],\"roots\":{\"2451\":\"723132b4-29c3-4967-99ad-a603fea0cbd7\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "2451" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "if analyse_phase == \"training\":\n", + " display(Markdown(\"\"\"### Dataloading analysis\\n\\n\"\"\"))\n", + " report = load_report('Dataloader')\n", + " if report:\n", + " params = report['RuleParameters'].split(\"\\n\")\n", + " min_threshold = params[0].split(':')[1]\n", + " max_threshold = params[1].split(':')[1]\n", + " 
triggered = report['RuleTriggered']\n", + " datapoints = report['Datapoints']\n", + " \n", + " text=f\"\"\"The number of dataloader workers can greatly affect the overall performance \n", + " of your training job. The rule analyzed the number of dataloading processes that have been running in \n", + " parallel on the training instance and compares it against the total number of cores. \n", + " The rule checked if the number of processes is smaller than {min_threshold}% or larger than \n", + " {max_threshold}% the total number of cores. Having too few dataloader workers can slowdown data preprocessing and lead to GPU \n", + " underutilization. Having too many dataloader workers may hurt the\n", + " overall performance if you are running other compute intensive tasks on the CPU.\n", + " The rule analysed {datapoints} datapoints and triggered {triggered} times.\"\"\"\n", + " \n", + " paragraph = Paragraph(text=f\"{text}\", width=900)\n", + " show(paragraph)\n", + " text = \"\"\n", + " if 'cores' in report['Details']:\n", + " cores = int(report['Details']['cores'])\n", + " dataloaders = report['Details']['dataloaders']\n", + " if dataloaders < cores: \n", + " text=f\"\"\"{text} Your training instance provided {cores} CPU cores, however your training job only \n", + " ran on average {dataloaders} dataloader workers in parallel. We recommend you to increase the number of\n", + " dataloader workers.\"\"\"\n", + " if dataloaders > cores:\n", + " text=f\"\"\"{text} Your training instance provided {cores} CPU cores, however your training job ran \n", + " on average {dataloaders} dataloader workers. We recommed you to decrease the number of dataloader\n", + " workers.\"\"\"\n", + " if 'pin_memory' in report['Details'] and report['Details']['pin_memory'] == False:\n", + " text=f\"\"\"{text} Using pinned memory also improves performance because it enables fast data transfer to CUDA-enabled GPUs.\n", + " The rule detected that your training job was not using pinned memory. 
\n", + " In case of using PyTorch Dataloader, you can enable this by setting pin_memory=True.\"\"\"\n", + " \n", + " if 'prefetch' in report['Details'] and report['Details']['prefetch'] == False:\n", + " text=f\"\"\"{text} It appears that your training job did not perform any data pre-fetching. Pre-fetching can improve your\n", + " data input pipeline as it produces the data ahead of time.\"\"\"\n", + " paragraph = Paragraph(text=f\"{text}\", width=900)\n", + " show(paragraph)\n", + " \n", + " colors=bokeh.palettes.viridis(10)\n", + " if \"dataloading_time\" in report['Details']:\n", + " median = round(report['Details'][\"dataloading_time\"]['p50'],4)\n", + " p95 = round(report['Details'][\"dataloading_time\"]['p95'],4)\n", + " p25 = round(report['Details'][\"dataloading_time\"]['p25'],4)\n", + " binedges = report['Details'][\"dataloading_time\"]['binedges']\n", + " probs = report['Details'][\"dataloading_time\"]['probs']\n", + " text=f\"\"\"The following histogram shows the distribution of dataloading times that have been measured throughout your training job. The median dataloading time was {median}s. 
\n", + " The 95th percentile was {p95}s and the 25th percentile was {p25}s\"\"\"\n", + "\n", + " plot = figure(plot_height=450, \n", + " plot_width=850,\n", + " toolbar_location='right',\n", + " tools=\"hover,wheel_zoom,reset,pan\",\n", + " x_range=(binedges[0], binedges[-1])\n", + " )\n", + " \n", + " plot.quad( top=probs,\n", + " bottom=0,\n", + " left=binedges[:-1],\n", + " right=binedges[1:],\n", + " line_color=\"white\",\n", + " fill_color=colors[0],\n", + " fill_alpha=0.8,\n", + " legend=\"Dataloading events\" )\n", + "\n", + " plot.y_range.start = 0\n", + " plot.xaxis.axis_label = f\"\"\"Dataloading in [s]\"\"\"\n", + " plot.yaxis.axis_label = \"Occurrences\"\n", + " plot.grid.grid_line_color = \"white\"\n", + " plot.legend.click_policy=\"hide\"\n", + "\n", + " paragraph = Paragraph(text=f\"{text}\", width=900)\n", + " show(column(paragraph, plot))" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:48.475424Z", + "iopub.status.busy": "2023-04-10T21:36:48.474828Z", + "iopub.status.idle": "2023-04-10T21:36:48.737604Z", + "shell.execute_reply": "2023-04-10T21:36:48.738003Z" + }, + "papermill": { + "duration": 0.310088, + "end_time": "2023-04-10T21:36:48.738147", + "exception": false, + "start_time": "2023-04-10T21:36:48.428059", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + " ### Batch size" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"ee73fb9d-6abe-474e-9a9a-77645a9e0998\":{\"roots\":{\"references\":[{\"attributes\":{\"text\":\"The BatchSize rule helps to detect if GPU is underutilized because of the batch size being \\n too small. To detect this the rule analyzes the GPU memory footprint, CPU and GPU utilization. The rule checked if the 95th percentile of CPU utilization is below cpu_threshold_p95 of \\n 70%, the 95th percentile of GPU utilization is below gpu_threshold_p95 of 70% and the 95th percentile of memory footprint below gpu_memory_threshold_p95 of 70%. In your training job this happened 14 times. The rule skipped the first 1000 datapoints. The rule computed the percentiles over window size of 500 continuous datapoints.\\n\\n The rule analysed 1750 datapoints and triggered 14 times.\\n \",\"width\":800},\"id\":\"2579\",\"type\":\"Paragraph\"}],\"root_ids\":[\"2579\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"ee73fb9d-6abe-474e-9a9a-77645a9e0998\",\"root_ids\":[\"2579\"],\"roots\":{\"2579\":\"e5e3cb1e-05e4-4eba-9dc1-b194b7dcac83\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + 
"application/vnd.bokehjs_exec.v0+json": { + "id": "2579" + } + }, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"34c6ec51-a0d1-4cb4-b95f-205760b55051\":{\"roots\":{\"references\":[{\"attributes\":{\"text\":\"Your training job is underutilizing the instance. You may want to consider\\n either switch to a smaller instance type or to increase the batch size. \\n The last time the BatchSize rule triggered in your training job was on 04/10/2023 at 21:21:00.\\n The following boxplots are a snapshot from the timestamps. They the total \\n CPU utilization, the GPU utilization, and the GPU memory usage per GPU (without outliers).\",\"width\":800},\"id\":\"2667\",\"type\":\"Paragraph\"}],\"root_ids\":[\"2667\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"34c6ec51-a0d1-4cb4-b95f-205760b55051\",\"root_ids\":[\"2667\"],\"roots\":{\"2667\":\"7cc91b2e-3604-48d7-967a-1daabdc9d669\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "2667" + } + }, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"117ff17a-2947-4290-826e-197634ebec4f\":{\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"2766\"}],\"center\":[{\"id\":\"2769\"},{\"id\":\"2773\"}],\"left\":[{\"id\":\"2770\"}],\"plot_height\":350,\"plot_width\":1000,\"renderers\":[{\"id\":\"2786\"},{\"id\":\"2791\"},{\"id\":\"2796\"},{\"id\":\"2801\"},{\"id\":\"2806\"},{\"id\":\"2811\"},{\"id\":\"2816\"},{\"id\":\"2821\"},{\"id\":\"2826\"},{\"id\":\"2831\"},{\"id\":\"2836\"},{\"id\":\"2841\"},{\"id\":\"2846\"},{\"id\":\"2851\"},{\"id\":\"2856\"},{\"id\":\"2861\"},{\"id\":\"2866\"},{\"id\":\"2871\"}],\"title\":{\"id\":\"2756\"},\"toolbar\":{\"id\":\"2778\"},\"x_range\":{\"id\":\"2758\"},\"x_scale\":{\"id\":\"2762\"},\"y_range\":{\"id\":\"2760\"},\"y_scale\":{\"id\":\"2764\"}},\"id\":\"2755\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":2},\"y\":{\"value\":0.0}},\"id\":\"2834\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"2764\",\"type\":\"LinearScale\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2987\"},\"selection_policy\":{\"id\":\"2986\"}},\"id\":\"2833\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":2},\"y\":{\"value\":56.0}},\"id\":\"2839\",\"type\":\"Rect\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":2},\"y\":{\"value\":0.0}},\"id\":\"2835\",\"type\":\"Rect\"},{\"attributes\":{\"source\":{\"
id\":\"2833\"}},\"id\":\"2837\",\"type\":\"CDSView\"},{\"attributes\":{\"axis\":{\"id\":\"2770\"},\"dimension\":1,\"grid_line_color\":\"white\",\"grid_line_width\":0,\"ticker\":null},\"id\":\"2773\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"2760\",\"type\":\"DataRange1d\"},{\"attributes\":{\"data_source\":{\"id\":\"2833\"},\"glyph\":{\"id\":\"2834\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2835\"},\"selection_glyph\":null,\"view\":{\"id\":\"2837\"}},\"id\":\"2836\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"formatter\":{\"id\":\"2965\"},\"major_label_overrides\":{\"1\":\"cpu\",\"2\":\"gpu0\",\"3\":\"gpu0_memory\"},\"major_label_text_font_size\":\"10px\",\"ticker\":{\"id\":\"2873\"}},\"id\":\"2766\",\"type\":\"LinearAxis\"},{\"attributes\":{\"x0\":{\"value\":3},\"x1\":{\"value\":3},\"y0\":{\"value\":42.0},\"y1\":{\"value\":33.0}},\"id\":\"2844\",\"type\":\"Segment\"},{\"attributes\":{},\"id\":\"2762\",\"type\":\"LinearScale\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2989\"},\"selection_policy\":{\"id\":\"2988\"}},\"id\":\"2838\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2967\"},\"selection_policy\":{\"id\":\"2966\"}},\"id\":\"2783\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"source\":{\"id\":\"2853\"}},\"id\":\"2857\",\"type\":\"CDSView\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":2},\"y\":{\"value\":56.0}},\"id\":\"2840\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"2771\",\"type\":\"BasicTicker\"},{\"attributes\":{\"source\":{\"id\":\"2838\"}},\"id\":\"2842\",\"type\":\"CDSView\"},{\"attributes\":{\"data_source\":{\"id\":\"2838\"},\"glyph\":{\"id\":\"2839\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2840\"},\"selection_glyph\":null,\"view\":{\"id\
":\"2842\"}},\"id\":\"2841\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"axis\":{\"id\":\"2766\"},\"grid_line_color\":null,\"grid_line_width\":0,\"ticker\":null},\"id\":\"2769\",\"type\":\"Grid\"},{\"attributes\":{\"axis_label\":\"Utilization in %\",\"formatter\":{\"id\":\"2964\"},\"ticker\":{\"id\":\"2771\"}},\"id\":\"2770\",\"type\":\"LinearAxis\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2991\"},\"selection_policy\":{\"id\":\"2990\"}},\"id\":\"2843\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"bottom\":{\"value\":53.19499999999999},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#FDE725\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":41.06125},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"2795\",\"type\":\"VBar\"},{\"attributes\":{\"data_source\":{\"id\":\"2843\"},\"glyph\":{\"id\":\"2844\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2845\"},\"selection_glyph\":null,\"view\":{\"id\":\"2847\"}},\"id\":\"2846\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"x0\":{\"value\":3},\"x1\":{\"value\":3},\"y0\":{\"value\":0.0},\"y1\":{\"value\":0.0}},\"id\":\"2849\",\"type\":\"Segment\"},{\"attributes\":{\"callback\":null},\"id\":\"2774\",\"type\":\"HoverTool\"},{\"attributes\":{},\"id\":\"2775\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"source\":{\"id\":\"2843\"}},\"id\":\"2847\",\"type\":\"CDSView\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2993\"},\"selection_policy\":{\"id\":\"2992\"}},\"id\":\"2848\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"2776\",\"type\":\"ResetTool\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":3},\"x1\":{\"value\":3},\"y0\":{\"value\":42.0},\"y1\":{\"value\":33.0}},\"id\":\"2845\",\"type\":\"Segment\"},{\"attributes\":{},\"id\":\"2777\",\"type\":\"PanTool\"},{\"attributes\":{\"bottom\":{\"value\":41.06125},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#440154\"},\"line_alpha\":{\"value\":0.1},\"top\":{\
"value\":39.54},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"2800\",\"type\":\"VBar\"},{\"attributes\":{\"data_source\":{\"id\":\"2788\"},\"glyph\":{\"id\":\"2789\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2790\"},\"selection_glyph\":null,\"view\":{\"id\":\"2792\"}},\"id\":\"2791\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data_source\":{\"id\":\"2848\"},\"glyph\":{\"id\":\"2849\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2850\"},\"selection_glyph\":null,\"view\":{\"id\":\"2852\"}},\"id\":\"2851\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":19.057500000000008},\"y1\":{\"value\":39.54}},\"id\":\"2790\",\"type\":\"Segment\"},{\"attributes\":{\"bottom\":{\"value\":33.0},\"fill_color\":{\"value\":\"#FDE725\"},\"top\":{\"value\":22.0},\"width\":{\"value\":0.7},\"x\":{\"value\":3}},\"id\":\"2854\",\"type\":\"VBar\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2971\"},\"selection_policy\":{\"id\":\"2970\"}},\"id\":\"2793\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"source\":{\"id\":\"2848\"}},\"id\":\"2852\",\"type\":\"CDSView\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2995\"},\"selection_policy\":{\"id\":\"2994\"}},\"id\":\"2853\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"source\":{\"id\":\"2793\"}},\"id\":\"2797\",\"type\":\"CDSView\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":3},\"x1\":{\"value\":3},\"y0\":{\"value\":0.0},\"y1\":{\"value\":0.0}},\"id\":\"2850\",\"type\":\"Segment\"},{\"attributes\":{\"data_source\":{\"id\":\"2783\"},\"glyph\":{\"id\":\"2784\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2785\"},\"selection_glyph\":null,\"view\":{\"id\":\"2787\"}},\"id\":\"2786\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"end\":20},\"id\":\"2758\",\"type\":\"Range1d\"},{\"attributes\":{\"source\":{\"id\":
\"2788\"}},\"id\":\"2792\",\"type\":\"CDSView\"},{\"attributes\":{\"bottom\":{\"value\":33.0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#FDE725\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":22.0},\"width\":{\"value\":0.7},\"x\":{\"value\":3}},\"id\":\"2855\",\"type\":\"VBar\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2969\"},\"selection_policy\":{\"id\":\"2968\"}},\"id\":\"2788\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":19.057500000000008},\"y1\":{\"value\":39.54}},\"id\":\"2789\",\"type\":\"Segment\"},{\"attributes\":{\"bottom\":{\"value\":22.0},\"fill_color\":{\"value\":\"#440154\"},\"top\":{\"value\":0.0},\"width\":{\"value\":0.7},\"x\":{\"value\":3}},\"id\":\"2859\",\"type\":\"VBar\"},{\"attributes\":{\"data_source\":{\"id\":\"2853\"},\"glyph\":{\"id\":\"2854\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2855\"},\"selection_glyph\":null,\"view\":{\"id\":\"2857\"}},\"id\":\"2856\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"bottom\":{\"value\":41.06125},\"fill_color\":{\"value\":\"#440154\"},\"top\":{\"value\":39.54},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"2799\",\"type\":\"VBar\"},{\"attributes\":{\"bottom\":{\"value\":53.19499999999999},\"fill_color\":{\"value\":\"#FDE725\"},\"top\":{\"value\":41.06125},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"2794\",\"type\":\"VBar\"},{\"attributes\":{\"source\":{\"id\":\"2858\"}},\"id\":\"2862\",\"type\":\"CDSView\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2774\"},{\"id\":\"2775\"},{\"id\":\"2776\"},{\"id\":\"2777\"}]},\"id\":\"2778\",\"type\":\"Toolbar\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2997\"},\"selection_policy\":{\"id\":\"2996\"}},\"id\":\"2858\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"data_source\":{\"id\
":\"2793\"},\"glyph\":{\"id\":\"2794\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2795\"},\"selection_glyph\":null,\"view\":{\"id\":\"2797\"}},\"id\":\"2796\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2973\"},\"selection_policy\":{\"id\":\"2972\"}},\"id\":\"2798\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"bottom\":{\"value\":22.0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#440154\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":0.0},\"width\":{\"value\":0.7},\"x\":{\"value\":3}},\"id\":\"2860\",\"type\":\"VBar\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":3},\"y\":{\"value\":0.0}},\"id\":\"2864\",\"type\":\"Rect\"},{\"attributes\":{\"data_source\":{\"id\":\"2858\"},\"glyph\":{\"id\":\"2859\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2860\"},\"selection_glyph\":null,\"view\":{\"id\":\"2862\"}},\"id\":\"2861\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data_source\":{\"id\":\"2798\"},\"glyph\":{\"id\":\"2799\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2800\"},\"selection_glyph\":null,\"view\":{\"id\":\"2802\"}},\"id\":\"2801\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2999\"},\"selection_policy\":{\"id\":\"2998\"}},\"id\":\"2863\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"2987\",\"type\":\"Selection\"},{\"attributes\":{\"source\":{\"id\":\"2798\"}},\"id\":\"2802\",\"type\":\"CDSView\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":19.057500000000008}},\"id\":\"2804\",\"type\":\"Rect\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"
width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":3},\"y\":{\"value\":42.0}},\"id\":\"2869\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"2988\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":3},\"y\":{\"value\":0.0}},\"id\":\"2865\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"2989\",\"type\":\"Selection\"},{\"attributes\":{\"source\":{\"id\":\"2863\"}},\"id\":\"2867\",\"type\":\"CDSView\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":73.67749999999998},\"y1\":{\"value\":53.19499999999999}},\"id\":\"2785\",\"type\":\"Segment\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2975\"},\"selection_policy\":{\"id\":\"2974\"}},\"id\":\"2803\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"2990\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data_source\":{\"id\":\"2863\"},\"glyph\":{\"id\":\"2864\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2865\"},\"selection_glyph\":null,\"view\":{\"id\":\"2867\"}},\"id\":\"2866\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"2991\",\"type\":\"Selection\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":19.057500000000008}},\"id\":\"2805\",\"type\":\"Rect\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3001\"},\"selection_policy\":{\"id\":\"3000\"}},\"id\":\"2868\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"2992\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"source\":{\"id\":\"2803\"}},\"id\":\"2807\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"2970\",\"type\":\"Union
Renderers\"},{\"attributes\":{},\"id\":\"2993\",\"type\":\"Selection\"},{\"attributes\":{\"data_source\":{\"id\":\"2803\"},\"glyph\":{\"id\":\"2804\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2805\"},\"selection_glyph\":null,\"view\":{\"id\":\"2807\"}},\"id\":\"2806\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":3},\"y\":{\"value\":42.0}},\"id\":\"2870\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"2994\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data_source\":{\"id\":\"2828\"},\"glyph\":{\"id\":\"2829\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2830\"},\"selection_glyph\":null,\"view\":{\"id\":\"2832\"}},\"id\":\"2831\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"source\":{\"id\":\"2868\"}},\"id\":\"2872\",\"type\":\"CDSView\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2977\"},\"selection_policy\":{\"id\":\"2976\"}},\"id\":\"2808\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"2995\",\"type\":\"Selection\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":73.67749999999998}},\"id\":\"2809\",\"type\":\"Rect\"},{\"attributes\":{\"data_source\":{\"id\":\"2868\"},\"glyph\":{\"id\":\"2869\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2870\"},\"selection_glyph\":null,\"view\":{\"id\":\"2872\"}},\"id\":\"2871\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"2996\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"2969\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"2997\",\"type\":\"Selection\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"
},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":73.67749999999998}},\"id\":\"2810\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"2968\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"source\":{\"id\":\"2808\"}},\"id\":\"2812\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"2998\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"2966\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"2999\",\"type\":\"Selection\"},{\"attributes\":{\"data_source\":{\"id\":\"2808\"},\"glyph\":{\"id\":\"2809\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2810\"},\"selection_glyph\":null,\"view\":{\"id\":\"2812\"}},\"id\":\"2811\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"2965\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2979\"},\"selection_policy\":{\"id\":\"2978\"}},\"id\":\"2813\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3000\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data_source\":{\"id\":\"2813\"},\"glyph\":{\"id\":\"2814\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2815\"},\"selection_glyph\":null,\"view\":{\"id\":\"2817\"}},\"id\":\"2816\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"2967\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"3001\",\"type\":\"Selection\"},{\"attributes\":{\"x0\":{\"value\":2},\"x1\":{\"value\":2},\"y0\":{\"value\":56.0},\"y1\":{\"value\":44.0}},\"id\":\"2814\",\"type\":\"Segment\"},{\"attributes\":{\"source\":{\"id\":\"2823\"}},\"id\":\"2827\",\"type\":\"CDSView\"},{\"attributes\":{\"text\":\"Node 
algo-1\"},\"id\":\"2756\",\"type\":\"Title\"},{\"attributes\":{\"x0\":{\"value\":2},\"x1\":{\"value\":2},\"y0\":{\"value\":0.0},\"y1\":{\"value\":0.0}},\"id\":\"2819\",\"type\":\"Segment\"},{\"attributes\":{},\"id\":\"2971\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"2964\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"2972\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"ticks\":[0,1,2,3]},\"id\":\"2873\",\"type\":\"FixedTicker\"},{\"attributes\":{\"source\":{\"id\":\"2783\"}},\"id\":\"2787\",\"type\":\"CDSView\"},{\"attributes\":{\"source\":{\"id\":\"2813\"}},\"id\":\"2817\",\"type\":\"CDSView\"},{\"attributes\":{\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":73.67749999999998},\"y1\":{\"value\":53.19499999999999}},\"id\":\"2784\",\"type\":\"Segment\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2981\"},\"selection_policy\":{\"id\":\"2980\"}},\"id\":\"2818\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"2973\",\"type\":\"Selection\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":2},\"x1\":{\"value\":2},\"y0\":{\"value\":56.0},\"y1\":{\"value\":44.0}},\"id\":\"2815\",\"type\":\"Segment\"},{\"attributes\":{},\"id\":\"2974\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"2975\",\"type\":\"Selection\"},{\"attributes\":{\"data_source\":{\"id\":\"2818\"},\"glyph\":{\"id\":\"2819\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2820\"},\"selection_glyph\":null,\"view\":{\"id\":\"2822\"}},\"id\":\"2821\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"bottom\":{\"value\":44.0},\"fill_color\":{\"value\":\"#FDE725\"},\"top\":{\"value\":31.0},\"width\":{\"value\":0.7},\"x\":{\"value\":2}},\"id\":\"2824\",\"type\":\"VBar\"},{\"attributes\":{},\"id\":\"2976\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"2977\",\"type\":\"Selection\"},{\"attributes\":{\"source\":{\"id\":\"2818\"}},\"id\":\"2822\",\"type\":\"CDSView\"},{\"attributes\":{\"data\":{},\"se
lected\":{\"id\":\"2983\"},\"selection_policy\":{\"id\":\"2982\"}},\"id\":\"2823\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"2978\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":2},\"x1\":{\"value\":2},\"y0\":{\"value\":0.0},\"y1\":{\"value\":0.0}},\"id\":\"2820\",\"type\":\"Segment\"},{\"attributes\":{},\"id\":\"2979\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"2980\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"bottom\":{\"value\":44.0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#FDE725\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":31.0},\"width\":{\"value\":0.7},\"x\":{\"value\":2}},\"id\":\"2825\",\"type\":\"VBar\"},{\"attributes\":{},\"id\":\"2981\",\"type\":\"Selection\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2985\"},\"selection_policy\":{\"id\":\"2984\"}},\"id\":\"2828\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"2982\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data_source\":{\"id\":\"2823\"},\"glyph\":{\"id\":\"2824\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2825\"},\"selection_glyph\":null,\"view\":{\"id\":\"2827\"}},\"id\":\"2826\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"2983\",\"type\":\"Selection\"},{\"attributes\":{\"bottom\":{\"value\":31.0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#440154\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":0.0},\"width\":{\"value\":0.7},\"x\":{\"value\":2}},\"id\":\"2830\",\"type\":\"VBar\"},{\"attributes\":{},\"id\":\"2984\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"bottom\":{\"value\":31.0},\"fill_color\":{\"value\":\"#440154\"},\"top\":{\"value\":0.0},\"width\":{\"value\":0.7},\"x\":{\"value\":2}},\"id\":\"2829\",\"type\":\"VBar\"},{\"attributes\":{},\"id\":\"2985\",\"type\":\"Selection\"},{\"attributes\":{\"source\":{\"id\":\"2828\"}},\"id\":\"2832\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"2986\
",\"type\":\"UnionRenderers\"}],\"root_ids\":[\"2755\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"117ff17a-2947-4290-826e-197634ebec4f\",\"root_ids\":[\"2755\"],\"roots\":{\"2755\":\"96873601-e9b1-4aab-b449-d2241271ee78\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "2755" + } + }, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"f1c16297-1b8e-4e80-8946-9fc483660a7a\":{\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"3213\"}],\"center\":[{\"id\":\"3216\"},{\"id\":\"3220\"}],\"left\":[{\"id\":\"3217\"}],\"plot_height\":350,\"plot_width\":1000,\"renderers\":[{\"id\":\"3233\"},{\"id\":\"3238\"},{\"id\":\"3243\"},{\"id\":\"3248\"},{\"id\":\"3253\"},{\"id\":\"3258\"},{\"id\":\"3263\"},{\"id\":\"3268\"},{\"id\":\"3273\"},{\"id\":\"3278\"},{\"id\":\"3283\"},{\"id\":\"3288\"},{\"id\":\"3293\"},{\"id\":\"3298\"},{\"id\":\"3303\"},{\"id\":\"3308\"},{\"id\":\"3313\"},{\"id\":\"3318\"}],\"title\":{\"id\":\"3203\"},\"toolbar\":{\"id\":\"3225\"},\"x_range\":{\"id\":\"3205\"},\"x_scale\":{\"id\":\"3209\"},\"y_range\":{\"id\":\"3207\"},\"y_scale\":{\"id\":\"3211\"}},\"id\":\"3202\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"source\":{\"id\":\"3290\"}},\"id\":\"3294\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"3454\",\"type\":\"Selection\"},{\"attributes\":{\"data_source\":{\"id\":\"3250\"},\"glyph\":{\"id\":\"3251\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3252\"},\"selection_glyph\":null,\"view\":{\"id\":\"3254\"}},\"id\":\"3253\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":2},\"y\":{\"value\":56.0}},\"id\":\"3286\",\"type\":\"Rect\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":3},\"x1\":{\"value\":3},\"y0\":{\"value\":0.0},\"y1\":{\"value\":0.0}},\"id\":\"3297\",\"type\":\"Segment\"},{\"attributes\":{},\"id\":\"3486\",\"type\":\"Selection\"},{\"attributes\":{\"source\":{\"id\":\"3275\"}},\"id\":\"3279\",\"type\":\"CDSView\"},{\"attributes\":{\"source\":{\"
id\":\"3280\"}},\"id\":\"3284\",\"type\":\"CDSView\"},{\"attributes\":{\"x0\":{\"value\":3},\"x1\":{\"value\":3},\"y0\":{\"value\":0.0},\"y1\":{\"value\":0.0}},\"id\":\"3296\",\"type\":\"Segment\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3480\"},\"selection_policy\":{\"id\":\"3479\"}},\"id\":\"3295\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3455\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"3487\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"3457\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":2},\"y\":{\"value\":56.0}},\"id\":\"3287\",\"type\":\"Rect\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":3},\"x1\":{\"value\":3},\"y0\":{\"value\":41.0},\"y1\":{\"value\":33.0}},\"id\":\"3292\",\"type\":\"Segment\"},{\"attributes\":{},\"id\":\"3456\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"3488\",\"type\":\"Selection\"},{\"attributes\":{\"source\":{\"id\":\"3250\"}},\"id\":\"3254\",\"type\":\"CDSView\"},{\"attributes\":{\"data_source\":{\"id\":\"3295\"},\"glyph\":{\"id\":\"3296\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3297\"},\"selection_glyph\":null,\"view\":{\"id\":\"3299\"}},\"id\":\"3298\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3464\"},\"selection_policy\":{\"id\":\"3463\"}},\"id\":\"3255\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3458\",\"type\":\"Selection\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":74.8709375}},\"id\":\"3256\",\"type\":\"Rect\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3476\"},\"selection_policy\":{\"id\":\"3475\
"}},\"id\":\"3285\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3462\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"3224\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"3459\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"source\":{\"id\":\"3285\"}},\"id\":\"3289\",\"type\":\"CDSView\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":74.8709375}},\"id\":\"3257\",\"type\":\"Rect\"},{\"attributes\":{\"bottom\":{\"value\":33.0},\"fill_color\":{\"value\":\"#FDE725\"},\"top\":{\"value\":22.0},\"width\":{\"value\":0.7},\"x\":{\"value\":3}},\"id\":\"3301\",\"type\":\"VBar\"},{\"attributes\":{},\"id\":\"3223\",\"type\":\"ResetTool\"},{\"attributes\":{\"data_source\":{\"id\":\"3285\"},\"glyph\":{\"id\":\"3286\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3287\"},\"selection_glyph\":null,\"view\":{\"id\":\"3289\"}},\"id\":\"3288\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"source\":{\"id\":\"3255\"}},\"id\":\"3259\",\"type\":\"CDSView\"},{\"attributes\":{\"source\":{\"id\":\"3295\"}},\"id\":\"3299\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"3222\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"source\":{\"id\":\"3230\"}},\"id\":\"3234\",\"type\":\"CDSView\"},{\"attributes\":{\"data_source\":{\"id\":\"3255\"},\"glyph\":{\"id\":\"3256\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3257\"},\"selection_glyph\":null,\"view\":{\"id\":\"3259\"}},\"id\":\"3258\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3482\"},\"selection_policy\":{\"id\":\"3481\"}},\"id\":\"3300\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3460\",\"type\":\"Selection\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"act
ive_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"3221\"},{\"id\":\"3222\"},{\"id\":\"3223\"},{\"id\":\"3224\"}]},\"id\":\"3225\",\"type\":\"Toolbar\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3466\"},\"selection_policy\":{\"id\":\"3465\"}},\"id\":\"3260\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"source\":{\"id\":\"3300\"}},\"id\":\"3304\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"3463\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3478\"},\"selection_policy\":{\"id\":\"3477\"}},\"id\":\"3290\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3461\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data_source\":{\"id\":\"3260\"},\"glyph\":{\"id\":\"3261\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3262\"},\"selection_glyph\":null,\"view\":{\"id\":\"3264\"}},\"id\":\"3263\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":18.468437499999997},\"y1\":{\"value\":39.619375}},\"id\":\"3236\",\"type\":\"Segment\"},{\"attributes\":{\"x0\":{\"value\":2},\"x1\":{\"value\":2},\"y0\":{\"value\":56.0},\"y1\":{\"value\":45.0}},\"id\":\"3261\",\"type\":\"Segment\"},{\"attributes\":{\"source\":{\"id\":\"3270\"}},\"id\":\"3274\",\"type\":\"CDSView\"},{\"attributes\":{\"x0\":{\"value\":3},\"x1\":{\"value\":3},\"y0\":{\"value\":41.0},\"y1\":{\"value\":33.0}},\"id\":\"3291\",\"type\":\"Segment\"},{\"attributes\":{\"bottom\":{\"value\":22.0},\"fill_color\":{\"value\":\"#440154\"},\"top\":{\"value\":0.0},\"width\":{\"value\":0.7},\"x\":{\"value\":3}},\"id\":\"3306\",\"type\":\"VBar\"},{\"attributes\":{\"callback\":null},\"id\":\"3221\",\"type\":\"HoverTool\"},{\"attributes\":{\"x0\":{\"value\":2},\"x1\":{\"value\":2},\"y0\":{\"value\":0.0},\"y1\":{\"value\":0.0}},\"id\":\"3266\",\"type\":\"Segment\"},{\"attributes\":{},\"id\":\"3464\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"3474\",\"type\":\"Selectio
n\"},{\"attributes\":{\"data_source\":{\"id\":\"3290\"},\"glyph\":{\"id\":\"3291\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3292\"},\"selection_glyph\":null,\"view\":{\"id\":\"3294\"}},\"id\":\"3293\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"bottom\":{\"value\":33.0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#FDE725\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":22.0},\"width\":{\"value\":0.7},\"x\":{\"value\":3}},\"id\":\"3302\",\"type\":\"VBar\"},{\"attributes\":{},\"id\":\"3466\",\"type\":\"Selection\"},{\"attributes\":{\"source\":{\"id\":\"3260\"}},\"id\":\"3264\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"3465\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3456\"},\"selection_policy\":{\"id\":\"3455\"}},\"id\":\"3235\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3468\"},\"selection_policy\":{\"id\":\"3467\"}},\"id\":\"3265\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3468\",\"type\":\"Selection\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":2},\"x1\":{\"value\":2},\"y0\":{\"value\":56.0},\"y1\":{\"value\":45.0}},\"id\":\"3262\",\"type\":\"Segment\"},{\"attributes\":{},\"id\":\"3467\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data_source\":{\"id\":\"3230\"},\"glyph\":{\"id\":\"3231\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3232\"},\"selection_glyph\":null,\"view\":{\"id\":\"3234\"}},\"id\":\"3233\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3484\"},\"selection_policy\":{\"id\":\"3483\"}},\"id\":\"3305\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"source\":{\"id\":\"3305\"}},\"id\":\"3309\",\"type\":\"CDSView\"},{\"attributes\":{\"data_source\":{\"id\":\"3265\"},\"glyph\":{\"id\":\"3266\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3267\"},\"selection_glyph\":null,\"v
iew\":{\"id\":\"3269\"}},\"id\":\"3268\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data_source\":{\"id\":\"3300\"},\"glyph\":{\"id\":\"3301\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3302\"},\"selection_glyph\":null,\"view\":{\"id\":\"3304\"}},\"id\":\"3303\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"3469\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"source\":{\"id\":\"3235\"}},\"id\":\"3239\",\"type\":\"CDSView\"},{\"attributes\":{\"bottom\":{\"value\":45.0},\"fill_color\":{\"value\":\"#FDE725\"},\"top\":{\"value\":31.0},\"width\":{\"value\":0.7},\"x\":{\"value\":2}},\"id\":\"3271\",\"type\":\"VBar\"},{\"attributes\":{\"bottom\":{\"value\":22.0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#440154\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":0.0},\"width\":{\"value\":0.7},\"x\":{\"value\":3}},\"id\":\"3307\",\"type\":\"VBar\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3486\"},\"selection_policy\":{\"id\":\"3485\"}},\"id\":\"3310\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"bottom\":{\"value\":53.72},\"fill_color\":{\"value\":\"#FDE725\"},\"top\":{\"value\":41.208749999999995},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"3241\",\"type\":\"VBar\"},{\"attributes\":{\"source\":{\"id\":\"3265\"}},\"id\":\"3269\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"3471\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data_source\":{\"id\":\"3305\"},\"glyph\":{\"id\":\"3306\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3307\"},\"selection_glyph\":null,\"view\":{\"id\":\"3309\"}},\"id\":\"3308\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data_source\":{\"id\":\"3235\"},\"glyph\":{\"id\":\"3236\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3237\"},\"selection_glyph\":null,\"view\":{\"id\":\"3239\"}},\"id\":\"3238\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3470\"},
\"selection_policy\":{\"id\":\"3469\"}},\"id\":\"3270\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3470\",\"type\":\"Selection\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3462\"},\"selection_policy\":{\"id\":\"3461\"}},\"id\":\"3250\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":3},\"y\":{\"value\":0.0}},\"id\":\"3311\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"3472\",\"type\":\"Selection\"},{\"attributes\":{\"bottom\":{\"value\":53.72},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#FDE725\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":41.208749999999995},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"3242\",\"type\":\"VBar\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":2},\"x1\":{\"value\":2},\"y0\":{\"value\":0.0},\"y1\":{\"value\":0.0}},\"id\":\"3267\",\"type\":\"Segment\"},{\"attributes\":{\"bottom\":{\"value\":41.208749999999995},\"fill_color\":{\"value\":\"#440154\"},\"top\":{\"value\":39.619375},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"3246\",\"type\":\"VBar\"},{\"attributes\":{},\"id\":\"3451\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"source\":{\"id\":\"3245\"}},\"id\":\"3249\",\"type\":\"CDSView\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":3},\"y\":{\"value\":41.0}},\"id\":\"3316\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"3475\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":18.468437499999997},\"y1\":{\"value\":39.619375}},\"id\":\"3237\",\"type\":\"Segment\"},{\"attributes\":{\"bottom\":{\"value\":45.0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#FDE725\"},\"line_alpha\"
:{\"value\":0.1},\"top\":{\"value\":31.0},\"width\":{\"value\":0.7},\"x\":{\"value\":2}},\"id\":\"3272\",\"type\":\"VBar\"},{\"attributes\":{},\"id\":\"3473\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3472\"},\"selection_policy\":{\"id\":\"3471\"}},\"id\":\"3275\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3483\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3458\"},\"selection_policy\":{\"id\":\"3457\"}},\"id\":\"3240\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"source\":{\"id\":\"3310\"}},\"id\":\"3314\",\"type\":\"CDSView\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":3},\"y\":{\"value\":0.0}},\"id\":\"3312\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"3476\",\"type\":\"Selection\"},{\"attributes\":{\"data_source\":{\"id\":\"3270\"},\"glyph\":{\"id\":\"3271\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3272\"},\"selection_glyph\":null,\"view\":{\"id\":\"3274\"}},\"id\":\"3273\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data_source\":{\"id\":\"3310\"},\"glyph\":{\"id\":\"3311\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3312\"},\"selection_glyph\":null,\"view\":{\"id\":\"3314\"}},\"id\":\"3313\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"3478\",\"type\":\"Selection\"},{\"attributes\":{\"source\":{\"id\":\"3240\"}},\"id\":\"3244\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"3477\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"bottom\":{\"value\":31.0},\"fill_color\":{\"value\":\"#440154\"},\"top\":{\"value\":0.0},\"width\":{\"value\":0.7},\"x\":{\"value\":2}},\"id\":\"3276\",\"type\":\"VBar\"},{\"attributes\":{},\"id\":\"3452\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"fill_alpha\":{\"val
ue\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":18.468437499999997}},\"id\":\"3252\",\"type\":\"Rect\"},{\"attributes\":{\"bottom\":{\"value\":41.208749999999995},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#440154\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":39.619375},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"3247\",\"type\":\"VBar\"},{\"attributes\":{\"bottom\":{\"value\":31.0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#440154\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":0.0},\"width\":{\"value\":0.7},\"x\":{\"value\":2}},\"id\":\"3277\",\"type\":\"VBar\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3488\"},\"selection_policy\":{\"id\":\"3487\"}},\"id\":\"3315\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3453\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"3479\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"3480\",\"type\":\"Selection\"},{\"attributes\":{\"data_source\":{\"id\":\"3315\"},\"glyph\":{\"id\":\"3316\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3317\"},\"selection_glyph\":null,\"view\":{\"id\":\"3319\"}},\"id\":\"3318\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":2},\"y\":{\"value\":0.0}},\"id\":\"3281\",\"type\":\"Rect\"},{\"attributes\":{\"data_source\":{\"id\":\"3275\"},\"glyph\":{\"id\":\"3276\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3277\"},\"selection_glyph\":null,\"view\":{\"id\":\"3279\"}},\"id\":\"3278\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3460\"},\"selection_policy\":{\"id\":\"3459\"}},\"id\":\"3245\",\"type\":\"Column
DataSource\"},{\"attributes\":{},\"id\":\"3481\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data_source\":{\"id\":\"3240\"},\"glyph\":{\"id\":\"3241\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3242\"},\"selection_glyph\":null,\"view\":{\"id\":\"3244\"}},\"id\":\"3243\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3474\"},\"selection_policy\":{\"id\":\"3473\"}},\"id\":\"3280\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"source\":{\"id\":\"3315\"}},\"id\":\"3319\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"3484\",\"type\":\"Selection\"},{\"attributes\":{\"data_source\":{\"id\":\"3245\"},\"glyph\":{\"id\":\"3246\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3247\"},\"selection_glyph\":null,\"view\":{\"id\":\"3249\"}},\"id\":\"3248\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":2},\"y\":{\"value\":0.0}},\"id\":\"3282\",\"type\":\"Rect\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":3},\"y\":{\"value\":41.0}},\"id\":\"3317\",\"type\":\"Rect\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":74.8709375},\"y1\":{\"value\":53.72}},\"id\":\"3232\",\"type\":\"Segment\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":18.468437499999997}},\"id\":\"3251\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"3482\",\"type\":\"Selection\"},{\"attributes\":{\"data_source\":{\"id\":\"3280\"},\
"glyph\":{\"id\":\"3281\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3282\"},\"selection_glyph\":null,\"view\":{\"id\":\"3284\"}},\"id\":\"3283\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"3485\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"ticks\":[0,1,2,3]},\"id\":\"3320\",\"type\":\"FixedTicker\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3454\"},\"selection_policy\":{\"id\":\"3453\"}},\"id\":\"3230\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3211\",\"type\":\"LinearScale\"},{\"attributes\":{\"axis_label\":\"Utilization in %\",\"formatter\":{\"id\":\"3451\"},\"ticker\":{\"id\":\"3218\"}},\"id\":\"3217\",\"type\":\"LinearAxis\"},{\"attributes\":{\"axis\":{\"id\":\"3217\"},\"dimension\":1,\"grid_line_color\":\"white\",\"grid_line_width\":0,\"ticker\":null},\"id\":\"3220\",\"type\":\"Grid\"},{\"attributes\":{\"axis\":{\"id\":\"3213\"},\"grid_line_color\":null,\"grid_line_width\":0,\"ticker\":null},\"id\":\"3216\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"3218\",\"type\":\"BasicTicker\"},{\"attributes\":{\"formatter\":{\"id\":\"3452\"},\"major_label_overrides\":{\"1\":\"cpu\",\"2\":\"gpu0\",\"3\":\"gpu0_memory\"},\"major_label_text_font_size\":\"10px\",\"ticker\":{\"id\":\"3320\"}},\"id\":\"3213\",\"type\":\"LinearAxis\"},{\"attributes\":{\"text\":\"Node algo-2\"},\"id\":\"3203\",\"type\":\"Title\"},{\"attributes\":{\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":74.8709375},\"y1\":{\"value\":53.72}},\"id\":\"3231\",\"type\":\"Segment\"},{\"attributes\":{\"end\":20},\"id\":\"3205\",\"type\":\"Range1d\"},{\"attributes\":{},\"id\":\"3209\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"3207\",\"type\":\"DataRange1d\"}],\"root_ids\":[\"3202\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"f1c16297-1b8e-4e80-8946-9fc483660a7a\",\"root_ids\":[\"3202\"],\"roots\":{\"3202\":\"4ae3aade-9cfb-4eb4-af65-e787792a575e\"}}];\n", 
+ " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "3202" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "if analyse_phase == \"training\":\n", + " display(Markdown(\"\"\" ### Batch size\"\"\"))\n", + " report = load_report('BatchSize')\n", + " if report:\n", + " params = report['RuleParameters'].split('\\n')\n", + " cpu_threshold_p95 = int(params[0].split(':')[1])\n", + " gpu_threshold_p95 = int(params[1].split(':')[1])\n", + " gpu_memory_threshold_p95 = int(params[2].split(':')[1])\n", + " patience = int(params[3].split(':')[1])\n", + " window = int(params[4].split(':')[1])\n", + " violations = report['Violations']\n", + " triggered = report['RuleTriggered']\n", + " datapoints = report['Datapoints']\n", + " \n", + " text = Paragraph(text=f\"\"\"The BatchSize rule helps to detect if GPU is underutilized because of the batch size being \n", + " too small. To detect this the rule analyzes the GPU memory footprint, CPU and GPU utilization. The rule checked if the 95th percentile of CPU utilization is below cpu_threshold_p95 of \n", + " {cpu_threshold_p95}%, the 95th percentile of GPU utilization is below gpu_threshold_p95 of {gpu_threshold_p95}% and the 95th percentile of memory footprint \\\n", + " below gpu_memory_threshold_p95 of {gpu_memory_threshold_p95}%. 
In your training job this happened {violations} times. \\\n", + " The rule skipped the first {patience} datapoints. The rule computed the percentiles over window size of {window} continuous datapoints.\\n\n", + " The rule analysed {datapoints} datapoints and triggered {triggered} times.\n", + " \"\"\", width=800)\n", + " show(text)\n", + " if len(report['Details']) >0: \n", + " timestamp = us_since_epoch_to_human_readable_time(report['Details']['last_timestamp'])\n", + " date = datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S:%f')\n", + " day = date.date().strftime(\"%m/%d/%Y\")\n", + " hour = date.time().strftime(\"%H:%M:%S\")\n", + " del report['Details']['last_timestamp']\n", + " text = Paragraph(text=f\"\"\"Your training job is underutilizing the instance. You may want to consider\n", + " either switch to a smaller instance type or to increase the batch size. \n", + " The last time the BatchSize rule triggered in your training job was on {day} at {hour}.\n", + " The following boxplots are a snapshot from the timestamps. 
They the total \n", + " CPU utilization, the GPU utilization, and the GPU memory usage per GPU (without outliers).\"\"\", \n", + " width=800)\n", + " show(text)\n", + "\n", + " for node_id in report['Details']:\n", + " xmax = max(20, len(report['Details'][node_id]))\n", + " \n", + " plot = figure(plot_height=350, \n", + " plot_width=1000,\n", + " toolbar_location='right',\n", + " tools=\"hover,wheel_zoom,reset,pan\", \n", + " title=f\"Node {node_id}\",\n", + " x_range=(0,xmax)\n", + " )\n", + " \n", + " for index, key in enumerate(report['Details'][node_id]):\n", + " upper = report['Details'][node_id][key]['upper']\n", + " lower = report['Details'][node_id][key]['lower']\n", + " p75 = report['Details'][node_id][key]['p75']\n", + " p25 = report['Details'][node_id][key]['p25']\n", + " p50 = report['Details'][node_id][key]['p50']\n", + "\n", + " plot.segment(index+1, upper, index+1, p75, line_color=\"black\")\n", + " plot.segment(index+1, lower, index+1, p25, line_color=\"black\")\n", + "\n", + " plot.vbar(index+1, 0.7, p50, p75, fill_color=\"#FDE725\", line_color=\"black\")\n", + " plot.vbar(index+1, 0.7, p25, p50, fill_color=\"#440154\", line_color=\"black\")\n", + "\n", + " plot.rect(index+1, lower, 0.2, 0.01, line_color=\"black\")\n", + " plot.rect(index+1, upper, 0.2, 0.01, line_color=\"black\")\n", + "\n", + " plot.xaxis.major_label_overrides[index+1] = key\n", + " plot.xgrid.grid_line_color = None\n", + " plot.ygrid.grid_line_color = \"white\"\n", + " plot.grid.grid_line_width = 0\n", + "\n", + " plot.xaxis.major_label_text_font_size=\"10px\"\n", + " plot.xaxis.ticker = np.arange(index+2)\n", + " plot.yaxis.axis_label = \"Utilization in %\"\n", + " show(plot)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:48.828597Z", + "iopub.status.busy": "2023-04-10T21:36:48.813165Z", + "iopub.status.idle": "2023-04-10T21:36:48.926506Z", + "shell.execute_reply": 
"2023-04-10T21:36:48.926901Z" + }, + "papermill": { + "duration": 0.152322, + "end_time": "2023-04-10T21:36:48.927045", + "exception": false, + "start_time": "2023-04-10T21:36:48.774723", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "### CPU bottlenecks\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"c9a03fbd-e579-4074-986c-f349e89816ed\":{\"roots\":{\"references\":[{\"attributes\":{\"text\":\"The CPUBottleneck rule checked when the CPU utilization was above cpu_threshold of 90% \\n and GPU utilization was below gpu_threshold of 10%. \\n During initialization utilization is likely to be zero, so the rule skipped the first 1000 datapoints.\\n With this configuration the rule found 570 CPU bottlenecks which is 16% of the total time. This is below the threshold of 50%\\n The rule analysed 3514 data points and triggered 0 times.\",\"width\":900},\"id\":\"3689\",\"type\":\"Paragraph\"}],\"root_ids\":[\"3689\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"c9a03fbd-e579-4074-986c-f349e89816ed\",\"root_ids\":[\"3689\"],\"roots\":{\"3689\":\"63d29d37-c90b-4218-b637-cbba19f4bda8\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "3689" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "if analyse_phase == \"training\": \n", + " display(Markdown(\"\"\"### CPU bottlenecks\\n\\n\"\"\"))\n", + "\n", + " report = 
load_report('CPUBottleneck')\n", + " if report:\n", + " params = report['RuleParameters'].split('\\n')\n", + " threshold = int(params[0].split(':')[1])\n", + " cpu_threshold = int(params[1].split(':')[1])\n", + " gpu_threshold = int(params[2].split(':')[1])\n", + " patience = int(params[3].split(':')[1])\n", + " violations = report['Violations']\n", + " triggered = report['RuleTriggered']\n", + " datapoints = report['Datapoints']\n", + " \n", + " if report['Violations'] > 0:\n", + " perc = int(report['Violations']/report['Datapoints']*100)\n", + " else:\n", + " perc = 0\n", + " if perc < threshold:\n", + " string = 'below'\n", + " else:\n", + " string = 'above'\n", + " text = f\"\"\"The CPUBottleneck rule checked when the CPU utilization was above cpu_threshold of {cpu_threshold}% \n", + " and GPU utilization was below gpu_threshold of {gpu_threshold}%. \n", + " During initialization utilization is likely to be zero, so the rule skipped the first {patience} datapoints.\n", + " With this configuration the rule found {violations} CPU bottlenecks which is {perc}% of the total time. 
This is {string} the threshold of {threshold}%\n", + " The rule analysed {datapoints} data points and triggered {triggered} times.\"\"\"\n", + " \n", + " paragraph = Paragraph(text=text, width=900)\n", + " show(paragraph)\n", + " if report:\n", + "\n", + " plots = []\n", + " text = \"\"\n", + " if report['RuleTriggered'] > 0:\n", + "\n", + " low_gpu = report['Details']['low_gpu_utilization']\n", + " cpu_bottleneck = {}\n", + " cpu_bottleneck[\"GPU usage above threshold\"] = report[\"Datapoints\"] - report[\"Details\"][\"low_gpu_utilization\"]\n", + " cpu_bottleneck[\"GPU usage below threshold\"] = report[\"Details\"][\"low_gpu_utilization\"] - len(report[\"Details\"])\n", + " cpu_bottleneck[\"Low GPU usage due to CPU bottlenecks\"] = len(report[\"Details\"][\"bottlenecks\"])\n", + "\n", + " n_bottlenecks = round(len(report['Details']['bottlenecks'])/datapoints * 100, 2)\n", + " text = f\"\"\"The following chart (left) shows how many datapoints were below the gpu_threshold of {gpu_threshold}%\n", + " and how many of those datapoints were likely caused by a CPU bottleneck. The rule found {low_gpu} out of {datapoints} datapoints which had a GPU utilization \n", + " below {gpu_threshold}%. Out of those datapoints {n_bottlenecks}% were likely caused by CPU bottlenecks. 
\n", + " \"\"\"\n", + "\n", + " plot = create_piechart(cpu_bottleneck, \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"Low GPU usage caused by CPU bottlenecks\")\n", + "\n", + " plots.append(plot)\n", + "\n", + " if 'phase' in report['Details']:\n", + " text = f\"\"\"{text} The chart (in the middle) shows whether CPU bottlenecks mainly \n", + " happened during train/validation phase.\n", + " \"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['phase'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"The ratio between time spent on TRAIN/EVAL phase\")\n", + " plots.append(plot)\n", + "\n", + " if 'forward_backward' in report['Details'] and len(report['Details']['forward_backward']) > 0:\n", + "\n", + " event = max(report['Details']['forward_backward'], key=report['Details']['forward_backward'].get)\n", + " perc = report['Details']['forward_backward'][event]\n", + "\n", + " text = f\"\"\"{text} The pie charts on the right shows a more detailed breakdown. \n", + " It shows that {int(perc)}% of the training time was spent on event {event}\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['forward_backward'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"The ratio between forward and backward pass\") \n", + " plots.append(plot)\n", + "\n", + " if len(plots) > 0:\n", + " paragraph = Paragraph(text=text, width=900)\n", + " show(column(paragraph, row(plots)))\n", + "\n", + " plots = []\n", + " text = \"\"\n", + " if 'ratio' in report['Details'] and len(report['Details']['ratio']) > 0:\n", + "\n", + " key = list(report['Details']['ratio'].keys())[0]\n", + " ratio = report['Details']['ratio'][key]\n", + "\n", + " text = f\"\"\"The following pie chart shows a breakdown of the CPU/GPU operators that happened during CPU bottlenecks. 
\n", + " It shows that {int(ratio)}% of the training time was spent on executing operators in \"{key}\".\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['ratio'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"The ratio between CPU/GPU operators\")\n", + " plots.append(plot)\n", + "\n", + "\n", + " if 'general' in report['Details'] and len(report['Details']['general']) > 0:\n", + "\n", + " event = max(report['Details']['general'], key=report['Details']['general'].get)\n", + " perc = report['Details']['general'][event]\n", + " \n", + " plot = create_piechart(report['Details']['general'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"General metrics recorded in framework \")\n", + " plots.append(plot)\n", + "\n", + " if len(plots) > 0:\n", + " paragraph = Paragraph(text=text, width=900)\n", + " show(column(paragraph, row(plots)))\n", + "\n", + " plots = []\n", + " text = \"\"\n", + " if 'horovod' in report['Details'] and len(report['Details']['horovod']) > 0:\n", + "\n", + " event = max(report['Details']['horovod'], key=report['Details']['horovod'].get)\n", + " perc = report['Details']['horovod'][event]\n", + " text = f\"\"\"The following pie chart shows a detailed breakdown of the Horovod metrics \n", + " that have been recorded when the CPU bottleneck happened. 
The most expensive function was \n", + " {event} with {int(perc)}%\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['horovod'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"General metrics recorded in framework \")\n", + "\n", + " paragraph = Paragraph(text=text, width=900)\n", + " show(column(paragraph, row(plot)))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:49.023093Z", + "iopub.status.busy": "2023-04-10T21:36:49.022178Z", + "iopub.status.idle": "2023-04-10T21:36:49.064619Z", + "shell.execute_reply": "2023-04-10T21:36:49.065010Z" + }, + "papermill": { + "duration": 0.101149, + "end_time": "2023-04-10T21:36:49.065148", + "exception": false, + "start_time": "2023-04-10T21:36:48.963999", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "### I/O bottlenecks\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"73ab791e-bdaf-458f-9fd9-857dd7c44c5c\":{\"roots\":{\"references\":[{\"attributes\":{\"text\":\"The IOBottleneck rule checked when I/O wait time was above io_threshold of 50% \\n and GPU utilization was below gpu_threshold of 10. During initialization utilization is likely to be zero, so the rule skipped the first 1000 datapoints. \\n With this configuration the rule found 10 I/O bottlenecks which is 0% of the total time. This is below the threshold of 50%.\\n The rule analysed 3514 datapoints and triggered 0 times.\",\"width\":900},\"id\":\"3857\",\"type\":\"Paragraph\"}],\"root_ids\":[\"3857\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"73ab791e-bdaf-458f-9fd9-857dd7c44c5c\",\"root_ids\":[\"3857\"],\"roots\":{\"3857\":\"b2e2d0e8-bf3a-4126-93f4-7266a1aeb562\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "3857" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "if analyse_phase == \"training\": \n", + " display(Markdown(\"\"\"### I/O bottlenecks\\n\\n\"\"\"))\n", + "\n", + " report = 
load_report('IOBottleneck')\n", + " if report:\n", + " params = report['RuleParameters'].split('\\n')\n", + " threshold = int(params[0].split(':')[1])\n", + " io_threshold = int(params[1].split(':')[1])\n", + " gpu_threshold = int(params[2].split(':')[1])\n", + " patience = int(params[3].split(':')[1])\n", + " violations = report['Violations']\n", + " triggered = report['RuleTriggered']\n", + " datapoints = report['Datapoints']\n", + " \n", + " if report['Violations'] > 0:\n", + " perc = int(report['Violations']/report['Datapoints']*100)\n", + " else:\n", + " perc = 0\n", + " if perc < threshold:\n", + " string = 'below'\n", + " else:\n", + " string = 'above'\n", + " text = f\"\"\"The IOBottleneck rule checked when I/O wait time was above io_threshold of {io_threshold}% \n", + " and GPU utilization was below gpu_threshold of {gpu_threshold}. During initialization utilization is likely to be zero, so the rule skipped the first {patience} datapoints. \n", + " With this configuration the rule found {violations} I/O bottlenecks which is {perc}% of the total time. 
This is {string} the threshold of {threshold}%.\n", + " The rule analysed {datapoints} datapoints and triggered {triggered} times.\"\"\"\n", + " paragraph = Paragraph(text=text, width=900)\n", + " show(paragraph)\n", + " \n", + " if report:\n", + "\n", + " plots = []\n", + " text = \"\"\n", + " if report['RuleTriggered'] > 0:\n", + "\n", + " low_gpu = report['Details']['low_gpu_utilization']\n", + " cpu_bottleneck = {}\n", + " cpu_bottleneck[\"GPU usage above threshold\"] = report[\"Datapoints\"] - report[\"Details\"][\"low_gpu_utilization\"]\n", + " cpu_bottleneck[\"GPU usage below threshold\"] = report[\"Details\"][\"low_gpu_utilization\"] - len(report[\"Details\"])\n", + " cpu_bottleneck[\"Low GPU usage due to I/O bottlenecks\"] = len(report[\"Details\"][\"bottlenecks\"])\n", + "\n", + " n_bottlenecks = round(len(report['Details']['bottlenecks'])/datapoints * 100, 2)\n", + " text = f\"\"\"The following chart (left) shows how many datapoints were below the gpu_threshold of {gpu_threshold}%\n", + " and how many of those datapoints were likely caused by a I/O bottleneck. The rule found {low_gpu} out of {datapoints} datapoints which had a GPU utilization \n", + " below {gpu_threshold}%. Out of those datapoints {n_bottlenecks}% were likely caused by I/O bottlenecks. 
\n", + " \"\"\"\n", + "\n", + " plot = create_piechart(cpu_bottleneck, \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"Low GPU usage caused by I/O bottlenecks\")\n", + "\n", + " plots.append(plot)\n", + "\n", + " if 'phase' in report['Details']:\n", + " text = f\"\"\"{text} The chart (in the middle) shows whether I/O bottlenecks mainly happened during the training or validation phase.\n", + " \"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['phase'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"The ratio between the time spent on the TRAIN/EVAL phase\")\n", + " plots.append(plot)\n", + "\n", + " if 'forward_backward' in report['Details'] and len(report['Details']['forward_backward']) > 0:\n", + "\n", + " event = max(report['Details']['forward_backward'], key=report['Details']['forward_backward'].get)\n", + " perc = report['Details']['forward_backward'][event]\n", + "\n", + " text = f\"\"\"{text} The pie charts on the right shows a more detailed breakdown. \n", + " It shows that {int(perc)}% of the training time was spent on event \"{event}\".\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['forward_backward'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"The ratio between forward and backward pass\") \n", + " plots.append(plot)\n", + "\n", + " if len(plots) > 0:\n", + " paragraph = Paragraph(text=text, width=900)\n", + " show(column(paragraph, row(plots)))\n", + "\n", + " plots = []\n", + " text = \"\"\n", + " if 'ratio' in report['Details'] and len(report['Details']['ratio']) > 0:\n", + "\n", + " key = list(report['Details']['ratio'].keys())[0]\n", + " ratio = report['Details']['ratio'][key]\n", + "\n", + " text = f\"\"\"The following pie chart shows a breakdown of the CPU/GPU operators that happened \n", + " during I/O bottlenecks. 
It shows that {int(ratio)}% of the training time was spent on executing operators in \"{key}\".\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['ratio'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"Ratio between CPU/GPU operators\")\n", + " plots.append(plot)\n", + "\n", + "\n", + " if 'general' in report['Details'] and len(report['Details']['general']) > 0:\n", + "\n", + " event = max(report['Details']['general'], key=report['Details']['general'].get)\n", + " perc = report['Details']['general'][event]\n", + "\n", + " plot = create_piechart(report['Details']['general'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"General metrics recorded in framework \")\n", + " plots.append(plot)\n", + "\n", + " if len(plots) > 0:\n", + " paragraph = Paragraph(text=text, width=900)\n", + " show(column(paragraph, row(plots)))\n", + "\n", + " plots = []\n", + " text = \"\"\n", + " if 'horovod' in report['Details'] and len(report['Details']['horovod']) > 0:\n", + "\n", + " event = max(report['Details']['horovod'], key=report['Details']['horovod'].get)\n", + " perc = report['Details']['horovod'][event]\n", + " text = f\"\"\"The following pie chart shows a detailed breakdown of the Horovod metrics that have been\n", + " recorded when I/O bottleneck happened. 
The most expensive function was {event} with {int(perc)}%\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['horovod'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"General metrics recorded in framework \")\n", + "\n", + " paragraph = Paragraph(text=text, width=900)\n", + " show(column(paragraph, row(plot))) \n" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T21:36:49.154349Z", + "iopub.status.busy": "2023-04-10T21:36:49.143844Z", + "iopub.status.idle": "2023-04-10T21:36:49.200195Z", + "shell.execute_reply": "2023-04-10T21:36:49.200588Z" + }, + "papermill": { + "duration": 0.097867, + "end_time": "2023-04-10T21:36:49.200731", + "exception": false, + "start_time": "2023-04-10T21:36:49.102864", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "### GPU memory\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"2a86ae32-b55f-460c-a5b7-101f3eb1d6d6\":{\"roots\":{\"references\":[{\"attributes\":{\"text\":\"The GPUMemoryIncrease rule helps to detect large increase in memory usage on GPUs. \\n The rule checked if the moving average of memory increased by more than 5.0%. \\n So if the moving average increased for instance from 10% to 16.0%, \\n the rule would have triggered. During initialization utilization is likely 0, so the rule skipped the first 1000 datapoints.\\n The moving average was computed on a window size of 10 continuous datapoints. The rule detected 0 violations\\n where the moving average between previous and current time window increased by more than 5.0%.\\n The rule analysed 1751 datapoints and triggered 0 times.\",\"width\":900},\"id\":\"4025\",\"type\":\"Paragraph\"}],\"root_ids\":[\"4025\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"2a86ae32-b55f-460c-a5b7-101f3eb1d6d6\",\"root_ids\":[\"4025\"],\"roots\":{\"4025\":\"c11660c3-f119-4df0-a5ef-6930b3037b81\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": 
"4025" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "if analyse_phase == \"training\":\n", + " display(Markdown(\"\"\"### GPU memory\\n\\n\"\"\"))\n", + " \n", + " report = load_report('GPUMemoryIncrease')\n", + " if report:\n", + " params = report['RuleParameters'].split('\\n')\n", + " increase = float(params[0].split(':')[1])\n", + " patience = params[1].split(':')[1]\n", + " window = params[2].split(':')[1]\n", + " violations = report['Violations']\n", + " triggered = report['RuleTriggered']\n", + " datapoints = report['Datapoints']\n", + " \n", + " text=Paragraph(text=f\"\"\"The GPUMemoryIncrease rule helps to detect large increase in memory usage on GPUs. \n", + " The rule checked if the moving average of memory increased by more than {increase}%. \n", + " So if the moving average increased for instance from 10% to {11+increase}%, \n", + " the rule would have triggered. During initialization utilization is likely 0, so the rule skipped the first {patience} datapoints.\n", + " The moving average was computed on a window size of {window} continuous datapoints. The rule detected {violations} violations\n", + " where the moving average between previous and current time window increased by more than {increase}%.\n", + " The rule analysed {datapoints} datapoints and triggered {triggered} times.\"\"\",\n", + " width=900)\n", + " show(text)\n", + "\n", + " if len(report['Details']) > 0:\n", + " \n", + " timestamp = us_since_epoch_to_human_readable_time(report['Details']['last_timestamp'])\n", + " date = datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S:%f')\n", + " day = date.date().strftime(\"%m/%d/%Y\")\n", + " hour = date.time().strftime(\"%H:%M:%S\")\n", + " text = Paragraph(text=f\"\"\"Your training job triggered memory spikes. \n", + " The last time the GPUMemoryIncrease rule triggered in your training job was on {day} at {hour}.\n", + " The following boxplots are a snapshot from the timestamps. 
They show for each node and GPU the corresponding\n", + " memory utilization (without outliers).\"\"\", width=900)\n", + " show(text)\n", + " \n", + " del report['Details']['last_timestamp']\n", + " \n", + " for node_id in report['Details']:\n", + " \n", + " plot = figure(plot_height=350, \n", + " plot_width=1000,\n", + " toolbar_location='right',\n", + " tools=\"hover,wheel_zoom,reset,pan\", \n", + " title=f\"Node {node_id}\",\n", + " x_range=(0,17),\n", + " )\n", + "\n", + " for index, key in enumerate(report['Details'][node_id]):\n", + " display(Markdown(f\"\"\"**Memory utilization of {key} on node {node_id}:**\"\"\"))\n", + " text = \"\"\n", + " gpu_max = report['Details'][node_id][key]['gpu_max']\n", + " text = f\"\"\"{text} The max memory utilization of {key} on node {node_id} was {gpu_max}%.\"\"\"\n", + " \n", + " p_95 = int(report['Details'][node_id][key]['p95'])\n", + " p_5 = report['Details'][node_id][key]['p05']\n", + " if p_95 < int(50): \n", + " text = f\"\"\"{text} The 95th percentile was only {p_95}%.\"\"\"\n", + " if p_5 < int(5): \n", + " text = f\"\"\"{text} The 5th percentile was only {p_5}%.\"\"\"\n", + " if p_95 - p_5 > 50:\n", + " text = f\"\"\"{text} The difference between 5th percentile {p_5}% and 95th percentile {p_95}% is quite \n", + " significant, which means that memory utilization on {key} is fluctuating quite a lot.\"\"\"\n", + " \n", + " text = Paragraph(text=f\"\"\"{text}\"\"\", width=900)\n", + " show(text)\n", + " \n", + " upper = report['Details'][node_id][key]['upper']\n", + " lower = report['Details'][node_id][key]['lower']\n", + " p75 = report['Details'][node_id][key]['p75']\n", + " p25 = report['Details'][node_id][key]['p25']\n", + " p50 = report['Details'][node_id][key]['p50']\n", + "\n", + " plot.segment(index+1, upper, index+1, p75, line_color=\"black\")\n", + " plot.segment(index+1, lower, index+1, p25, line_color=\"black\")\n", + "\n", + " plot.vbar(index+1, 0.7, p50, p75, fill_color=\"#FDE725\", 
line_color=\"black\")\n", + " plot.vbar(index+1, 0.7, p25, p50, fill_color=\"#440154\", line_color=\"black\")\n", + "\n", + " plot.rect(index+1, lower, 0.2, 0.01, line_color=\"black\")\n", + " plot.rect(index+1, upper, 0.2, 0.01, line_color=\"black\")\n", + "\n", + " plot.xaxis.major_label_overrides[index+1] = key\n", + " plot.xgrid.grid_line_color = None\n", + " plot.ygrid.grid_line_color = \"white\"\n", + " plot.grid.grid_line_width = 0\n", + "\n", + " plot.xaxis.major_label_text_font_size=\"10px\"\n", + " plot.xaxis.ticker = np.arange(index+2)\n", + " plot.yaxis.axis_label = \"Utilization in %\"\n", + " show(plot)" + ] + } + ], + "metadata": { + "celltoolbar": "Tags", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + }, + "papermill": { + "duration": 4.572913, + "end_time": "2023-04-10T21:36:49.545501", + "environment_variables": {}, + "exception": null, + "input_path": "/opt/ml/code/profiler_report.ipynb", + "output_path": "/opt/ml/processing/output/rule/profiler-output/.sagemaker-ignore/out.tmp", + "parameters": { + "processing_job_arn": "arn:aws:sagemaker:us-east-1:598348623909:processing-job/pytorch-training-2023-04-1-profilerreport-5f46c0a2" + }, + "start_time": "2023-04-10T21:36:44.972588", + "version": "2.1.2" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/starter/ProfilerReports/benchmark/profiler-reports/BatchSize.json b/starter/ProfilerReports/benchmark/profiler-reports/BatchSize.json new file mode 100644 index 00000000..b09f6e70 --- /dev/null +++ b/starter/ProfilerReports/benchmark/profiler-reports/BatchSize.json @@ -0,0 +1 @@ +{"RuleTriggered": 14, "Violations": 14, "Details": {"algo-1": {"cpu": {"p25": 39.54, 
"p50": 41.06125, "p75": 53.19499999999999, "p95": 76.699125, "upper": 73.67749999999998, "lower": 19.057500000000008}, "gpu0": {"p25": 0.0, "p50": 31.0, "p75": 44.0, "p95": 55.0, "upper": 56.0, "lower": 0.0}, "gpu0_memory": {"p25": 0.0, "p50": 22.0, "p75": 33.0, "p95": 41.0, "upper": 42.0, "lower": 0.0}}, "last_timestamp": 1681161660000000, "algo-2": {"cpu": {"p25": 39.619375, "p50": 41.208749999999995, "p75": 53.72, "p95": 80.23212499999998, "upper": 74.8709375, "lower": 18.468437499999997}, "gpu0": {"p25": 0.0, "p50": 31.0, "p75": 45.0, "p95": 55.0, "upper": 56.0, "lower": 0.0}, "gpu0_memory": {"p25": 0.0, "p50": 22.0, "p75": 33.0, "p95": 40.0, "upper": 41.0, "lower": 0.0}}}, "Datapoints": 1750, "RuleParameters": "cpu_threshold_p95:70\ngpu_threshold_p95:70\ngpu_memory_threshold_p95:70\npatience:1000\nwindow:500"} \ No newline at end of file diff --git a/starter/ProfilerReports/benchmark/profiler-reports/CPUBottleneck.json b/starter/ProfilerReports/benchmark/profiler-reports/CPUBottleneck.json new file mode 100644 index 00000000..ddbb31e9 --- /dev/null +++ b/starter/ProfilerReports/benchmark/profiler-reports/CPUBottleneck.json @@ -0,0 +1 @@ +{"RuleTriggered": 0, "Violations": 570, "Details": {"low_gpu_utilization": 1240, "bottlenecks": {"1681161684.00058": {"GPUs": 1, "CPUs": 4}, "1681161684.500246": {"GPUs": 1, "CPUs": 4}, "1681161685.000247": {"GPUs": 1, "CPUs": 4}, "1681161685.500223": {"GPUs": 1, "CPUs": 3}, "1681161686.000622": {"GPUs": 1, "CPUs": 4}, "1681161686.501647": {"GPUs": 1, "CPUs": 1}, "1681161687.002286": {"GPUs": 1, "CPUs": 2}, "1681161687.500428": {"GPUs": 1, "CPUs": 4}, "1681161688.001077": {"GPUs": 1, "CPUs": 4}, "1681161688.50541": {"GPUs": 1, "CPUs": 4}, "1681161689.000937": {"GPUs": 1, "CPUs": 4}, "1681161689.501786": {"GPUs": 1, "CPUs": 4}, "1681161690.000342": {"GPUs": 1, "CPUs": 1}, "1681161690.50118": {"GPUs": 1, "CPUs": 3}, "1681161691.000323": {"GPUs": 1, "CPUs": 1}, "1681161691.500261": {"GPUs": 1, "CPUs": 3}, "1681161692.002621": 
{"GPUs": 1, "CPUs": 4}, "1681161692.508483": {"GPUs": 1, "CPUs": 4}, "1681161693.000226": {"GPUs": 1, "CPUs": 2}, "1681161693.501129": {"GPUs": 1, "CPUs": 1}, "1681161694.001182": {"GPUs": 1, "CPUs": 3}, "1681161694.511396": {"GPUs": 1, "CPUs": 4}, "1681161695.000651": {"GPUs": 1, "CPUs": 4}, "1681161695.50138": {"GPUs": 1, "CPUs": 4}, "1681161696.001606": {"GPUs": 1, "CPUs": 4}, "1681161696.501924": {"GPUs": 1, "CPUs": 4}, "1681161697.000248": {"GPUs": 1, "CPUs": 2}, "1681161697.504539": {"GPUs": 1, "CPUs": 1}, "1681161698.001088": {"GPUs": 1, "CPUs": 3}, "1681161698.500271": {"GPUs": 1, "CPUs": 4}, "1681161699.000195": {"GPUs": 1, "CPUs": 3}, "1681161699.500236": {"GPUs": 1, "CPUs": 3}, "1681161700.503821": {"GPUs": 1, "CPUs": 1}, "1681161705.501034": {"GPUs": 1, "CPUs": 4}, "1681161706.003806": {"GPUs": 1, "CPUs": 4}, "1681161706.502841": {"GPUs": 1, "CPUs": 4}, "1681161707.003824": {"GPUs": 1, "CPUs": 4}, "1681161707.530353": {"GPUs": 1, "CPUs": 4}, "1681161708.000461": {"GPUs": 1, "CPUs": 2}, "1681161708.502519": {"GPUs": 1, "CPUs": 3}, "1681161709.000387": {"GPUs": 1, "CPUs": 4}, "1681161709.504179": {"GPUs": 1, "CPUs": 3}, "1681161710.00075": {"GPUs": 1, "CPUs": 4}, "1681161710.507801": {"GPUs": 1, "CPUs": 3}, "1681161711.012636": {"GPUs": 1, "CPUs": 4}, "1681161711.500718": {"GPUs": 1, "CPUs": 3}, "1681161712.005508": {"GPUs": 1, "CPUs": 4}, "1681161712.500254": {"GPUs": 1, "CPUs": 4}, "1681161713.000856": {"GPUs": 1, "CPUs": 4}, "1681161713.5052": {"GPUs": 1, "CPUs": 4}, "1681161714.000726": {"GPUs": 1, "CPUs": 4}, "1681161714.501522": {"GPUs": 1, "CPUs": 4}, "1681161715.003921": {"GPUs": 1, "CPUs": 4}, "1681161715.503536": {"GPUs": 1, "CPUs": 4}, "1681161716.007702": {"GPUs": 1, "CPUs": 4}, "1681161716.50121": {"GPUs": 1, "CPUs": 4}, "1681161717.000935": {"GPUs": 1, "CPUs": 4}, "1681161717.503839": {"GPUs": 1, "CPUs": 4}, "1681161718.002486": {"GPUs": 1, "CPUs": 4}, "1681161718.500243": {"GPUs": 1, "CPUs": 4}, "1681161719.00405": {"GPUs": 1, "CPUs": 4}, 
"1681161685.000608": {"GPUs": 1, "CPUs": 3}, "1681161685.500983": {"GPUs": 1, "CPUs": 4}, "1681161686.000343": {"GPUs": 1, "CPUs": 4}, "1681161686.500354": {"GPUs": 1, "CPUs": 3}, "1681161687.002985": {"GPUs": 1, "CPUs": 2}, "1681161687.502918": {"GPUs": 1, "CPUs": 2}, "1681161688.001135": {"GPUs": 1, "CPUs": 3}, "1681161688.50079": {"GPUs": 1, "CPUs": 2}, "1681161689.000689": {"GPUs": 1, "CPUs": 4}, "1681161689.502951": {"GPUs": 1, "CPUs": 4}, "1681161690.002932": {"GPUs": 1, "CPUs": 4}, "1681161690.500367": {"GPUs": 1, "CPUs": 4}, "1681161691.000329": {"GPUs": 1, "CPUs": 2}, "1681161691.500365": {"GPUs": 1, "CPUs": 2}, "1681161692.000368": {"GPUs": 1, "CPUs": 4}, "1681161692.500328": {"GPUs": 1, "CPUs": 2}, "1681161694.000353": {"GPUs": 1, "CPUs": 4}, "1681161694.500424": {"GPUs": 1, "CPUs": 4}, "1681161695.001399": {"GPUs": 1, "CPUs": 3}, "1681161695.500361": {"GPUs": 1, "CPUs": 2}, "1681161696.000362": {"GPUs": 1, "CPUs": 1}, "1681161696.50198": {"GPUs": 1, "CPUs": 4}, "1681161697.000885": {"GPUs": 1, "CPUs": 4}, "1681161697.501712": {"GPUs": 1, "CPUs": 4}, "1681161698.008304": {"GPUs": 1, "CPUs": 4}, "1681161698.500393": {"GPUs": 1, "CPUs": 4}, "1681161699.002915": {"GPUs": 1, "CPUs": 4}, "1681161699.500356": {"GPUs": 1, "CPUs": 1}, "1681161701.001218": {"GPUs": 1, "CPUs": 4}, "1681161702.000356": {"GPUs": 1, "CPUs": 3}, "1681161702.500382": {"GPUs": 1, "CPUs": 1}, "1681161708.500618": {"GPUs": 1, "CPUs": 2}, "1681161709.501533": {"GPUs": 1, "CPUs": 4}, "1681161710.000865": {"GPUs": 1, "CPUs": 4}, "1681161710.501209": {"GPUs": 1, "CPUs": 4}, "1681161711.000552": {"GPUs": 1, "CPUs": 4}, "1681161711.502921": {"GPUs": 1, "CPUs": 4}, "1681161712.001909": {"GPUs": 1, "CPUs": 2}, "1681161712.500478": {"GPUs": 1, "CPUs": 4}, "1681161713.002918": {"GPUs": 1, "CPUs": 4}, "1681161713.500372": {"GPUs": 1, "CPUs": 2}, "1681161714.00294": {"GPUs": 1, "CPUs": 1}, "1681161714.50037": {"GPUs": 1, "CPUs": 3}, "1681161715.001657": {"GPUs": 1, "CPUs": 4}, "1681161715.500335": 
{"GPUs": 1, "CPUs": 4}, "1681161716.001207": {"GPUs": 1, "CPUs": 3}, "1681161716.500674": {"GPUs": 1, "CPUs": 4}, "1681161717.006145": {"GPUs": 1, "CPUs": 4}, "1681161717.501286": {"GPUs": 1, "CPUs": 4}, "1681161718.000469": {"GPUs": 1, "CPUs": 4}, "1681161718.500812": {"GPUs": 1, "CPUs": 4}, "1681161719.000602": {"GPUs": 1, "CPUs": 4}, "1681161719.517834": {"GPUs": 1, "CPUs": 4}, "1681161741.50023": {"GPUs": 1, "CPUs": 1}, "1681161745.000702": {"GPUs": 1, "CPUs": 1}, "1681161746.00023": {"GPUs": 1, "CPUs": 1}, "1681161748.500304": {"GPUs": 1, "CPUs": 1}, "1681161749.001052": {"GPUs": 1, "CPUs": 1}, "1681161760.000246": {"GPUs": 1, "CPUs": 1}, "1681161761.000228": {"GPUs": 1, "CPUs": 1}, "1681161763.500224": {"GPUs": 1, "CPUs": 1}, "1681161769.500269": {"GPUs": 1, "CPUs": 1}, "1681161770.000233": {"GPUs": 1, "CPUs": 1}, "1681161770.500245": {"GPUs": 1, "CPUs": 1}, "1681161772.00026": {"GPUs": 1, "CPUs": 1}, "1681161720.002921": {"GPUs": 1, "CPUs": 4}, "1681161720.502803": {"GPUs": 1, "CPUs": 4}, "1681161721.000468": {"GPUs": 1, "CPUs": 4}, "1681161721.500594": {"GPUs": 1, "CPUs": 4}, "1681161722.001757": {"GPUs": 1, "CPUs": 4}, "1681161722.500479": {"GPUs": 1, "CPUs": 4}, "1681161723.000407": {"GPUs": 1, "CPUs": 4}, "1681161723.500401": {"GPUs": 1, "CPUs": 4}, "1681161724.005391": {"GPUs": 1, "CPUs": 4}, "1681161724.501614": {"GPUs": 1, "CPUs": 4}, "1681161725.000341": {"GPUs": 1, "CPUs": 4}, "1681161725.500361": {"GPUs": 1, "CPUs": 4}, "1681161726.000321": {"GPUs": 1, "CPUs": 4}, "1681161726.500344": {"GPUs": 1, "CPUs": 3}, "1681161748.50122": {"GPUs": 1, "CPUs": 1}, "1681161749.00034": {"GPUs": 1, "CPUs": 1}, "1681161755.002934": {"GPUs": 1, "CPUs": 1}, "1681161762.00035": {"GPUs": 1, "CPUs": 1}, "1681161782.000258": {"GPUs": 1, "CPUs": 1}, "1681161784.500261": {"GPUs": 1, "CPUs": 1}, "1681161785.000229": {"GPUs": 1, "CPUs": 1}, "1681161815.000243": {"GPUs": 1, "CPUs": 1}, "1681161784.500345": {"GPUs": 1, "CPUs": 1}, "1681161787.501284": {"GPUs": 1, "CPUs": 1}, 
"1681161842.500236": {"GPUs": 1, "CPUs": 1}, "1681161884.500234": {"GPUs": 1, "CPUs": 1}, "1681161885.50023": {"GPUs": 1, "CPUs": 1}, "1681161887.000214": {"GPUs": 1, "CPUs": 1}, "1681161887.500225": {"GPUs": 1, "CPUs": 1}, "1681161888.000235": {"GPUs": 1, "CPUs": 1}, "1681161888.500223": {"GPUs": 1, "CPUs": 1}, "1681161889.000243": {"GPUs": 1, "CPUs": 1}, "1681161890.000175": {"GPUs": 1, "CPUs": 1}, "1681161891.000225": {"GPUs": 1, "CPUs": 1}, "1681161891.500241": {"GPUs": 1, "CPUs": 1}, "1681161892.000207": {"GPUs": 1, "CPUs": 1}, "1681161892.500263": {"GPUs": 1, "CPUs": 1}, "1681161893.000226": {"GPUs": 1, "CPUs": 1}, "1681161894.000237": {"GPUs": 1, "CPUs": 1}, "1681161894.500184": {"GPUs": 1, "CPUs": 1}, "1681161895.00025": {"GPUs": 1, "CPUs": 1}, "1681161896.000243": {"GPUs": 1, "CPUs": 1}, "1681161896.500239": {"GPUs": 1, "CPUs": 1}, "1681161897.000634": {"GPUs": 1, "CPUs": 1}, "1681161897.50025": {"GPUs": 1, "CPUs": 1}, "1681161842.000345": {"GPUs": 1, "CPUs": 1}, "1681161854.501045": {"GPUs": 1, "CPUs": 1}, "1681161863.50035": {"GPUs": 1, "CPUs": 1}, "1681161878.000332": {"GPUs": 1, "CPUs": 1}, "1681161879.500338": {"GPUs": 1, "CPUs": 1}, "1681161880.000394": {"GPUs": 1, "CPUs": 1}, "1681161880.500323": {"GPUs": 1, "CPUs": 1}, "1681161881.000376": {"GPUs": 1, "CPUs": 1}, "1681161881.500321": {"GPUs": 1, "CPUs": 1}, "1681161882.500332": {"GPUs": 1, "CPUs": 1}, "1681161883.000381": {"GPUs": 1, "CPUs": 1}, "1681161883.500349": {"GPUs": 1, "CPUs": 1}, "1681161884.500331": {"GPUs": 1, "CPUs": 1}, "1681161885.000345": {"GPUs": 1, "CPUs": 1}, "1681161886.000338": {"GPUs": 1, "CPUs": 1}, "1681161886.500355": {"GPUs": 1, "CPUs": 1}, "1681161887.500324": {"GPUs": 1, "CPUs": 1}, "1681161888.000367": {"GPUs": 1, "CPUs": 1}, "1681161888.500325": {"GPUs": 1, "CPUs": 1}, "1681161889.500358": {"GPUs": 1, "CPUs": 1}, "1681161890.500378": {"GPUs": 1, "CPUs": 1}, "1681161891.000361": {"GPUs": 1, "CPUs": 1}, "1681161891.500354": {"GPUs": 1, "CPUs": 1}, "1681161892.000327": 
{"GPUs": 1, "CPUs": 1}, "1681161892.500361": {"GPUs": 1, "CPUs": 1}, "1681161893.000337": {"GPUs": 1, "CPUs": 1}, "1681161893.500518": {"GPUs": 1, "CPUs": 1}, "1681162463.000295": {"GPUs": 1, "CPUs": 2}, "1681162463.500276": {"GPUs": 1, "CPUs": 2}, "1681162464.000269": {"GPUs": 1, "CPUs": 1}, "1681162464.500239": {"GPUs": 1, "CPUs": 2}, "1681162465.001072": {"GPUs": 1, "CPUs": 2}, "1681162466.001192": {"GPUs": 1, "CPUs": 2}, "1681162466.500673": {"GPUs": 1, "CPUs": 2}, "1681162467.000256": {"GPUs": 1, "CPUs": 1}, "1681162467.500248": {"GPUs": 1, "CPUs": 2}, "1681162468.003889": {"GPUs": 1, "CPUs": 2}, "1681162468.500196": {"GPUs": 1, "CPUs": 1}, "1681162469.000308": {"GPUs": 1, "CPUs": 2}, "1681162469.50031": {"GPUs": 1, "CPUs": 1}, "1681162470.000266": {"GPUs": 1, "CPUs": 1}, "1681162470.501642": {"GPUs": 1, "CPUs": 2}, "1681162471.000324": {"GPUs": 1, "CPUs": 1}, "1681162471.500298": {"GPUs": 1, "CPUs": 2}, "1681162472.000702": {"GPUs": 1, "CPUs": 2}, "1681162472.500216": {"GPUs": 1, "CPUs": 1}, "1681162473.00022": {"GPUs": 1, "CPUs": 1}, "1681162473.501255": {"GPUs": 1, "CPUs": 2}, "1681162474.00029": {"GPUs": 1, "CPUs": 1}, "1681162474.500268": {"GPUs": 1, "CPUs": 2}, "1681162475.000234": {"GPUs": 1, "CPUs": 2}, "1681162475.501079": {"GPUs": 1, "CPUs": 2}, "1681162476.000542": {"GPUs": 1, "CPUs": 2}, "1681162477.00027": {"GPUs": 1, "CPUs": 2}, "1681162477.500238": {"GPUs": 1, "CPUs": 2}, "1681162478.501719": {"GPUs": 1, "CPUs": 2}, "1681162479.000234": {"GPUs": 1, "CPUs": 2}, "1681162479.500257": {"GPUs": 1, "CPUs": 1}, "1681162480.00023": {"GPUs": 1, "CPUs": 2}, "1681162480.500608": {"GPUs": 1, "CPUs": 2}, "1681162481.001137": {"GPUs": 1, "CPUs": 2}, "1681162481.500226": {"GPUs": 1, "CPUs": 2}, "1681162482.000248": {"GPUs": 1, "CPUs": 2}, "1681162482.500249": {"GPUs": 1, "CPUs": 2}, "1681162483.00034": {"GPUs": 1, "CPUs": 1}, "1681162484.001272": {"GPUs": 1, "CPUs": 2}, "1681162484.500294": {"GPUs": 1, "CPUs": 1}, "1681162485.000259": {"GPUs": 1, "CPUs": 2}, 
"1681162485.500238": {"GPUs": 1, "CPUs": 2}, "1681162486.000272": {"GPUs": 1, "CPUs": 1}, "1681162486.500246": {"GPUs": 1, "CPUs": 2}, "1681162487.000293": {"GPUs": 1, "CPUs": 1}, "1681162487.500226": {"GPUs": 1, "CPUs": 1}, "1681162488.000869": {"GPUs": 1, "CPUs": 2}, "1681162488.500268": {"GPUs": 1, "CPUs": 2}, "1681162489.000231": {"GPUs": 1, "CPUs": 2}, "1681162489.500242": {"GPUs": 1, "CPUs": 2}, "1681162490.000238": {"GPUs": 1, "CPUs": 2}, "1681162490.500264": {"GPUs": 1, "CPUs": 2}, "1681162491.000236": {"GPUs": 1, "CPUs": 1}, "1681162491.500251": {"GPUs": 1, "CPUs": 2}, "1681162492.001093": {"GPUs": 1, "CPUs": 1}, "1681162492.500944": {"GPUs": 1, "CPUs": 2}, "1681162493.000249": {"GPUs": 1, "CPUs": 1}, "1681162493.500231": {"GPUs": 1, "CPUs": 2}, "1681162494.001202": {"GPUs": 1, "CPUs": 2}, "1681162494.500253": {"GPUs": 1, "CPUs": 1}, "1681162495.00026": {"GPUs": 1, "CPUs": 2}, "1681162495.500226": {"GPUs": 1, "CPUs": 1}, "1681162496.000825": {"GPUs": 1, "CPUs": 1}, "1681162496.500236": {"GPUs": 1, "CPUs": 2}, "1681162497.00111": {"GPUs": 1, "CPUs": 1}, "1681162497.500242": {"GPUs": 1, "CPUs": 2}, "1681162498.000247": {"GPUs": 1, "CPUs": 2}, "1681162498.501472": {"GPUs": 1, "CPUs": 2}, "1681162499.000834": {"GPUs": 1, "CPUs": 2}, "1681162499.500289": {"GPUs": 1, "CPUs": 2}, "1681162461.500375": {"GPUs": 1, "CPUs": 1}, "1681162462.000354": {"GPUs": 1, "CPUs": 2}, "1681162462.500369": {"GPUs": 1, "CPUs": 1}, "1681162464.001277": {"GPUs": 1, "CPUs": 2}, "1681162464.500341": {"GPUs": 1, "CPUs": 1}, "1681162465.000361": {"GPUs": 1, "CPUs": 2}, "1681162465.500376": {"GPUs": 1, "CPUs": 2}, "1681162466.00034": {"GPUs": 1, "CPUs": 1}, "1681162466.500374": {"GPUs": 1, "CPUs": 2}, "1681162467.001276": {"GPUs": 1, "CPUs": 2}, "1681162467.500373": {"GPUs": 1, "CPUs": 1}, "1681162468.00035": {"GPUs": 1, "CPUs": 2}, "1681162468.500352": {"GPUs": 1, "CPUs": 2}, "1681162469.000334": {"GPUs": 1, "CPUs": 2}, "1681162469.50034": {"GPUs": 1, "CPUs": 1}, "1681162470.001205": 
{"GPUs": 1, "CPUs": 2}, "1681162470.500335": {"GPUs": 1, "CPUs": 1}, "1681162471.00034": {"GPUs": 1, "CPUs": 2}, "1681162471.500314": {"GPUs": 1, "CPUs": 2}, "1681162472.000282": {"GPUs": 1, "CPUs": 2}, "1681162472.500364": {"GPUs": 1, "CPUs": 1}, "1681162473.000317": {"GPUs": 1, "CPUs": 2}, "1681162473.500364": {"GPUs": 1, "CPUs": 1}, "1681162474.000388": {"GPUs": 1, "CPUs": 2}, "1681162474.5004": {"GPUs": 1, "CPUs": 2}, "1681162475.001329": {"GPUs": 1, "CPUs": 1}, "1681162475.500331": {"GPUs": 1, "CPUs": 1}, "1681162476.000341": {"GPUs": 1, "CPUs": 2}, "1681162476.500339": {"GPUs": 1, "CPUs": 1}, "1681162477.00095": {"GPUs": 1, "CPUs": 2}, "1681162477.500323": {"GPUs": 1, "CPUs": 2}, "1681162478.000379": {"GPUs": 1, "CPUs": 1}, "1681162478.500394": {"GPUs": 1, "CPUs": 1}, "1681162479.000376": {"GPUs": 1, "CPUs": 2}, "1681162479.50035": {"GPUs": 1, "CPUs": 1}, "1681162480.000378": {"GPUs": 1, "CPUs": 1}, "1681162480.502916": {"GPUs": 1, "CPUs": 2}, "1681162481.001174": {"GPUs": 1, "CPUs": 1}, "1681162481.500383": {"GPUs": 1, "CPUs": 2}, "1681162482.000378": {"GPUs": 1, "CPUs": 2}, "1681162482.500353": {"GPUs": 1, "CPUs": 2}, "1681162483.001223": {"GPUs": 1, "CPUs": 2}, "1681162483.500354": {"GPUs": 1, "CPUs": 2}, "1681162484.001225": {"GPUs": 1, "CPUs": 1}, "1681162484.500316": {"GPUs": 1, "CPUs": 2}, "1681162485.000296": {"GPUs": 1, "CPUs": 2}, "1681162485.500321": {"GPUs": 1, "CPUs": 2}, "1681162486.00089": {"GPUs": 1, "CPUs": 2}, "1681162486.500353": {"GPUs": 1, "CPUs": 1}, "1681162487.000339": {"GPUs": 1, "CPUs": 2}, "1681162487.50034": {"GPUs": 1, "CPUs": 1}, "1681162488.000379": {"GPUs": 1, "CPUs": 1}, "1681162488.500384": {"GPUs": 1, "CPUs": 1}, "1681162489.500349": {"GPUs": 1, "CPUs": 2}, "1681162490.001188": {"GPUs": 1, "CPUs": 2}, "1681162490.500319": {"GPUs": 1, "CPUs": 2}, "1681162491.001079": {"GPUs": 1, "CPUs": 2}, "1681162491.500687": {"GPUs": 1, "CPUs": 2}, "1681162492.000374": {"GPUs": 1, "CPUs": 1}, "1681162492.50042": {"GPUs": 1, "CPUs": 2}, 
"1681162493.000347": {"GPUs": 1, "CPUs": 1}, "1681162493.500326": {"GPUs": 1, "CPUs": 2}, "1681162494.000316": {"GPUs": 1, "CPUs": 2}, "1681162494.500329": {"GPUs": 1, "CPUs": 2}, "1681162495.000357": {"GPUs": 1, "CPUs": 1}, "1681162496.00035": {"GPUs": 1, "CPUs": 2}, "1681162496.501249": {"GPUs": 1, "CPUs": 1}, "1681162497.001293": {"GPUs": 1, "CPUs": 2}, "1681162497.501218": {"GPUs": 1, "CPUs": 2}, "1681162498.000319": {"GPUs": 1, "CPUs": 1}, "1681162498.5004": {"GPUs": 1, "CPUs": 1}, "1681162499.000345": {"GPUs": 1, "CPUs": 1}, "1681162499.500761": {"GPUs": 1, "CPUs": 2}, "1681162500.000207": {"GPUs": 1, "CPUs": 2}, "1681162500.501356": {"GPUs": 1, "CPUs": 2}, "1681162501.000354": {"GPUs": 1, "CPUs": 2}, "1681162501.500275": {"GPUs": 1, "CPUs": 2}, "1681162502.000237": {"GPUs": 1, "CPUs": 2}, "1681162502.500262": {"GPUs": 1, "CPUs": 2}, "1681162503.000217": {"GPUs": 1, "CPUs": 2}, "1681162504.011822": {"GPUs": 1, "CPUs": 1}, "1681162504.500262": {"GPUs": 1, "CPUs": 2}, "1681162505.000237": {"GPUs": 1, "CPUs": 2}, "1681162505.501143": {"GPUs": 1, "CPUs": 2}, "1681162506.000288": {"GPUs": 1, "CPUs": 2}, "1681162506.500264": {"GPUs": 1, "CPUs": 2}, "1681162507.00025": {"GPUs": 1, "CPUs": 2}, "1681162508.001312": {"GPUs": 1, "CPUs": 2}, "1681162508.500953": {"GPUs": 1, "CPUs": 2}, "1681162509.001545": {"GPUs": 1, "CPUs": 1}, "1681162509.500342": {"GPUs": 1, "CPUs": 1}, "1681162510.000662": {"GPUs": 1, "CPUs": 2}, "1681162510.500246": {"GPUs": 1, "CPUs": 2}, "1681162511.000241": {"GPUs": 1, "CPUs": 1}, "1681162512.001111": {"GPUs": 1, "CPUs": 2}, "1681162512.500383": {"GPUs": 1, "CPUs": 1}, "1681162513.000953": {"GPUs": 1, "CPUs": 2}, "1681162513.50026": {"GPUs": 1, "CPUs": 2}, "1681162514.000252": {"GPUs": 1, "CPUs": 2}, "1681162514.500252": {"GPUs": 1, "CPUs": 2}, "1681162515.000289": {"GPUs": 1, "CPUs": 2}, "1681162516.000329": {"GPUs": 1, "CPUs": 2}, "1681162516.500267": {"GPUs": 1, "CPUs": 2}, "1681162517.00028": {"GPUs": 1, "CPUs": 1}, "1681162517.50319": 
{"GPUs": 1, "CPUs": 2}, "1681162518.00024": {"GPUs": 1, "CPUs": 1}, "1681162518.500236": {"GPUs": 1, "CPUs": 1}, "1681162519.0011": {"GPUs": 1, "CPUs": 1}, "1681162519.500244": {"GPUs": 1, "CPUs": 2}, "1681162520.000229": {"GPUs": 1, "CPUs": 1}, "1681162520.501085": {"GPUs": 1, "CPUs": 2}, "1681162521.000228": {"GPUs": 1, "CPUs": 2}, "1681162521.500234": {"GPUs": 1, "CPUs": 2}, "1681162522.000225": {"GPUs": 1, "CPUs": 2}, "1681162522.500216": {"GPUs": 1, "CPUs": 2}, "1681162523.000715": {"GPUs": 1, "CPUs": 2}, "1681162523.500223": {"GPUs": 1, "CPUs": 1}, "1681162524.001114": {"GPUs": 1, "CPUs": 2}, "1681162524.500952": {"GPUs": 1, "CPUs": 1}, "1681162525.000951": {"GPUs": 1, "CPUs": 2}, "1681162525.50026": {"GPUs": 1, "CPUs": 2}, "1681162526.00025": {"GPUs": 1, "CPUs": 2}, "1681162526.500251": {"GPUs": 1, "CPUs": 1}, "1681162527.000223": {"GPUs": 1, "CPUs": 2}, "1681162527.500276": {"GPUs": 1, "CPUs": 1}, "1681162528.00026": {"GPUs": 1, "CPUs": 1}, "1681162528.501143": {"GPUs": 1, "CPUs": 2}, "1681162529.000244": {"GPUs": 1, "CPUs": 2}, "1681162529.500231": {"GPUs": 1, "CPUs": 2}, "1681162530.000208": {"GPUs": 1, "CPUs": 2}, "1681162530.501091": {"GPUs": 1, "CPUs": 2}, "1681162531.000246": {"GPUs": 1, "CPUs": 1}, "1681162531.50164": {"GPUs": 1, "CPUs": 1}, "1681162532.001064": {"GPUs": 1, "CPUs": 1}, "1681162533.000231": {"GPUs": 1, "CPUs": 1}, "1681162533.500903": {"GPUs": 1, "CPUs": 2}, "1681162534.00024": {"GPUs": 1, "CPUs": 2}, "1681162534.500243": {"GPUs": 1, "CPUs": 2}, "1681162535.000299": {"GPUs": 1, "CPUs": 2}, "1681162535.500258": {"GPUs": 1, "CPUs": 2}, "1681162536.001383": {"GPUs": 1, "CPUs": 1}, "1681162536.5003": {"GPUs": 1, "CPUs": 2}, "1681162537.000266": {"GPUs": 1, "CPUs": 2}, "1681162537.50023": {"GPUs": 1, "CPUs": 1}, "1681162538.000963": {"GPUs": 1, "CPUs": 2}, "1681162538.500232": {"GPUs": 1, "CPUs": 2}, "1681162539.000434": {"GPUs": 1, "CPUs": 2}, "1681162539.500246": {"GPUs": 1, "CPUs": 1}, "1681162540.000208": {"GPUs": 1, "CPUs": 2}, 
"1681162540.501331": {"GPUs": 1, "CPUs": 1}, "1681162541.000288": {"GPUs": 1, "CPUs": 2}, "1681162541.500253": {"GPUs": 1, "CPUs": 2}, "1681162542.000208": {"GPUs": 1, "CPUs": 2}, "1681162542.500217": {"GPUs": 1, "CPUs": 1}, "1681162543.001043": {"GPUs": 1, "CPUs": 2}, "1681162543.500243": {"GPUs": 1, "CPUs": 2}, "1681162544.000239": {"GPUs": 1, "CPUs": 2}, "1681162544.500212": {"GPUs": 1, "CPUs": 2}, "1681162545.000206": {"GPUs": 1, "CPUs": 2}, "1681162545.500237": {"GPUs": 1, "CPUs": 2}, "1681162546.000237": {"GPUs": 1, "CPUs": 2}, "1681162546.500273": {"GPUs": 1, "CPUs": 2}, "1681162547.000222": {"GPUs": 1, "CPUs": 2}, "1681162548.000231": {"GPUs": 1, "CPUs": 1}, "1681162548.501002": {"GPUs": 1, "CPUs": 2}, "1681162549.000273": {"GPUs": 1, "CPUs": 1}, "1681162550.000339": {"GPUs": 1, "CPUs": 2}, "1681162550.500296": {"GPUs": 1, "CPUs": 2}, "1681162551.001678": {"GPUs": 1, "CPUs": 1}, "1681162551.500867": {"GPUs": 1, "CPUs": 2}, "1681162552.000288": {"GPUs": 1, "CPUs": 1}, "1681162552.500238": {"GPUs": 1, "CPUs": 1}, "1681162553.000273": {"GPUs": 1, "CPUs": 2}, "1681162553.500681": {"GPUs": 1, "CPUs": 2}, "1681162554.000232": {"GPUs": 1, "CPUs": 2}, "1681162554.50022": {"GPUs": 1, "CPUs": 2}, "1681162555.000204": {"GPUs": 1, "CPUs": 2}, "1681162555.500234": {"GPUs": 1, "CPUs": 1}, "1681162556.000257": {"GPUs": 1, "CPUs": 2}, "1681162556.500243": {"GPUs": 1, "CPUs": 2}, "1681162557.000318": {"GPUs": 1, "CPUs": 1}, "1681162557.500681": {"GPUs": 1, "CPUs": 2}, "1681162558.000241": {"GPUs": 1, "CPUs": 2}, "1681162558.50095": {"GPUs": 1, "CPUs": 1}, "1681162559.001037": {"GPUs": 1, "CPUs": 2}, "1681162559.500263": {"GPUs": 1, "CPUs": 1}, "1681162500.00584": {"GPUs": 1, "CPUs": 2}, "1681162500.501796": {"GPUs": 1, "CPUs": 2}, "1681162501.000438": {"GPUs": 1, "CPUs": 3}, "1681162501.502961": {"GPUs": 1, "CPUs": 2}, "1681162502.002926": {"GPUs": 1, "CPUs": 2}, "1681162502.501217": {"GPUs": 1, "CPUs": 2}, "1681162503.000367": {"GPUs": 1, "CPUs": 2}, "1681162503.50036": 
{"GPUs": 1, "CPUs": 2}, "1681162504.000424": {"GPUs": 1, "CPUs": 2}, "1681162504.500319": {"GPUs": 1, "CPUs": 2}, "1681162505.000988": {"GPUs": 1, "CPUs": 2}, "1681162505.500376": {"GPUs": 1, "CPUs": 2}, "1681162506.000394": {"GPUs": 1, "CPUs": 1}, "1681162506.500381": {"GPUs": 1, "CPUs": 1}, "1681162507.001722": {"GPUs": 1, "CPUs": 2}, "1681162507.500335": {"GPUs": 1, "CPUs": 1}, "1681162508.000397": {"GPUs": 1, "CPUs": 1}, "1681162508.500381": {"GPUs": 1, "CPUs": 2}, "1681162509.000365": {"GPUs": 1, "CPUs": 2}, "1681162509.500361": {"GPUs": 1, "CPUs": 2}, "1681162510.000365": {"GPUs": 1, "CPUs": 2}, "1681162510.500372": {"GPUs": 1, "CPUs": 1}, "1681162511.000321": {"GPUs": 1, "CPUs": 2}, "1681162511.500409": {"GPUs": 1, "CPUs": 1}, "1681162512.000354": {"GPUs": 1, "CPUs": 1}, "1681162512.501287": {"GPUs": 1, "CPUs": 2}, "1681162513.000382": {"GPUs": 1, "CPUs": 2}, "1681162514.000384": {"GPUs": 1, "CPUs": 2}, "1681162514.500345": {"GPUs": 1, "CPUs": 2}, "1681162515.000358": {"GPUs": 1, "CPUs": 1}, "1681162515.500353": {"GPUs": 1, "CPUs": 2}, "1681162516.001601": {"GPUs": 1, "CPUs": 2}, "1681162516.500415": {"GPUs": 1, "CPUs": 1}, "1681162517.000385": {"GPUs": 1, "CPUs": 2}, "1681162517.500338": {"GPUs": 1, "CPUs": 2}, "1681162518.000974": {"GPUs": 1, "CPUs": 2}, "1681162518.500335": {"GPUs": 1, "CPUs": 2}, "1681162519.000345": {"GPUs": 1, "CPUs": 2}, "1681162519.500354": {"GPUs": 1, "CPUs": 2}, "1681162520.000381": {"GPUs": 1, "CPUs": 1}, "1681162520.500347": {"GPUs": 1, "CPUs": 1}, "1681162521.001172": {"GPUs": 1, "CPUs": 1}, "1681162521.500872": {"GPUs": 1, "CPUs": 2}, "1681162522.000351": {"GPUs": 1, "CPUs": 2}, "1681162522.500309": {"GPUs": 1, "CPUs": 2}, "1681162523.000362": {"GPUs": 1, "CPUs": 1}, "1681162523.500352": {"GPUs": 1, "CPUs": 2}, "1681162524.000334": {"GPUs": 1, "CPUs": 2}, "1681162524.500361": {"GPUs": 1, "CPUs": 1}, "1681162525.00037": {"GPUs": 1, "CPUs": 2}, "1681162525.500378": {"GPUs": 1, "CPUs": 2}, "1681162526.000348": {"GPUs": 1, "CPUs": 
1}, "1681162527.000355": {"GPUs": 1, "CPUs": 2}, "1681162527.500365": {"GPUs": 1, "CPUs": 1}, "1681162528.000362": {"GPUs": 1, "CPUs": 2}, "1681162528.500356": {"GPUs": 1, "CPUs": 1}, "1681162529.000364": {"GPUs": 1, "CPUs": 2}, "1681162529.500368": {"GPUs": 1, "CPUs": 2}, "1681162530.000341": {"GPUs": 1, "CPUs": 2}, "1681162530.501194": {"GPUs": 1, "CPUs": 2}, "1681162531.000345": {"GPUs": 1, "CPUs": 2}, "1681162531.501199": {"GPUs": 1, "CPUs": 1}, "1681162532.000352": {"GPUs": 1, "CPUs": 1}, "1681162532.500364": {"GPUs": 1, "CPUs": 2}, "1681162533.000397": {"GPUs": 1, "CPUs": 2}, "1681162533.500353": {"GPUs": 1, "CPUs": 2}, "1681162534.001293": {"GPUs": 1, "CPUs": 2}, "1681162534.50036": {"GPUs": 1, "CPUs": 1}, "1681162535.000366": {"GPUs": 1, "CPUs": 2}, "1681162536.001239": {"GPUs": 1, "CPUs": 1}, "1681162536.500359": {"GPUs": 1, "CPUs": 2}, "1681162537.000358": {"GPUs": 1, "CPUs": 2}, "1681162537.500467": {"GPUs": 1, "CPUs": 2}, "1681162538.000356": {"GPUs": 1, "CPUs": 2}, "1681162538.500387": {"GPUs": 1, "CPUs": 1}, "1681162539.000351": {"GPUs": 1, "CPUs": 2}, "1681162539.501143": {"GPUs": 1, "CPUs": 2}, "1681162540.500395": {"GPUs": 1, "CPUs": 2}, "1681162541.000365": {"GPUs": 1, "CPUs": 2}, "1681162541.500371": {"GPUs": 1, "CPUs": 2}, "1681162542.000359": {"GPUs": 1, "CPUs": 2}, "1681162542.500384": {"GPUs": 1, "CPUs": 1}, "1681162543.000363": {"GPUs": 1, "CPUs": 2}, "1681162543.500335": {"GPUs": 1, "CPUs": 2}, "1681162544.001139": {"GPUs": 1, "CPUs": 2}, "1681162544.500387": {"GPUs": 1, "CPUs": 2}, "1681162545.000375": {"GPUs": 1, "CPUs": 2}, "1681162545.500344": {"GPUs": 1, "CPUs": 1}, "1681162546.000373": {"GPUs": 1, "CPUs": 2}, "1681162546.500343": {"GPUs": 1, "CPUs": 2}, "1681162547.000356": {"GPUs": 1, "CPUs": 2}, "1681162547.500348": {"GPUs": 1, "CPUs": 2}, "1681162548.000346": {"GPUs": 1, "CPUs": 2}, "1681162548.500347": {"GPUs": 1, "CPUs": 1}, "1681162549.004905": {"GPUs": 1, "CPUs": 2}, "1681162549.500338": {"GPUs": 1, "CPUs": 1}, 
"1681162550.000365": {"GPUs": 1, "CPUs": 2}, "1681162550.500356": {"GPUs": 1, "CPUs": 1}, "1681162551.000348": {"GPUs": 1, "CPUs": 1}, "1681162551.500344": {"GPUs": 1, "CPUs": 2}, "1681162552.000345": {"GPUs": 1, "CPUs": 2}, "1681162552.500326": {"GPUs": 1, "CPUs": 2}, "1681162553.000335": {"GPUs": 1, "CPUs": 2}, "1681162553.500324": {"GPUs": 1, "CPUs": 2}, "1681162554.000356": {"GPUs": 1, "CPUs": 2}, "1681162554.500377": {"GPUs": 1, "CPUs": 1}, "1681162555.001655": {"GPUs": 1, "CPUs": 1}, "1681162555.501274": {"GPUs": 1, "CPUs": 2}, "1681162556.000369": {"GPUs": 1, "CPUs": 2}, "1681162556.500404": {"GPUs": 1, "CPUs": 2}, "1681162557.000365": {"GPUs": 1, "CPUs": 2}, "1681162557.500333": {"GPUs": 1, "CPUs": 2}, "1681162558.000359": {"GPUs": 1, "CPUs": 2}, "1681162558.500413": {"GPUs": 1, "CPUs": 2}, "1681162559.00034": {"GPUs": 1, "CPUs": 1}, "1681162559.500333": {"GPUs": 1, "CPUs": 1}}}, "Datapoints": 3514, "RuleParameters": "threshold:50\ncpu_threshold:90\ngpu_threshold:10\npatience:1000"} \ No newline at end of file diff --git a/starter/ProfilerReports/benchmark/profiler-reports/Dataloader.json b/starter/ProfilerReports/benchmark/profiler-reports/Dataloader.json new file mode 100644 index 00000000..7c3a8e1a --- /dev/null +++ b/starter/ProfilerReports/benchmark/profiler-reports/Dataloader.json @@ -0,0 +1 @@ +{"RuleTriggered": 1, "Violations": 11, "Details": {"pin_memory": false, "num_workers": 0, "cores": 4, "dataloaders": 1, "dataloading_time": {"p25": 0.073428, "p50": 0.078887, "p95": 0.09545139999999998, "probs": [13, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 15, 58, 211, 441, 713, 945, 1052, 1046, 898, 836, 687, 526, 340, 223, 129, 80, 54, 31, 13, 9, 10, 14, 11, 2, 4, 2, 0, 0, 2, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], "binedges": [0.016458, 0.019209280000000002, 0.02196056, 0.024711840000000002, 0.02746312, 0.030214400000000002, 
0.032965680000000004, 0.035716960000000006, 0.03846824, 0.04121952000000001, 0.043970800000000004, 0.046722080000000006, 0.04947336000000001, 0.05222464000000001, 0.054975920000000005, 0.057727200000000006, 0.06047848000000001, 0.06322976000000001, 0.06598104000000002, 0.06873232000000001, 0.07148360000000001, 0.07423488, 0.07698616000000001, 0.07973744, 0.08248872000000002, 0.08524000000000001, 0.08799128000000002, 0.09074256000000001, 0.09349384000000001, 0.09624512000000002, 0.09899640000000001, 0.10174768000000002, 0.10449896000000002, 0.10725024000000001, 0.11000152000000002, 0.11275280000000001, 0.11550408000000002, 0.11825536000000002, 0.12100664000000001, 0.12375792000000002, 0.12650920000000002, 0.12926048, 0.13201176000000003, 0.13476304, 0.13751432000000002, 0.14026560000000002, 0.14301688, 0.14576816000000004, 0.14851944000000003, 0.15127072000000003, 0.15402200000000002, 0.15677328000000001, 0.15952456000000004, 0.16227584000000003, 0.16502712000000003, 0.16777840000000002, 0.17052968000000002, 0.17328096000000004, 0.17603224000000003, 0.17878352000000003, 0.18153480000000002, 0.18428608000000002, 0.18703736000000004, 0.18978864000000004, 0.19253992000000003, 0.19529120000000003, 0.19804248000000002, 0.20079376000000004, 0.20354504000000004, 0.20629632000000003, 0.20904760000000003, 0.21179888000000002, 0.21455016000000005, 0.21730144000000004, 0.22005272000000003, 0.22280400000000003, 0.22555528000000002, 0.22830656000000005, 0.23105784000000004, 0.23380912000000004, 0.23656040000000003, 0.23931168000000003, 0.24206296000000005, 0.24481424000000004, 0.24756552000000004, 0.2503168, 0.25306808000000003, 0.25581936000000005, 0.2585706400000001, 0.26132192000000004, 0.2640732, 0.26682448000000003, 0.26957576000000005, 0.2723270400000001, 0.2750783200000001, 0.2778296, 0.28058088000000003, 0.28333216000000006, 0.2860834400000001, 0.2888347200000001, 0.291586]}}, "Datapoints": 8373, "RuleParameters": "min_threshold:70\nmax_threshold:200"} \ No newline at 
end of file diff --git a/starter/ProfilerReports/benchmark/profiler-reports/GPUMemoryIncrease.json b/starter/ProfilerReports/benchmark/profiler-reports/GPUMemoryIncrease.json new file mode 100644 index 00000000..5a1ee6ba --- /dev/null +++ b/starter/ProfilerReports/benchmark/profiler-reports/GPUMemoryIncrease.json @@ -0,0 +1 @@ +{"RuleTriggered": 0, "Violations": 0, "Details": {}, "Datapoints": 1751, "RuleParameters": "increase:5\npatience:1000\nwindow:10"} \ No newline at end of file diff --git a/starter/ProfilerReports/benchmark/profiler-reports/IOBottleneck.json b/starter/ProfilerReports/benchmark/profiler-reports/IOBottleneck.json new file mode 100644 index 00000000..9320b3aa --- /dev/null +++ b/starter/ProfilerReports/benchmark/profiler-reports/IOBottleneck.json @@ -0,0 +1 @@ +{"RuleTriggered": 0, "Violations": 10, "Details": {"low_gpu_utilization": 1240, "bottlenecks": {"1681161733.500243": {"GPUs": 1, "CPUs": 1}, "1681161734.500247": {"GPUs": 1, "CPUs": 1}, "1681161808.00034": {"GPUs": 1, "CPUs": 1}, "1681161873.000234": {"GPUs": 1, "CPUs": 1}, "1681161874.00023": {"GPUs": 1, "CPUs": 1}, "1681161878.500251": {"GPUs": 1, "CPUs": 1}, "1681161880.500233": {"GPUs": 1, "CPUs": 1}, "1681161881.50023": {"GPUs": 1, "CPUs": 1}, "1681161882.000251": {"GPUs": 1, "CPUs": 1}, "1681161878.000332": {"GPUs": 1, "CPUs": 1}}}, "Datapoints": 3514, "RuleParameters": "threshold:50\nio_threshold:50\ngpu_threshold:10\npatience:1000"} \ No newline at end of file diff --git a/starter/ProfilerReports/benchmark/profiler-reports/LoadBalancing.json b/starter/ProfilerReports/benchmark/profiler-reports/LoadBalancing.json new file mode 100644 index 00000000..970b09cb --- /dev/null +++ b/starter/ProfilerReports/benchmark/profiler-reports/LoadBalancing.json @@ -0,0 +1 @@ +{"RuleTriggered": 0, "Violations": 0, "Details": {"algo-1": {"workloads": {"gpu0": [616, 2, 3, 3, 0, 0, 0, 1, 0, 0, 0, 0, 16, 27, 168, 75, 61, 66, 72, 67, 71, 57, 61, 47, 61, 77, 61, 87, 54, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0]}}, "algo-2": {"workloads": {"gpu0": [608, 1, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 17, 9, 171, 81, 70, 78, 53, 56, 59, 67, 62, 69, 83, 79, 50, 63, 68, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}}}, "Datapoints": 1751, "RuleParameters": "threshold:0.2\npatience:1000"} \ No newline at end of file diff --git a/starter/ProfilerReports/benchmark/profiler-reports/LowGPUUtilization.json b/starter/ProfilerReports/benchmark/profiler-reports/LowGPUUtilization.json new file mode 100644 index 00000000..67de6a67 --- /dev/null +++ b/starter/ProfilerReports/benchmark/profiler-reports/LowGPUUtilization.json @@ -0,0 +1 @@ +{"RuleTriggered": 14, "Violations": 14, "Details": {"algo-1": {"gpu0": {"gpu_max": 56.0, "gpu_95": 55.0, "gpu_5": 0.0, "p25": 0.0, "p50": 31.0, "p75": 44.0, "p95": 55.0, "upper": 56.0, "lower": 0.0}}, "last_timestamp": 1681162500000000, "algo-2": {"gpu0": {"gpu_max": 56.0, "gpu_95": 56.0, "gpu_5": 0.0, "p25": 0.0, "p50": 31.0, "p75": 45.0, "p95": 55.0, "upper": 56.0, "lower": 0.0}}}, "Datapoints": 1751, "RuleParameters": "threshold_p95:70\nthreshold_p5:10\nwindow:500\npatience:1000"} \ No newline at end of file diff --git a/starter/ProfilerReports/benchmark/profiler-reports/MaxInitializationTime.json b/starter/ProfilerReports/benchmark/profiler-reports/MaxInitializationTime.json new file mode 100644 index 00000000..317b3215 --- /dev/null +++ b/starter/ProfilerReports/benchmark/profiler-reports/MaxInitializationTime.json @@ -0,0 +1 @@ +{"RuleTriggered": 0, "Violations": 0, "Details": {"step_num": {}, "job_start": 1681161683.500177, "job_end": 1681162559.500965}, "Datapoints": 0, "RuleParameters": "threshold:20"} \ No newline at end of file diff --git a/starter/ProfilerReports/benchmark/profiler-reports/OverallFrameworkMetrics.json b/starter/ProfilerReports/benchmark/profiler-reports/OverallFrameworkMetrics.json new file mode 100644 index 00000000..a5bd66c1 --- /dev/null +++ 
b/starter/ProfilerReports/benchmark/profiler-reports/OverallFrameworkMetrics.json @@ -0,0 +1 @@ +{"RuleTriggered": 0, "Violations": 0, "Details": {"ratio": {}, "phase": {}, "phase_time": {}, "general": {"DataLoaderIterInitialize": 0.0013649255513454284, "DataLoaderIter": 99.99863507444864}}, "Datapoints": 0, "RuleParameters": ""} \ No newline at end of file diff --git a/starter/ProfilerReports/benchmark/profiler-reports/OverallSystemUsage.json b/starter/ProfilerReports/benchmark/profiler-reports/OverallSystemUsage.json new file mode 100644 index 00000000..eed17852 --- /dev/null +++ b/starter/ProfilerReports/benchmark/profiler-reports/OverallSystemUsage.json @@ -0,0 +1 @@ +{"RuleTriggered": 0, "Violations": 0, "Details": {"Network": {"algo-1": {"max": 93836168.6, "p99": 52.28, "p95": 0, "p50": 0, "min": 0}, "algo-2": {"max": 97595785.97, "p99": 0, "p95": 0, "p50": 0, "min": 0}}, "GPU": {"algo-1": {"max": 56.0, "p99": 56.0, "p95": 55.0, "p50": 31.0, "min": 0}, "algo-2": {"max": 56.0, "p99": 56.0, "p95": 55.0, "p50": 31.0, "min": 0}}, "CPU": {"algo-1": {"max": 100.0, "p99": 96.7, "p95": 76.7, "p50": 41.06, "min": 0.49}, "algo-2": {"max": 100.0, "p99": 98.49, "p95": 80.23, "p50": 41.21, "min": 4.6}}, "CPU memory": {"algo-1": {"max": 33.06, "p99": 32.42, "p95": 31.57, "p50": 29.77, "min": 4.4}, "algo-2": {"max": 32.57, "p99": 31.69, "p95": 31.06, "p50": 29.71, "min": 4.46}}, "GPU memory": {"algo-1": {"max": 42.0, "p99": 41.0, "p95": 41.0, "p50": 22.0, "min": 0}, "algo-2": {"max": 41.0, "p99": 41.0, "p95": 40.0, "p50": 22.0, "min": 0}}, "I/O": {"algo-1": {"max": 38.15, "p99": 26.48, "p95": 18.68, "p50": 0, "min": 0}, "algo-2": {"max": 39.47, "p99": 27.54, "p95": 18.51, "p50": 0, "min": 0}}}, "Datapoints": 1752, "RuleParameters": ""} \ No newline at end of file diff --git a/starter/ProfilerReports/benchmark/profiler-reports/StepOutlier.json b/starter/ProfilerReports/benchmark/profiler-reports/StepOutlier.json new file mode 100644 index 00000000..e024a30e --- /dev/null +++ 
b/starter/ProfilerReports/benchmark/profiler-reports/StepOutlier.json @@ -0,0 +1 @@ +{"RuleTriggered": 0, "Violations": 0, "Details": {"step_details": {}}, "Datapoints": 0, "RuleParameters": "threshold:3\nmode:None\nn_outliers:10\nstddev:3"} \ No newline at end of file diff --git a/starter/ProfilerReports/improved/profiler-report.html b/starter/ProfilerReports/improved/profiler-report.html new file mode 100644 index 00000000..f0450c5c --- /dev/null +++ b/starter/ProfilerReports/improved/profiler-report.html @@ -0,0 +1,15030 @@ + + + + + + profiler-report + + + + + + + + + + + + + + + + +
+
+
+
+
+
+
+

+ SageMaker Debugger Profiling Report + + ¶ + +

+

+ SageMaker Debugger auto generated this report. You can generate similar reports on all supported training jobs. The report provides summary of training job, system resource usage statistics, framework metrics, rules summary, and detailed analysis from each rule. The graphs and tables are interactive. +

+

+ + Legal disclaimer: + + This report and any recommendations are provided for informational purposes only and are not definitive. You are responsible for making your own independent assessment of the information. +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+ In [4]: +
+
+
+
+
# Parameters
+processing_job_arn = "arn:aws:sagemaker:us-east-1:598348623909:processing-job/pytorch-training-2023-04-1-profilerreport-644ed05c"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

+ Training job summary + + ¶ + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+

+ System usage statistics + + ¶ + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+

+ Framework metrics summary + + ¶ + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+

+ Rules summary + + ¶ + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

+ The following table shows a profiling summary of the Debugger built-in rules. +The table is sorted by the rules that triggered the most frequently. During your training job, the LowGPUUtilization rule +was the most frequently triggered. It processed 2660 datapoints and was triggered 28 times. +

+
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + Description + + Recommendation + + Number of times rule triggered + + Number of datapoints + + Rule parameters +
+ LowGPUUtilization + + Checks if the GPU utilization is low or fluctuating. This can happen due to bottlenecks, blocking calls for synchronizations, or a small batch size. + + Check if there are bottlenecks, minimize blocking calls, change distributed training strategy, or increase the batch size. + + 28 + + 2660 + + threshold_p95:70 +
+ threshold_p5:10 +
+ window:500 +
+ patience:1000 +
+ BatchSize + + Checks if GPUs are underutilized because the batch size is too small. To detect this problem, the rule analyzes the average GPU memory footprint, the CPU and the GPU utilization. + + The batch size is too small, and GPUs are underutilized. Consider running on a smaller instance type or increasing the batch size. + + 28 + + 2659 + + cpu_threshold_p95:70 +
+ gpu_threshold_p95:70 +
+ gpu_memory_threshold_p95:70 +
+ patience:1000 +
+ window:500 +
+ Dataloader + + Checks how many data loaders are running in parallel and whether the total number is equal the number of available CPU cores. The rule triggers if number is much smaller or larger than the number of available cores. If too small, it might lead to low GPU utilization. If too large, it might impact other compute intensive operations on CPU. + + Change the number of data loader processes. + + 1 + + 13041 + + min_threshold:70 +
+ max_threshold:200 +
+ CPUBottleneck + + Checks if the CPU utilization is high and the GPU utilization is low. It might indicate CPU bottlenecks, where the GPUs are waiting for data to arrive from the CPUs. The rule evaluates the CPU and GPU utilization rates, and triggers the issue if the time spent on the CPU bottlenecks exceeds a threshold percent of the total training time. The default threshold is 50 percent. + + Consider increasing the number of data loaders or applying data pre-fetching. + + 0 + + 5337 + + threshold:50 +
+ cpu_threshold:90 +
+ gpu_threshold:10 +
+ patience:1000 +
+ StepOutlier + + Detects outliers in step duration. The step duration for forward and backward pass should be roughly the same throughout the training. If there are significant outliers, it may indicate a system stall or bottleneck issues. + + Check if there are any bottlenecks (CPU, I/O) correlated to the step outliers. + + 0 + + 0 + + threshold:3 +
+ mode:None +
+ n_outliers:10 +
+ stddev:3 +
+ IOBottleneck + + Checks if the data I/O wait time is high and the GPU utilization is low. It might indicate IO bottlenecks where GPU is waiting for data to arrive from storage. The rule evaluates the I/O and GPU utilization rates and triggers the issue if the time spent on the IO bottlenecks exceeds a threshold percent of the total training time. The default threshold is 50 percent. + + Pre-fetch data or choose different file formats, such as binary formats that improve I/O performance. + + 0 + + 5337 + + threshold:50 +
+ io_threshold:50 +
+ gpu_threshold:10 +
+ patience:1000 +
+ MaxInitializationTime + + Checks if the time spent on initialization exceeds a threshold percent of the total training time. The rule waits until the first step of training loop starts. The initialization can take longer if downloading the entire dataset from Amazon S3 in File mode. The default threshold is 20 minutes. + + Initialization takes too long. If using File mode, consider switching to Pipe mode in case you are using TensorFlow framework. + + 0 + + 0 + + threshold:20 +
+ GPUMemoryIncrease + + Measures the average GPU memory footprint and triggers if there is a large increase. + + Choose a larger instance type with more memory if footprint is close to maximum available memory. + + 0 + + 2660 + + increase:5 +
+ patience:1000 +
+ window:10 +
+ LoadBalancing + + Detects workload balancing issues across GPUs. Workload imbalance can occur in training jobs with data parallelism. The gradients are accumulated on a primary GPU, and this GPU might be overused with regard to other GPUs, resulting in reducing the efficiency of data parallelization. + + Choose a different distributed training strategy or a different distributed training framework. + + 0 + + 2660 + + threshold:0.2 +
+ patience:1000 +
+
+
+
+
+
+
+
+
+
+
+
+
+

+ Analyzing the training loop + + ¶ + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

+ Step duration analysis + + ¶ + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+

+ GPU utilization analysis + + ¶ + +

+
+
+
+
+
+
+

+ + Usage per GPU + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+

+ + GPU utilization of gpu0 on node algo-1: + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+

+ + GPU utilization of gpu0 on node algo-2: + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+

+ + Workload balancing + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+

+ Dataloading analysis + + ¶ + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+

+ Batch size + + ¶ + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+

+ CPU bottlenecks + + ¶ + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+

+ I/O bottlenecks + + ¶ + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+

+ GPU memory + + ¶ + +

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+ +
+ + diff --git a/starter/ProfilerReports/improved/profiler-report.ipynb b/starter/ProfilerReports/improved/profiler-report.ipynb new file mode 100644 index 00000000..513b70a4 --- /dev/null +++ b/starter/ProfilerReports/improved/profiler-report.ipynb @@ -0,0 +1,4055 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.017513, + "end_time": "2023-04-10T22:55:59.992355", + "exception": false, + "start_time": "2023-04-10T22:55:59.974842", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# SageMaker Debugger Profiling Report\n", + "\n", + "SageMaker Debugger auto generated this report. You can generate similar reports on all supported training jobs. The report provides summary of training job, system resource usage statistics, framework metrics, rules summary, and detailed analysis from each rule. The graphs and tables are interactive. \n", + "\n", + "**Legal disclaimer:** This report and any recommendations are provided for informational purposes only and are not definitive. 
You are responsible for making your own independent assessment of the information.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:00.033173Z", + "iopub.status.busy": "2023-04-10T22:56:00.032663Z", + "iopub.status.idle": "2023-04-10T22:56:00.650332Z", + "shell.execute_reply": "2023-04-10T22:56:00.649784Z" + }, + "papermill": { + "duration": 0.641626, + "end_time": "2023-04-10T22:56:00.650456", + "exception": false, + "start_time": "2023-04-10T22:56:00.008830", + "status": "completed" + }, + "tags": [ + "hide-output", + "hide-input" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-04-10 22:56:00.641 ip-10-0-202-33.ec2.internal:623 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: /opt/ml/processing/input/profiler/signals/ProfilerReport\n" + ] + } + ], + "source": [ + "import json\n", + "import pandas as pd\n", + "import glob\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import datetime\n", + "from smdebug.profiler.utils import us_since_epoch_to_human_readable_time, ns_since_epoch_to_human_readable_time\n", + "from smdebug.core.utils import setup_profiler_report\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:00.689819Z", + "iopub.status.busy": "2023-04-10T22:56:00.689338Z", + "iopub.status.idle": "2023-04-10T22:56:00.884447Z", + "shell.execute_reply": "2023-04-10T22:56:00.884831Z" + }, + "papermill": { + "duration": 0.217161, + "end_time": "2023-04-10T22:56:00.884972", + "exception": false, + "start_time": "2023-04-10T22:56:00.667811", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + "(function(root) {\n", + " function now() {\n", + " return new Date();\n", + " }\n", + "\n", + " var force = true;\n", + "\n", + " if (typeof 
root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n", + " root._bokeh_onload_callbacks = [];\n", + " root._bokeh_is_loading = undefined;\n", + " }\n", + "\n", + " var JS_MIME_TYPE = 'application/javascript';\n", + " var HTML_MIME_TYPE = 'text/html';\n", + " var EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n", + " var CLASS_NAME = 'output_bokeh rendered_html';\n", + "\n", + " /**\n", + " * Render data to the DOM node\n", + " */\n", + " function render(props, node) {\n", + " var script = document.createElement(\"script\");\n", + " node.appendChild(script);\n", + " }\n", + "\n", + " /**\n", + " * Handle when an output is cleared or removed\n", + " */\n", + " function handleClearOutput(event, handle) {\n", + " var cell = handle.cell;\n", + "\n", + " var id = cell.output_area._bokeh_element_id;\n", + " var server_id = cell.output_area._bokeh_server_id;\n", + " // Clean up Bokeh references\n", + " if (id != null && id in Bokeh.index) {\n", + " Bokeh.index[id].model.document.clear();\n", + " delete Bokeh.index[id];\n", + " }\n", + "\n", + " if (server_id !== undefined) {\n", + " // Clean up Bokeh references\n", + " var cmd = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n", + " cell.notebook.kernel.execute(cmd, {\n", + " iopub: {\n", + " output: function(msg) {\n", + " var id = msg.content.text.trim();\n", + " if (id in Bokeh.index) {\n", + " Bokeh.index[id].model.document.clear();\n", + " delete Bokeh.index[id];\n", + " }\n", + " }\n", + " }\n", + " });\n", + " // Destroy server and session\n", + " var cmd = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n", + " cell.notebook.kernel.execute(cmd);\n", + " }\n", + " }\n", + "\n", + " /**\n", + " * Handle when a new output is added\n", + " */\n", + " function handleAddOutput(event, handle) {\n", + " var output_area = handle.output_area;\n", + " var output = 
handle.output;\n", + "\n", + " // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n", + " if ((output.output_type != \"display_data\") || (!output.data.hasOwnProperty(EXEC_MIME_TYPE))) {\n", + " return\n", + " }\n", + "\n", + " var toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n", + "\n", + " if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n", + " toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n", + " // store reference to embed id on output_area\n", + " output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n", + " }\n", + " if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n", + " var bk_div = document.createElement(\"div\");\n", + " bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n", + " var script_attrs = bk_div.children[0].attributes;\n", + " for (var i = 0; i < script_attrs.length; i++) {\n", + " toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n", + " toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n", + " }\n", + " // store reference to server id on output_area\n", + " output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n", + " }\n", + " }\n", + "\n", + " function register_renderer(events, OutputArea) {\n", + "\n", + " function append_mime(data, metadata, element) {\n", + " // create a DOM node to render to\n", + " var toinsert = this.create_output_subarea(\n", + " metadata,\n", + " CLASS_NAME,\n", + " EXEC_MIME_TYPE\n", + " );\n", + " this.keyboard_manager.register_events(toinsert);\n", + " // Render to node\n", + " var props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n", + " render(props, toinsert[toinsert.length - 1]);\n", + " element.append(toinsert);\n", + " return toinsert\n", + " }\n", + "\n", + " /* Handle when an output is cleared or removed */\n", + " events.on('clear_output.CodeCell', 
handleClearOutput);\n", + " events.on('delete.Cell', handleClearOutput);\n", + "\n", + " /* Handle when a new output is added */\n", + " events.on('output_added.OutputArea', handleAddOutput);\n", + "\n", + " /**\n", + " * Register the mime type and append_mime function with output_area\n", + " */\n", + " OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n", + " /* Is output safe? */\n", + " safe: true,\n", + " /* Index of renderer in `output_area.display_order` */\n", + " index: 0\n", + " });\n", + " }\n", + "\n", + " // register the mime type if in Jupyter Notebook environment and previously unregistered\n", + " if (root.Jupyter !== undefined) {\n", + " var events = require('base/js/events');\n", + " var OutputArea = require('notebook/js/outputarea').OutputArea;\n", + "\n", + " if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n", + " register_renderer(events, OutputArea);\n", + " }\n", + " }\n", + "\n", + " \n", + " if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n", + " root._bokeh_timeout = Date.now() + 5000;\n", + " root._bokeh_failed_load = false;\n", + " }\n", + "\n", + " var NB_LOAD_WARNING = {'data': {'text/html':\n", + " \"
\\n\"+\n", + " \"

\\n\"+\n", + " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", + " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", + " \"

\\n\"+\n", + " \"
    \\n\"+\n", + " \"
  • re-rerun `output_notebook()` to attempt to load from CDN again, or
  • \\n\"+\n", + " \"
  • use INLINE resources instead, as so:
  • \\n\"+\n", + " \"
\\n\"+\n", + " \"\\n\"+\n", + " \"from bokeh.resources import INLINE\\n\"+\n", + " \"output_notebook(resources=INLINE)\\n\"+\n", + " \"\\n\"+\n", + " \"
\"}};\n", + "\n", + " function display_loaded() {\n", + " var el = document.getElementById(null);\n", + " if (el != null) {\n", + " el.textContent = \"BokehJS is loading...\";\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " if (el != null) {\n", + " el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n", + " }\n", + " } else if (Date.now() < root._bokeh_timeout) {\n", + " setTimeout(display_loaded, 100)\n", + " }\n", + " }\n", + "\n", + "\n", + " function run_callbacks() {\n", + " try {\n", + " root._bokeh_onload_callbacks.forEach(function(callback) {\n", + " if (callback != null)\n", + " callback();\n", + " });\n", + " } finally {\n", + " delete root._bokeh_onload_callbacks\n", + " }\n", + " console.debug(\"Bokeh: all callbacks have finished\");\n", + " }\n", + "\n", + " function load_libs(css_urls, js_urls, callback) {\n", + " if (css_urls == null) css_urls = [];\n", + " if (js_urls == null) js_urls = [];\n", + "\n", + " root._bokeh_onload_callbacks.push(callback);\n", + " if (root._bokeh_is_loading > 0) {\n", + " console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", + " return null;\n", + " }\n", + " if (js_urls == null || js_urls.length === 0) {\n", + " run_callbacks();\n", + " return null;\n", + " }\n", + " console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", + " root._bokeh_is_loading = css_urls.length + js_urls.length;\n", + "\n", + " function on_load() {\n", + " root._bokeh_is_loading--;\n", + " if (root._bokeh_is_loading === 0) {\n", + " console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n", + " run_callbacks()\n", + " }\n", + " }\n", + "\n", + " function on_error() {\n", + " console.error(\"failed to load \" + url);\n", + " }\n", + "\n", + " for (var i = 0; i < css_urls.length; i++) {\n", + " var url = css_urls[i];\n", + " const element = document.createElement(\"link\");\n", + " element.onload = on_load;\n", + " element.onerror = 
on_error;\n", + " element.rel = \"stylesheet\";\n", + " element.type = \"text/css\";\n", + " element.href = url;\n", + " console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n", + " document.body.appendChild(element);\n", + " }\n", + "\n", + " const hashes = {\"https://cdn.bokeh.org/bokeh/release/bokeh-2.2.3.min.js\": \"T2yuo9Oe71Cz/I4X9Ac5+gpEa5a8PpJCDlqKYO0CfAuEszu1JrXLl8YugMqYe3sM\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.2.3.min.js\": \"98GDGJ0kOMCUMUePhksaQ/GYgB3+NH9h996V88sh3aOiUNX3N+fLXAtry6xctSZ6\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.2.3.min.js\": \"89bArO+nlbP3sgakeHjCo1JYxYR5wufVgA3IbUvDY+K7w4zyxJqssu7wVnfeKCq8\"};\n", + "\n", + " for (var i = 0; i < js_urls.length; i++) {\n", + " var url = js_urls[i];\n", + " var element = document.createElement('script');\n", + " element.onload = on_load;\n", + " element.onerror = on_error;\n", + " element.async = false;\n", + " element.src = url;\n", + " if (url in hashes) {\n", + " element.crossOrigin = \"anonymous\";\n", + " element.integrity = \"sha384-\" + hashes[url];\n", + " }\n", + " console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", + " document.head.appendChild(element);\n", + " }\n", + " };\n", + "\n", + " function inject_raw_css(css) {\n", + " const element = document.createElement(\"style\");\n", + " element.appendChild(document.createTextNode(css));\n", + " document.body.appendChild(element);\n", + " }\n", + "\n", + " \n", + " var js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-2.2.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.2.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.2.3.min.js\"];\n", + " var css_urls = [];\n", + " \n", + "\n", + " var inline_js = [\n", + " function(Bokeh) {\n", + " Bokeh.set_log_level(\"info\");\n", + " },\n", + " function(Bokeh) {\n", + " \n", + " \n", + " }\n", + " ];\n", + "\n", + " function run_inline_js() {\n", + " \n", + " if (root.Bokeh !== 
undefined || force === true) {\n", + " \n", + " for (var i = 0; i < inline_js.length; i++) {\n", + " inline_js[i].call(root, root.Bokeh);\n", + " }\n", + " } else if (Date.now() < root._bokeh_timeout) {\n", + " setTimeout(run_inline_js, 100);\n", + " } else if (!root._bokeh_failed_load) {\n", + " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", + " root._bokeh_failed_load = true;\n", + " } else if (force !== true) {\n", + " var cell = $(document.getElementById(null)).parents('.cell').data().cell;\n", + " cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", + " }\n", + "\n", + " }\n", + "\n", + " if (root._bokeh_is_loading === 0) {\n", + " console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", + " run_inline_js();\n", + " } else {\n", + " load_libs(css_urls, js_urls, function() {\n", + " console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n", + " run_inline_js();\n", + " });\n", + " }\n", + "}(window));" + ], + "application/vnd.bokehjs_load.v0+json": "\n(function(root) {\n function now() {\n return new Date();\n }\n\n var force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\n \n\n \n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n var NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"
    \\n\"+\n \"
  • re-rerun `output_notebook()` to attempt to load from CDN again, or
  • \\n\"+\n \"
  • use INLINE resources instead, as so:
  • \\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded() {\n var el = document.getElementById(null);\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error() {\n console.error(\"failed to load \" + url);\n }\n\n for (var i = 0; i < css_urls.length; i++) {\n var url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error;\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n const hashes = {\"https://cdn.bokeh.org/bokeh/release/bokeh-2.2.3.min.js\": \"T2yuo9Oe71Cz/I4X9Ac5+gpEa5a8PpJCDlqKYO0CfAuEszu1JrXLl8YugMqYe3sM\", 
\"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.2.3.min.js\": \"98GDGJ0kOMCUMUePhksaQ/GYgB3+NH9h996V88sh3aOiUNX3N+fLXAtry6xctSZ6\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.2.3.min.js\": \"89bArO+nlbP3sgakeHjCo1JYxYR5wufVgA3IbUvDY+K7w4zyxJqssu7wVnfeKCq8\"};\n\n for (var i = 0; i < js_urls.length; i++) {\n var url = js_urls[i];\n var element = document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error;\n element.async = false;\n element.src = url;\n if (url in hashes) {\n element.crossOrigin = \"anonymous\";\n element.integrity = \"sha384-\" + hashes[url];\n }\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n \n var js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-2.2.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.2.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.2.3.min.js\"];\n var css_urls = [];\n \n\n var inline_js = [\n function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\n function(Bokeh) {\n \n \n }\n ];\n\n function run_inline_js() {\n \n if (root.Bokeh !== undefined || force === true) {\n \n for (var i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n var cell = $(document.getElementById(null)).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n 
run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import bokeh\n", + "from bokeh.io import output_notebook, show\n", + "from bokeh.layouts import column, row\n", + "from bokeh.plotting import figure\n", + "from bokeh.models.widgets import DataTable, DateFormatter, TableColumn\n", + "from bokeh.models import ColumnDataSource, PreText\n", + "from math import pi\n", + "from bokeh.transform import cumsum\n", + "import warnings\n", + "from bokeh.models.widgets import Paragraph\n", + "from bokeh.models import Legend\n", + "from bokeh.util.warnings import BokehDeprecationWarning, BokehUserWarning\n", + "warnings.simplefilter('ignore', BokehDeprecationWarning)\n", + "warnings.simplefilter('ignore', BokehUserWarning)\n", + "\n", + "output_notebook(hide_banner=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:00.923934Z", + "iopub.status.busy": "2023-04-10T22:56:00.923457Z", + "iopub.status.idle": "2023-04-10T22:56:00.925088Z", + "shell.execute_reply": "2023-04-10T22:56:00.925533Z" + }, + "papermill": { + "duration": 0.023132, + "end_time": "2023-04-10T22:56:00.925656", + "exception": false, + "start_time": "2023-04-10T22:56:00.902524", + "status": "completed" + }, + "tags": [ + "parameters", + "hide-input", + "hide-output" + ] + }, + "outputs": [], + "source": [ + "processing_job_arn = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ce883387", + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:00.964114Z", + "iopub.status.busy": "2023-04-10T22:56:00.963646Z", + "iopub.status.idle": "2023-04-10T22:56:00.965479Z", + "shell.execute_reply": "2023-04-10T22:56:00.965871Z" + }, + "papermill": { + "duration": 0.023086, + 
"end_time": "2023-04-10T22:56:00.966000", + "exception": false, + "start_time": "2023-04-10T22:56:00.942914", + "status": "completed" + }, + "tags": [ + "injected-parameters" + ] + }, + "outputs": [], + "source": [ + "# Parameters\n", + "processing_job_arn = \"arn:aws:sagemaker:us-east-1:598348623909:processing-job/pytorch-training-2023-04-1-profilerreport-644ed05c\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:01.004271Z", + "iopub.status.busy": "2023-04-10T22:56:01.003794Z", + "iopub.status.idle": "2023-04-10T22:56:01.005988Z", + "shell.execute_reply": "2023-04-10T22:56:01.005584Z" + }, + "papermill": { + "duration": 0.022848, + "end_time": "2023-04-10T22:56:01.006091", + "exception": false, + "start_time": "2023-04-10T22:56:00.983243", + "status": "completed" + }, + "tags": [ + "hide-input", + "hide-output" + ] + }, + "outputs": [], + "source": [ + "setup_profiler_report(processing_job_arn)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:01.048778Z", + "iopub.status.busy": "2023-04-10T22:56:01.048279Z", + "iopub.status.idle": "2023-04-10T22:56:01.050046Z", + "shell.execute_reply": "2023-04-10T22:56:01.050438Z" + }, + "papermill": { + "duration": 0.027047, + "end_time": "2023-04-10T22:56:01.050566", + "exception": false, + "start_time": "2023-04-10T22:56:01.023519", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "def create_piechart(data_dict, title=None, height=400, width=400, x1=0, x2=0.1, radius=0.4, toolbar_location='right'):\n", + " \n", + " plot = figure(plot_height=height, \n", + " plot_width=width,\n", + " toolbar_location=toolbar_location,\n", + " tools=\"hover,wheel_zoom,reset,pan\", \n", + " tooltips=\"@phase:@value\", \n", + " title=title,\n", + " x_range=(-radius-x1, radius+x2))\n", + "\n", + " data = 
pd.Series(data_dict).reset_index(name='value').rename(columns={'index':'phase'})\n", + " data['angle'] = data['value']/data['value'].sum() * 2*pi\n", + " data['color'] = bokeh.palettes.viridis(len(data_dict))\n", + "\n", + " plot.wedge(x=0, y=0., radius=radius,\n", + " start_angle=cumsum('angle', include_zero=True), \n", + " end_angle=cumsum('angle'),\n", + " line_color=\"white\", \n", + " source=data, \n", + " fill_color='color', \n", + " legend='phase'\n", + " )\n", + " plot.legend.label_text_font_size = \"8pt\"\n", + " plot.legend.location = 'center_right'\n", + " plot.axis.axis_label=None\n", + " plot.axis.visible=False\n", + " plot.grid.grid_line_color = None\n", + " plot.outline_line_color = \"white\"\n", + " \n", + " return plot" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:01.090028Z", + "iopub.status.busy": "2023-04-10T22:56:01.089541Z", + "iopub.status.idle": "2023-04-10T22:56:01.091356Z", + "shell.execute_reply": "2023-04-10T22:56:01.091737Z" + }, + "papermill": { + "duration": 0.023792, + "end_time": "2023-04-10T22:56:01.091865", + "exception": false, + "start_time": "2023-04-10T22:56:01.068073", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "from IPython.display import display, HTML, Markdown, Image\n", + "def pretty_print(df):\n", + " raw_html = df.to_html().replace(\"\\\\n\",\"
\").replace('','')\n", + " return display(HTML(raw_html))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.017383, + "end_time": "2023-04-10T22:56:01.126784", + "exception": false, + "start_time": "2023-04-10T22:56:01.109401", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Training job summary" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:01.166276Z", + "iopub.status.busy": "2023-04-10T22:56:01.165756Z", + "iopub.status.idle": "2023-04-10T22:56:01.167842Z", + "shell.execute_reply": "2023-04-10T22:56:01.167438Z" + }, + "papermill": { + "duration": 0.023665, + "end_time": "2023-04-10T22:56:01.167944", + "exception": false, + "start_time": "2023-04-10T22:56:01.144279", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "def load_report(rule_name):\n", + " try:\n", + " report = json.load(open('/opt/ml/processing/output/rule/profiler-output/profiler-reports/'+rule_name+'.json'))\n", + " return report\n", + " except FileNotFoundError:\n", + " print (rule_name + ' not triggered')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:01.216596Z", + "iopub.status.busy": "2023-04-10T22:56:01.215220Z", + "iopub.status.idle": "2023-04-10T22:56:01.218813Z", + "shell.execute_reply": "2023-04-10T22:56:01.218390Z" + }, + "papermill": { + "duration": 0.033455, + "end_time": "2023-04-10T22:56:01.218957", + "exception": false, + "start_time": "2023-04-10T22:56:01.185502", + "status": "completed" + }, + "tags": [ + "hide-input", + "hide-output" + ] + }, + "outputs": [], + "source": [ + "\n", + "job_statistics = {}\n", + "report = load_report('MaxInitializationTime')\n", + "if report:\n", + " if \"first\" in report['Details'][\"step_num\"] and \"last\" in report['Details'][\"step_num\"]:\n", + " first_step = 
report['Details'][\"step_num\"][\"first\"]\n", + " last_step = report['Details'][\"step_num\"][\"last\"]\n", + " tmp = us_since_epoch_to_human_readable_time(report['Details']['job_start'] * 1000000)\n", + " date = datetime.datetime.strptime(tmp, '%Y-%m-%dT%H:%M:%S:%f')\n", + " day = date.date().strftime(\"%m/%d/%Y\")\n", + " hour = date.time().strftime(\"%H:%M:%S\")\n", + " job_statistics[\"Start time\"] = f\"{hour} {day}\"\n", + " tmp = us_since_epoch_to_human_readable_time(report['Details']['job_end'] * 1000000)\n", + " date = datetime.datetime.strptime(tmp, '%Y-%m-%dT%H:%M:%S:%f')\n", + " day = date.date().strftime(\"%m/%d/%Y\")\n", + " hour = date.time().strftime(\"%H:%M:%S\")\n", + " job_statistics[\"End time\"] = f\"{hour} {day}\"\n", + " job_duration_in_seconds = int(report['Details']['job_end'] - report['Details']['job_start']) \n", + " job_statistics[\"Job duration\"] = f\"{job_duration_in_seconds} seconds\"\n", + " if \"first\" in report['Details'][\"step_num\"] and \"last\" in report['Details'][\"step_num\"]:\n", + " tmp = us_since_epoch_to_human_readable_time(first_step)\n", + " date = datetime.datetime.strptime(tmp, '%Y-%m-%dT%H:%M:%S:%f')\n", + " day = date.date().strftime(\"%m/%d/%Y\")\n", + " hour = date.time().strftime(\"%H:%M:%S\")\n", + " job_statistics[\"Training loop start\"] = f\"{hour} {day}\"\n", + " tmp = us_since_epoch_to_human_readable_time(last_step)\n", + " date = datetime.datetime.strptime(tmp, '%Y-%m-%dT%H:%M:%S:%f')\n", + " day = date.date().strftime(\"%m/%d/%Y\")\n", + " hour = date.time().strftime(\"%H:%M:%S\")\n", + " job_statistics[\"Training loop end\"] = f\"{hour} {day}\"\n", + " training_loop_duration_in_seconds = int((last_step - first_step) / 1000000)\n", + " job_statistics[\"Training loop duration\"] = f\"{training_loop_duration_in_seconds} seconds\"\n", + " initialization_in_seconds = int(first_step/1000000 - report['Details']['job_start'])\n", + " job_statistics[\"Initialization time\"] = f\"{initialization_in_seconds} 
seconds\"\n", + " finalization_in_seconds = int(np.abs(report['Details']['job_end'] - last_step/1000000))\n", + " job_statistics[\"Finalization time\"] = f\"{finalization_in_seconds} seconds\"\n", + " initialization_perc = int(initialization_in_seconds / job_duration_in_seconds * 100)\n", + " job_statistics[\"Initialization\"] = f\"{initialization_perc} %\"\n", + " training_loop_perc = int(training_loop_duration_in_seconds / job_duration_in_seconds * 100)\n", + " job_statistics[\"Training loop\"] = f\"{training_loop_perc} %\"\n", + " finalization_perc = int(finalization_in_seconds / job_duration_in_seconds * 100)\n", + " job_statistics[\"Finalization\"] = f\"{finalization_perc} %\"" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:01.270666Z", + "iopub.status.busy": "2023-04-10T22:56:01.267536Z", + "iopub.status.idle": "2023-04-10T22:56:01.279909Z", + "shell.execute_reply": "2023-04-10T22:56:01.280286Z" + }, + "papermill": { + "duration": 0.043508, + "end_time": "2023-04-10T22:56:01.280417", + "exception": false, + "start_time": "2023-04-10T22:56:01.236909", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"5d0d7c4d-6036-4349-81c6-c0345303faa1\":{\"roots\":{\"references\":[{\"attributes\":{\"children\":[{\"id\":\"1006\"},{\"id\":\"1007\"}]},\"id\":\"1008\",\"type\":\"Column\"},{\"attributes\":{\"columns\":[{\"id\":\"1002\"},{\"id\":\"1003\"}],\"height\":380,\"source\":{\"id\":\"1001\"},\"view\":{\"id\":\"1005\"},\"width\":450},\"id\":\"1004\",\"type\":\"DataTable\"},{\"attributes\":{\"text\":\"The following table gives a summary about the training job. The table includes information about when the training job started and ended, how much time initialization, training loop and finalization took. \\n Your training job started on 04/10/2023 at 22:32:49 and ran for 1330 seconds. \\n Your training job started on 04/10/2023 at 22:32:49 and ran for 1330 seconds.. No step information was profiled from your training job. 
The time spent on initialization and finalization cannot be computed.\",\"width\":800},\"id\":\"1006\",\"type\":\"Paragraph\"},{\"attributes\":{\"data\":{\"0\":[\"Start time\",\"End time\",\"Job duration\"],\"1\":[\"22:32:49 04/10/2023\",\"22:54:59 04/10/2023\",\"1330 seconds\"],\"index\":[0,1,2]},\"selected\":{\"id\":\"1009\"},\"selection_policy\":{\"id\":\"1010\"}},\"id\":\"1001\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"editor\":{\"id\":\"1011\"},\"field\":\"0\",\"formatter\":{\"id\":\"1012\"},\"title\":\"\"},\"id\":\"1002\",\"type\":\"TableColumn\"},{\"attributes\":{\"source\":{\"id\":\"1001\"}},\"id\":\"1005\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1010\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1013\",\"type\":\"StringEditor\"},{\"attributes\":{},\"id\":\"1014\",\"type\":\"StringFormatter\"},{\"attributes\":{\"editor\":{\"id\":\"1013\"},\"field\":\"1\",\"formatter\":{\"id\":\"1014\"},\"title\":\"Job Statistics\"},\"id\":\"1003\",\"type\":\"TableColumn\"},{\"attributes\":{},\"id\":\"1011\",\"type\":\"StringEditor\"},{\"attributes\":{\"children\":[{\"id\":\"1004\"}]},\"id\":\"1007\",\"type\":\"Row\"},{\"attributes\":{},\"id\":\"1012\",\"type\":\"StringFormatter\"},{\"attributes\":{},\"id\":\"1009\",\"type\":\"Selection\"}],\"root_ids\":[\"1008\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"5d0d7c4d-6036-4349-81c6-c0345303faa1\",\"root_ids\":[\"1008\"],\"roots\":{\"1008\":\"a3cb8a99-d438-4414-a891-fcb74c1e8e1e\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " 
console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1008" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "if report:\n", + " text = \"\"\"The following table gives a summary about the training job. The table includes information about when the training job started and ended, how much time initialization, training loop and finalization took.\"\"\"\n", + " if len(job_statistics) > 0:\n", + " df = pd.DataFrame.from_dict(job_statistics, orient='index')\n", + " start_time = us_since_epoch_to_human_readable_time(report['Details']['job_start'] * 1000000)\n", + " date = datetime.datetime.strptime(start_time, '%Y-%m-%dT%H:%M:%S:%f')\n", + " day = date.date().strftime(\"%m/%d/%Y\")\n", + " hour = date.time().strftime(\"%H:%M:%S\")\n", + " duration = job_duration_in_seconds\n", + " text = f\"\"\"{text} \\n Your training job started on {day} at {hour} and ran for {duration} seconds.\"\"\"\n", + "\n", + " #pretty_print(df)\n", + " if \"first\" in report['Details'][\"step_num\"] and \"last\" in report['Details'][\"step_num\"]:\n", + " if finalization_perc < 0:\n", + " job_statistics[\"Finalization%\"] = 0\n", + " if training_loop_perc < 0:\n", + " job_statistics[\"Training loop\"] = 0\n", + " if initialization_perc < 0:\n", + " job_statistics[\"Initialization\"] = 0\n", + " else:\n", + " text = f\"\"\"{text} \\n Your training job started on {day} at {hour} and ran for {duration} seconds.\"\"\"\n", + " \n", + " if len(job_statistics) > 0:\n", + " df2 = df.reset_index()\n", + " df2.columns = [\"0\", \"1\"]\n", + " source = ColumnDataSource(data=df2)\n", + " columns = [TableColumn(field='0', title=\"\"),\n", + " TableColumn(field='1', title=\"Job Statistics\"),]\n", + " table = DataTable(source=source, columns=columns, width=450, 
height=380)\n", + "\n", + " plot = None\n", + "\n", + " if \"Initialization\" in job_statistics:\n", + " piechart_data = {}\n", + " piechart_data[\"Initialization\"] = initialization_perc \n", + " piechart_data[\"Training loop\"] = training_loop_perc\n", + " piechart_data[\"Finalization\"] = finalization_perc \n", + "\n", + " plot = create_piechart(piechart_data, \n", + " height=350,\n", + " width=500,\n", + " x1=0.15,\n", + " x2=0.15,\n", + " radius=0.15, \n", + " toolbar_location=None)\n", + "\n", + " if plot != None:\n", + " paragraph = Paragraph(text=f\"\"\"{text}\"\"\", width = 800)\n", + " show(column(paragraph, row(table, plot)))\n", + " else:\n", + " paragraph = Paragraph(text=f\"\"\"{text}. No step information was profiled from your training job. The time spent on initialization and finalization cannot be computed.\"\"\" , width = 800)\n", + " show(column(paragraph, row(table)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.018379, + "end_time": "2023-04-10T22:56:01.317691", + "exception": false, + "start_time": "2023-04-10T22:56:01.299312", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## System usage statistics" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:01.358176Z", + "iopub.status.busy": "2023-04-10T22:56:01.357684Z", + "iopub.status.idle": "2023-04-10T22:56:01.359646Z", + "shell.execute_reply": "2023-04-10T22:56:01.360015Z" + }, + "papermill": { + "duration": 0.02411, + "end_time": "2023-04-10T22:56:01.360143", + "exception": false, + "start_time": "2023-04-10T22:56:01.336033", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "report = load_report('OverallSystemUsage')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:01.406473Z", + "iopub.status.busy": 
"2023-04-10T22:56:01.405947Z", + "iopub.status.idle": "2023-04-10T22:56:01.408103Z", + "shell.execute_reply": "2023-04-10T22:56:01.407686Z" + }, + "papermill": { + "duration": 0.029666, + "end_time": "2023-04-10T22:56:01.408204", + "exception": false, + "start_time": "2023-04-10T22:56:01.378538", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "text1 = ''\n", + "if report:\n", + " if \"GPU\" in report[\"Details\"]:\n", + " for node_id in report[\"Details\"][\"GPU\"]:\n", + " gpu_p95 = report[\"Details\"][\"GPU\"][node_id][\"p95\"]\n", + " gpu_p50 = report[\"Details\"][\"GPU\"][node_id][\"p50\"]\n", + " cpu_p95 = report[\"Details\"][\"CPU\"][node_id][\"p95\"]\n", + " cpu_p50 = report[\"Details\"][\"CPU\"][node_id][\"p50\"]\n", + " \n", + " if gpu_p95 < 70 and cpu_p95 < 70:\n", + " text1 = f\"\"\"{text1}The 95th percentile of the total GPU utilization on node {node_id} is only {int(gpu_p95)}%. \n", + " The 95th percentile of the total CPU utilization is only {int(cpu_p95)}%. Node {node_id} is underutilized. \n", + " You may want to consider switching to a smaller instance type.\"\"\"\n", + " elif gpu_p95 < 70 and cpu_p95 > 70:\n", + " text1 = f\"\"\"{text1}The 95th percentile of the total GPU utilization on node {node_id} is only {int(gpu_p95)}%. \n", + " However, the 95th percentile of the total CPU utilization is {int(cpu_p95)}%. GPUs on node {node_id} are underutilized, \n", + " likely because of CPU bottlenecks.\"\"\"\n", + " elif gpu_p50 > 70:\n", + " text1 = f\"\"\"{text1}The median total GPU utilization on node {node_id} is {int(gpu_p50)}%. \n", + " GPUs on node {node_id} are well utilized.\"\"\"\n", + " else:\n", + " text1 = f\"\"\"{text1}The median total GPU utilization on node {node_id} is {int(gpu_p50)}%. 
\n", + " The median total CPU utilization is {int(cpu_p50)}%.\"\"\"\n", + " else:\n", + " for node_id in report[\"Details\"][\"CPU\"]:\n", + " cpu_p95 = report[\"Details\"][\"CPU\"][node_id][\"p95\"]\n", + " if cpu_p95 > 70:\n", + " text1 = f\"\"\"{text1}The 95th percentile of the total CPU utilization on node {node_id} is {int(cpu_p95)}%. CPUs on node {node_id} are well utilized.\"\"\"\n", + " text1 = Paragraph(text=f\"\"\"{text1}\"\"\", width=1100)\n", + " text2 = Paragraph(text=f\"\"\"The following table shows statistics of resource utilization per worker (node), \n", + " such as the total CPU and GPU utilization, and the memory utilization on CPU and GPU. \n", + " The table also includes the total I/O wait time and the total amount of data sent or received in bytes.\n", + " The table shows min and max values as well as p99, p95 and p50 percentiles.\"\"\", width=900)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:01.469021Z", + "iopub.status.busy": "2023-04-10T22:56:01.453803Z", + "iopub.status.idle": "2023-04-10T22:56:01.472796Z", + "shell.execute_reply": "2023-04-10T22:56:01.472399Z" + }, + "papermill": { + "duration": 0.045866, + "end_time": "2023-04-10T22:56:01.472899", + "exception": false, + "start_time": "2023-04-10T22:56:01.427033", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"cf272431-9439-4b0a-9920-8f3c1c6fedb2\":{\"roots\":{\"references\":[{\"attributes\":{\"children\":[{\"id\":\"1045\"},{\"id\":\"1046\"},{\"id\":\"1058\"}]},\"id\":\"1059\",\"type\":\"Column\"},{\"attributes\":{\"editor\":{\"id\":\"1070\"},\"field\":\"metric\",\"formatter\":{\"id\":\"1071\"},\"title\":\"metric\"},\"id\":\"1049\",\"type\":\"TableColumn\"},{\"attributes\":{},\"id\":\"1082\",\"type\":\"StringEditor\"},{\"attributes\":{},\"id\":\"1080\",\"type\":\"StringEditor\"},{\"attributes\":{},\"id\":\"1079\",\"type\":\"StringFormatter\"},{\"attributes\":{\"editor\":{\"id\":\"1068\"},\"field\":\"Node\",\"formatter\":{\"id\":\"1069\"},\"title\":\"node\"},\"id\":\"1048\",\"type\":\"TableColumn\"},{\"attributes\":{},\"id\":\"1072\",\"type\":\"StringEditor\"},{\"attributes\":{\"text\":\"The 95th percentile of the total GPU utilization on node algo-1 is only 48%. \\n However, the 95th percentile of the total CPU utilization is 74%. GPUs on node algo-1 are underutilized, \\n likely because of CPU bottlenecks.The 95th percentile of the total GPU utilization on node algo-2 is only 50%. \\n However, the 95th percentile of the total CPU utilization is 74%. 
GPUs on node algo-2 are underutilized, \\n likely because of CPU bottlenecks.\",\"width\":1100},\"id\":\"1045\",\"type\":\"Paragraph\"},{\"attributes\":{\"data\":{\"Node\":[\"algo-1\",\"algo-2\",\"algo-1\",\"algo-2\",\"algo-1\",\"algo-2\",\"algo-1\",\"algo-2\",\"algo-1\",\"algo-2\",\"algo-1\",\"algo-2\"],\"index\":[0,1,2,3,4,5,6,7,8,9,10,11],\"level_0\":[0,1,2,3,4,5,6,7,8,9,10,11],\"max\":{\"__ndarray__\":\"mpmZHZJelkFSuB55c+eXQQAAAAAAAExAAAAAAAAATUAAAAAAAABZQI/C9Shc31hAMzMzMzNzQEDsUbgehWtAQAAAAAAAAEVAAAAAAACAREDhehSuRwFEQHE9CtejcEVA\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[12]},\"metric\":[\"Network\",\"Network\",\"GPU\",\"GPU\",\"CPU\",\"CPU\",\"CPU memory\",\"CPU memory\",\"GPU memory\",\"GPU memory\",\"I/O\",\"I/O\"],\"min\":{\"__ndarray__\":\"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAzczMzMzMEUBcj8L1KFwRQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[12]},\"p50\":{\"__ndarray__\":\"AAAAAAAAAAAAAAAAAAAAAAAAAAAAADxAAAAAAAAAPUBxPQrXo7BDQMP1KFyPokNArkfhehSuPUDD9Shcj8I9QAAAAAAAADVAAAAAAAAANUAAAAAAAAAAAAAAAAAAAAAA\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[12]},\"p95\":{\"__ndarray__\":\"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAEhAAAAAAAAASUAAAAAAALBSQK5H4XoUnlJAzczMzMwMPkAfhetRuN4+QAAAAAAAAEJAAAAAAACAQkAfhetRuB4uQHsUrkfh+ixA\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[12]},\"p99\":{\"__ndarray__\":\"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAEpAAAAAAACASkDsUbgehTtYQK5H4XoUPlhAhetRuB7FP0CuR+F6FK4/QAAAAAAAgENAAAAAAACAQ0BI4XoUrsc4QEjhehSuBzlA\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[12]},\"unit\":[\"bytes\",\"bytes\",\"percentage\",\"percentage\",\"percentage\",\"percentage\",\"percentage\",\"percentage\",\"percentage\",\"percentage\",\"percentage\",\"percentage\"]},\"selected\":{\"id\":\"1066\"},\"selection_policy\":{\"id\":\"1067\"}},\"id\":\"1047\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1069\",\"type\":\"StringFormatter\"},{\"attributes\":{},\"
id\":\"1068\",\"type\":\"StringEditor\"},{\"attributes\":{},\"id\":\"1071\",\"type\":\"StringFormatter\"},{\"attributes\":{\"columns\":[{\"id\":\"1048\"},{\"id\":\"1049\"},{\"id\":\"1050\"},{\"id\":\"1051\"},{\"id\":\"1052\"},{\"id\":\"1053\"},{\"id\":\"1054\"},{\"id\":\"1055\"}],\"height\":360,\"source\":{\"id\":\"1047\"},\"view\":{\"id\":\"1057\"},\"width\":800},\"id\":\"1056\",\"type\":\"DataTable\"},{\"attributes\":{\"children\":[{\"id\":\"1056\"}]},\"id\":\"1058\",\"type\":\"Row\"},{\"attributes\":{\"editor\":{\"id\":\"1082\"},\"field\":\"min\",\"formatter\":{\"id\":\"1083\"},\"title\":\"min\"},\"id\":\"1055\",\"type\":\"TableColumn\"},{\"attributes\":{},\"id\":\"1070\",\"type\":\"StringEditor\"},{\"attributes\":{\"editor\":{\"id\":\"1080\"},\"field\":\"p50\",\"formatter\":{\"id\":\"1081\"},\"title\":\"p50\"},\"id\":\"1054\",\"type\":\"TableColumn\"},{\"attributes\":{},\"id\":\"1083\",\"type\":\"StringFormatter\"},{\"attributes\":{},\"id\":\"1073\",\"type\":\"StringFormatter\"},{\"attributes\":{\"editor\":{\"id\":\"1074\"},\"field\":\"max\",\"formatter\":{\"id\":\"1075\"},\"title\":\"max\"},\"id\":\"1051\",\"type\":\"TableColumn\"},{\"attributes\":{},\"id\":\"1075\",\"type\":\"StringFormatter\"},{\"attributes\":{},\"id\":\"1081\",\"type\":\"StringFormatter\"},{\"attributes\":{\"editor\":{\"id\":\"1076\"},\"field\":\"p99\",\"formatter\":{\"id\":\"1077\"},\"title\":\"p99\"},\"id\":\"1052\",\"type\":\"TableColumn\"},{\"attributes\":{},\"id\":\"1077\",\"type\":\"StringFormatter\"},{\"attributes\":{\"text\":\"The following table shows statistics of resource utilization per worker (node), \\n such as the total CPU and GPU utilization, and the memory utilization on CPU and GPU. 
\\n The table also includes the total I/O wait time and the total amount of data sent or received in bytes.\\n The table shows min and max values as well as p99, p95 and p50 percentiles.\",\"width\":900},\"id\":\"1046\",\"type\":\"Paragraph\"},{\"attributes\":{},\"id\":\"1076\",\"type\":\"StringEditor\"},{\"attributes\":{\"editor\":{\"id\":\"1078\"},\"field\":\"p95\",\"formatter\":{\"id\":\"1079\"},\"title\":\"p95\"},\"id\":\"1053\",\"type\":\"TableColumn\"},{\"attributes\":{},\"id\":\"1066\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1074\",\"type\":\"StringEditor\"},{\"attributes\":{},\"id\":\"1067\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"editor\":{\"id\":\"1072\"},\"field\":\"unit\",\"formatter\":{\"id\":\"1073\"},\"title\":\"unit\"},\"id\":\"1050\",\"type\":\"TableColumn\"},{\"attributes\":{\"source\":{\"id\":\"1047\"}},\"id\":\"1057\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1078\",\"type\":\"StringEditor\"}],\"root_ids\":[\"1059\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"cf272431-9439-4b0a-9920-8f3c1c6fedb2\",\"root_ids\":[\"1059\"],\"roots\":{\"1059\":\"e39ddc8a-8a55-4d8a-9547-7b9231817f51\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1059" + } + }, + "output_type": "display_data" 
+ } + ], + "source": [ + "pd.set_option('display.float_format', lambda x: '%.2f' % x)\n", + "rows = [] \n", + "units = {\"CPU\": \"percentage\", \"CPU memory\": \"percentage\", \"GPU\": \"percentage\", \"Network\": \"bytes\", \"GPU memory\": \"percentage\", \"I/O\": \"percentage\"}\n", + "if report:\n", + " for metric in report['Details']:\n", + " for node_id in report['Details'][metric]:\n", + " values = report['Details'][metric][node_id]\n", + " rows.append([node_id, metric, units[metric], values['max'], values['p99'], values['p95'], values['p50'], values['min']])\n", + "\n", + " df = pd.DataFrame(rows) \n", + " df.columns = ['Node', 'metric', 'unit', 'max', 'p99', 'p95', 'p50', 'min']\n", + " df2 = df.reset_index()\n", + " source = ColumnDataSource(data=df2)\n", + " columns = [TableColumn(field='Node', title=\"node\"),\n", + " TableColumn(field='metric', title=\"metric\"),\n", + " TableColumn(field='unit', title=\"unit\"),\n", + " TableColumn(field='max', title=\"max\"),\n", + " TableColumn(field='p99', title=\"p99\"),\n", + " TableColumn(field='p95', title=\"p95\"),\n", + " TableColumn(field='p50', title=\"p50\"),\n", + " TableColumn(field='min', title=\"min\"),]\n", + " table = DataTable(source=source, columns=columns, width=800, height=df2.shape[0]*30)\n", + "\n", + " show(column( text1, text2, row(table)))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:01.525661Z", + "iopub.status.busy": "2023-04-10T22:56:01.523788Z", + "iopub.status.idle": "2023-04-10T22:56:01.562537Z", + "shell.execute_reply": "2023-04-10T22:56:01.562933Z" + }, + "papermill": { + "duration": 0.070867, + "end_time": "2023-04-10T22:56:01.563068", + "exception": false, + "start_time": "2023-04-10T22:56:01.492201", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "## Framework metrics summary" + ], + "text/plain": [ + "" + ] + }, + 
"metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"d8a038cd-9c59-4f8b-8d28-d662bed386cc\":{\"roots\":{\"references\":[{\"attributes\":{\"children\":[{\"id\":\"1219\"},{\"id\":\"1220\"}]},\"id\":\"1221\",\"type\":\"Column\"},{\"attributes\":{},\"id\":\"1183\",\"type\":\"LinearScale\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1193\"},{\"id\":\"1194\"},{\"id\":\"1195\"},{\"id\":\"1196\"}]},\"id\":\"1197\",\"type\":\"Toolbar\"},{\"attributes\":{\"field\":\"angle\"},\"id\":\"1203\",\"type\":\"CumSum\"},{\"attributes\":{\"field\":\"angle\",\"include_zero\":true},\"id\":\"1202\",\"type\":\"CumSum\"},{\"attributes\":{\"callback\":null,\"tooltips\":\"@phase:@value\"},\"id\":\"1193\",\"type\":\"HoverTool\"},{\"attributes\":{},\"id\":\"1194\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"data_source\":{\"id\":\"1204\"},\"glyph\":{\"id\":\"1206\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1207\"},\"selection_glyph\":null,\"view\":{\"id\":\"1209\"}},\"id\":\"1208\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"source\":{\"id\":\"1204\"}},\"id\":\"1209\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1195\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"1196\",\"type\":\"PanTool\"},{\"attributes\":{\"data\":{\"angle\":{\"__ndarray__\":\"eaqR90GuET+GNQKm6SEZQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[2]},\"color\":[\"#440154\",\"#FDE724\"],\"index\":[0,1],\"phase\":[\"DataLoaderIterInitialize\",\"DataLoaderIter\"],\"value\":{\"__ndarray__\":\"2+b2DVqWUT8J8qVp7v9YQA==\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[2]}},\"selected\":{\"id\":\"1215\"},\"selection_policy\":{\"id\":\"1216\"}},\"id\":\"1204\",\"type\":\"ColumnDataSource\"},{
\"attributes\":{\"text\":\"General framework operations\"},\"id\":\"1175\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"1213\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"label\":{\"field\":\"phase\"},\"renderers\":[{\"id\":\"1208\"}]},\"id\":\"1218\",\"type\":\"LegendItem\"},{\"attributes\":{\"children\":[{\"id\":\"1174\"}]},\"id\":\"1220\",\"type\":\"Row\"},{\"attributes\":{},\"id\":\"1181\",\"type\":\"LinearScale\"},{\"attributes\":{\"end\":0.8999999999999999,\"start\":-0.5},\"id\":\"1177\",\"type\":\"Range1d\"},{\"attributes\":{},\"id\":\"1211\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"items\":[{\"id\":\"1218\"}],\"label_text_font_size\":\"8pt\",\"location\":\"center_right\"},\"id\":\"1217\",\"type\":\"Legend\"},{\"attributes\":{},\"id\":\"1179\",\"type\":\"DataRange1d\"},{\"attributes\":{\"axis_label\":null,\"formatter\":{\"id\":\"1213\"},\"ticker\":{\"id\":\"1186\"},\"visible\":false},\"id\":\"1185\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"1216\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1190\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1186\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis\":{\"id\":\"1185\"},\"grid_line_color\":null,\"ticker\":null},\"id\":\"1188\",\"type\":\"Grid\"},{\"attributes\":{\"width\":1100},\"id\":\"1219\",\"type\":\"Paragraph\"},{\"attributes\":{\"end_angle\":{\"expr\":{\"id\":\"1203\"},\"units\":\"rad\"},\"fill_color\":{\"field\":\"color\"},\"line_color\":{\"value\":\"white\"},\"radius\":{\"units\":\"data\",\"value\":0.3},\"start_angle\":{\"expr\":{\"id\":\"1202\"},\"units\":\"rad\"},\"x\":{\"value\":0},\"y\":{\"value\":0.0}},\"id\":\"1206\",\"type\":\"Wedge\"},{\"attributes\":{},\"id\":\"1215\",\"type\":\"Selection\"},{\"attributes\":{\"below\":[{\"id\":\"1185\"}],\"center\":[{\"id\":\"1188\"},{\"id\":\"1192\"},{\"id\":\"1217\"}],\"left\":[{\"id\":\"1189\"}],\"outline_line_color\":\"white\",\"plot_height\":350,\"renderers\":[{\"id\":\"1208\"}],\"title\":{\"id\":\"1175
\"},\"toolbar\":{\"id\":\"1197\"},\"x_range\":{\"id\":\"1177\"},\"x_scale\":{\"id\":\"1181\"},\"y_range\":{\"id\":\"1179\"},\"y_scale\":{\"id\":\"1183\"}},\"id\":\"1174\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"axis\":{\"id\":\"1189\"},\"dimension\":1,\"grid_line_color\":null,\"ticker\":null},\"id\":\"1192\",\"type\":\"Grid\"},{\"attributes\":{\"axis_label\":null,\"formatter\":{\"id\":\"1211\"},\"ticker\":{\"id\":\"1190\"},\"visible\":false},\"id\":\"1189\",\"type\":\"LinearAxis\"},{\"attributes\":{\"end_angle\":{\"expr\":{\"id\":\"1203\"},\"units\":\"rad\"},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"field\":\"color\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"radius\":{\"units\":\"data\",\"value\":0.3},\"start_angle\":{\"expr\":{\"id\":\"1202\"},\"units\":\"rad\"},\"x\":{\"value\":0},\"y\":{\"value\":0.0}},\"id\":\"1207\",\"type\":\"Wedge\"}],\"root_ids\":[\"1221\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"d8a038cd-9c59-4f8b-8d28-d662bed386cc\",\"root_ids\":[\"1221\"],\"roots\":{\"1221\":\"a4a5dd12-bede-4793-890e-82213bac2e77\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1221" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "report = 
load_report('OverallFrameworkMetrics')\n", + "if report:\n", + " if 'Details' in report:\n", + "\n", + " display(Markdown(f\"\"\"## Framework metrics summary\"\"\"))\n", + " plots = []\n", + " text = ''\n", + " if 'phase' in report['Details']:\n", + " text = f\"\"\"The following two pie charts show the time spent on the TRAIN phase, the EVAL phase, \n", + " and others. The 'others' includes the time spent between steps (after one step has finished and before\n", + " the next step has started). Ideally, most of the training time should be spent on the \n", + " TRAIN and EVAL phases. If TRAIN/EVAL were not specified in the training script, steps will be recorded as \n", + " GLOBAL.\"\"\"\n", + "\n", + " if 'others' in report['Details']['phase']:\n", + " others = float(report['Details']['phase']['others'])\n", + "\n", + " if others > 25:\n", + " text = f\"\"\"{text} Your training job spent quite a significant amount of time ({round(others,2)}%) in phase \"others\".\n", + " You should check what is happening in between the steps.\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['phase'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"The ratio between the time spent on the TRAIN/EVAL phase and others\")\n", + " plots.append(plot)\n", + "\n", + " if 'forward_backward' in report['Details']:\n", + "\n", + " event = max(report['Details']['forward_backward'], key=report['Details']['forward_backward'].get)\n", + " perc = report['Details']['forward_backward'][event]\n", + "\n", + " text = f\"\"\"{text} The pie chart on the right shows a more detailed breakdown. 
\n", + " It shows that {int(perc)}% of the time was spent in event \"{event}\".\"\"\"\n", + "\n", + " if perc > 70:\n", + " text = f\"\"\"There is quite a significant difference between the time spent on forward and backward\n", + " pass.\"\"\"\n", + " else:\n", + " text = f\"\"\"{text} It shows that {int(perc)}% of the training time\n", + " was spent on \"{event}\".\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['forward_backward'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"The ratio between forward and backward pass\") \n", + " plots.append(plot)\n", + "\n", + " if len(plots) > 0:\n", + " paragraph = Paragraph(text=text, width=1100)\n", + " show(column(paragraph, row(plots)))\n", + "\n", + " plots = []\n", + " text=''\n", + " if 'ratio' in report['Details'] and len(report['Details']['ratio']) > 0:\n", + "\n", + " key = list(report['Details']['ratio'].keys())[0]\n", + " ratio = report['Details']['ratio'][key]\n", + "\n", + " text = f\"\"\"The following piechart shows a breakdown of the CPU/GPU operators. 
\n", + " It shows that {int(ratio)}% of training time was spent on executing the \"{key}\" operator.\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['ratio'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"The ratio between the time spent on CPU/GPU operators\")\n", + " plots.append(plot)\n", + "\n", + "\n", + " if 'general' in report['Details']:\n", + " event = max(report['Details']['general'], key=report['Details']['general'].get)\n", + " perc = report['Details']['general'][event]\n", + "\n", + " plot = create_piechart(report['Details']['general'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"General framework operations\")\n", + " plots.append(plot)\n", + "\n", + " if len(plots) > 0:\n", + " paragraph = Paragraph(text=text, width=1100)\n", + " show(column(paragraph, row(plots)))\n", + "\n", + " plots = []\n", + " text = ''\n", + " if 'horovod' in report['Details']:\n", + " display(Markdown(f\"\"\"#### Overview: Horovod metrics\"\"\"))\n", + " event = max(report['Details']['horovod'], key=report['Details']['horovod'].get)\n", + " perc = report['Details']['horovod'][event]\n", + " text = f\"\"\"{text} The following pie chart shows a detailed breakdown of the Horovod metrics profiled\n", + " from your training job. 
The most expensive function was \"{event}\" with {int(perc)}%.\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['horovod'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"Horovod metrics \")\n", + "\n", + " paragraph = Paragraph(text=text, width=1100)\n", + " show(column(paragraph, row(plot)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:01.612806Z", + "iopub.status.busy": "2023-04-10T22:56:01.612300Z", + "iopub.status.idle": "2023-04-10T22:56:01.614012Z", + "shell.execute_reply": "2023-04-10T22:56:01.614401Z" + }, + "papermill": { + "duration": 0.031023, + "end_time": "2023-04-10T22:56:01.614531", + "exception": false, + "start_time": "2023-04-10T22:56:01.583508", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "pd.set_option('display.float_format', lambda x: '%.2f' % x)\n", + "rows = [] \n", + "values = []\n", + "if report:\n", + " if 'CPU_total' in report['Details']:\n", + " display(Markdown(f\"\"\"#### Overview: CPU operators\"\"\"))\n", + " event = max(report['Details']['CPU'], key=report['Details']['CPU'].get)\n", + " perc = report['Details']['CPU'][event]\n", + "\n", + " for function in report['Details']['CPU']:\n", + " percentage = round(report['Details']['CPU'][function],2)\n", + " time = report['Details']['CPU_total'][function] \n", + " rows.append([percentage, time, function])\n", + "\n", + " df = pd.DataFrame(rows) \n", + " df.columns = ['percentage', 'time', 'operator']\n", + "\n", + " df = df.sort_values(by=['percentage'], ascending=False)\n", + " source = ColumnDataSource(data=df)\n", + " columns = [TableColumn(field='percentage', title=\"Percentage\"),\n", + " TableColumn(field='time', title=\"Cumulative time in microseconds\"),\n", + " TableColumn(field='operator', title=\"CPU operator\"),]\n", + "\n", + " table = 
DataTable(source=source, columns=columns, width=550, height=350)\n", + "\n", + " text = Paragraph(text=f\"\"\"The following table shows a list of operators that ran on the CPUs.\n", + " The most expensive operator on the CPUs was \"{event}\" with {int(perc)} %.\"\"\")\n", + "\n", + " plot = create_piechart(report['Details']['CPU'],\n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " )\n", + "\n", + " show(column(text, row(table, plot)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:01.665670Z", + "iopub.status.busy": "2023-04-10T22:56:01.658685Z", + "iopub.status.idle": "2023-04-10T22:56:01.667378Z", + "shell.execute_reply": "2023-04-10T22:56:01.667756Z" + }, + "papermill": { + "duration": 0.032833, + "end_time": "2023-04-10T22:56:01.667891", + "exception": false, + "start_time": "2023-04-10T22:56:01.635058", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "pd.set_option('display.float_format', lambda x: '%.2f' % x)\n", + "rows = [] \n", + "values = []\n", + "if report:\n", + " if 'GPU_total' in report['Details']:\n", + " display(Markdown(f\"\"\"#### Overview: GPU operators\"\"\"))\n", + " event = max(report['Details']['GPU'], key=report['Details']['GPU'].get)\n", + " perc = report['Details']['GPU'][event]\n", + "\n", + " for function in report['Details']['GPU']:\n", + " percentage = round(report['Details']['GPU'][function],2)\n", + " time = report['Details']['GPU_total'][function] \n", + " rows.append([percentage, time, function])\n", + "\n", + " df = pd.DataFrame(rows) \n", + " df.columns = ['percentage', 'time', 'operator']\n", + "\n", + " df = df.sort_values(by=['percentage'], ascending=False)\n", + " source = ColumnDataSource(data=df)\n", + " columns = [TableColumn(field='percentage', title=\"Percentage\"),\n", + " TableColumn(field='time', title=\"Cumulative time in 
microseconds\"),\n", + " TableColumn(field='operator', title=\"GPU operator\"),]\n", + " table = DataTable(source=source, columns=columns, width=450, height=350)\n", + "\n", + " text = Paragraph(text=f\"\"\"The following table shows a list of operators that your training job ran on GPU.\n", + " The most expensive operator on GPU was \"{event}\" with {int(perc)} %\"\"\")\n", + "\n", + " plot = create_piechart(report['Details']['GPU'],\n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " )\n", + "\n", + " show(column(text, row(table, plot)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.020246, + "end_time": "2023-04-10T22:56:01.708811", + "exception": false, + "start_time": "2023-04-10T22:56:01.688565", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Rules summary" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:01.755796Z", + "iopub.status.busy": "2023-04-10T22:56:01.755301Z", + "iopub.status.idle": "2023-04-10T22:56:01.757067Z", + "shell.execute_reply": "2023-04-10T22:56:01.757432Z" + }, + "papermill": { + "duration": 0.028571, + "end_time": "2023-04-10T22:56:01.757558", + "exception": false, + "start_time": "2023-04-10T22:56:01.728987", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "description = {}\n", + "description['CPUBottleneck'] = 'Checks if the CPU utilization is high and the GPU utilization is low. \\\n", + "It might indicate CPU bottlenecks, where the GPUs are waiting for data to arrive \\\n", + "from the CPUs. The rule evaluates the CPU and GPU utilization rates, and triggers the issue \\\n", + "if the time spent on the CPU bottlenecks exceeds a threshold percent of the total training time. 
The default threshold is 50 percent.'\n", + "description['IOBottleneck'] = 'Checks if the data I/O wait time is high and the GPU utilization is low. \\\n", + "It might indicate IO bottlenecks where GPU is waiting for data to arrive from storage. \\\n", + "The rule evaluates the I/O and GPU utilization rates and triggers the issue \\\n", + "if the time spent on the IO bottlenecks exceeds a threshold percent of the total training time. The default threshold is 50 percent.'\n", + "description['Dataloader'] = 'Checks how many data loaders are running in parallel and whether the total number is equal to the number \\\n", + "of available CPU cores. The rule triggers if the number is much smaller or larger than the number of available cores. \\\n", + "If too small, it might lead to low GPU utilization. If too large, it might impact other compute intensive operations on CPU.'\n", + "description['GPUMemoryIncrease'] = 'Measures the average GPU memory footprint and triggers if there is a large increase.'\n", + "description['BatchSize'] = 'Checks if GPUs are underutilized because the batch size is too small. \\\n", + "To detect this problem, the rule analyzes the average GPU memory footprint, \\\n", + "the CPU and the GPU utilization. '\n", + "description['LowGPUUtilization'] = 'Checks if the GPU utilization is low or fluctuating. \\\n", + "This can happen due to bottlenecks, blocking calls for synchronizations, \\\n", + "or a small batch size.'\n", + "description['MaxInitializationTime'] = 'Checks if the time spent on initialization exceeds a time threshold. \\\n", + "The rule waits until the first step of the training loop starts. The initialization can take longer \\\n", + "if downloading the entire dataset from Amazon S3 in File mode. The default threshold is 20 minutes.'\n", + "description['LoadBalancing'] = 'Detects workload balancing issues across GPUs. \\\n", + "Workload imbalance can occur in training jobs with data parallelism. 
\\\n", + "The gradients are accumulated on a primary GPU, and this GPU might be overused \\\n", + "with regard to other GPUs, resulting in reducing the efficiency of data parallelization.'\n", + "description['StepOutlier'] = 'Detects outliers in step duration. The step duration for forward and backward pass should be \\\n", + "roughly the same throughout the training. If there are significant outliers, \\\n", + "it may indicate a system stall or bottleneck issues.'" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:01.803340Z", + "iopub.status.busy": "2023-04-10T22:56:01.802823Z", + "iopub.status.idle": "2023-04-10T22:56:01.804453Z", + "shell.execute_reply": "2023-04-10T22:56:01.804818Z" + }, + "papermill": { + "duration": 0.027201, + "end_time": "2023-04-10T22:56:01.804944", + "exception": false, + "start_time": "2023-04-10T22:56:01.777743", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "recommendation = {}\n", + "recommendation['CPUBottleneck'] = 'Consider increasing the number of data loaders \\\n", + "or applying data pre-fetching.'\n", + "recommendation['IOBottleneck'] = 'Pre-fetch data or choose different file formats, such as binary formats that \\\n", + "improve I/O performance.'\n", + "recommendation['Dataloader'] = 'Change the number of data loader processes.'\n", + "recommendation['GPUMemoryIncrease'] = 'Choose a larger instance type with more memory if footprint is close to maximum available memory.'\n", + "recommendation['BatchSize'] = 'The batch size is too small, and GPUs are underutilized. 
Consider running on a smaller instance type or increasing the batch size.'\n", + "recommendation['LowGPUUtilization'] = 'Check if there are bottlenecks, minimize blocking calls, \\\n", + "change distributed training strategy, or increase the batch size.'\n", + "recommendation['MaxInitializationTime'] = 'Initialization takes too long. \\\n", + "If using File mode, consider switching to Pipe mode in case you are using TensorFlow framework.'\n", + "recommendation['LoadBalancing'] = 'Choose a different distributed training strategy or \\\n", + "a different distributed training framework.'\n", + "recommendation['StepOutlier'] = 'Check if there are any bottlenecks (CPU, I/O) correlated to the step outliers.'" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:01.852850Z", + "iopub.status.busy": "2023-04-10T22:56:01.852369Z", + "iopub.status.idle": "2023-04-10T22:56:01.862326Z", + "shell.execute_reply": "2023-04-10T22:56:01.862700Z" + }, + "papermill": { + "duration": 0.037207, + "end_time": "2023-04-10T22:56:01.862827", + "exception": false, + "start_time": "2023-04-10T22:56:01.825620", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "The following table shows a profiling summary of the Debugger built-in rules. \n", + "The table is sorted by the rules that triggered the most frequently. During your training job, the LowGPUUtilization rule\n", + "was the most frequently triggered. It processed 2660 datapoints and was triggered 28 times." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DescriptionRecommendationNumber of times rule triggeredNumber of datapointsRule parameters
LowGPUUtilizationChecks if the GPU utilization is low or fluctuating. This can happen due to bottlenecks, blocking calls for synchronizations, or a small batch size.Check if there are bottlenecks, minimize blocking calls, change distributed training strategy, or increase the batch size.282660threshold_p95:70
threshold_p5:10
window:500
patience:1000
BatchSizeChecks if GPUs are underutilized because the batch size is too small. To detect this problem, the rule analyzes the average GPU memory footprint, the CPU and the GPU utilization.The batch size is too small, and GPUs are underutilized. Consider running on a smaller instance type or increasing the batch size.282659cpu_threshold_p95:70
gpu_threshold_p95:70
gpu_memory_threshold_p95:70
patience:1000
window:500
DataloaderChecks how many data loaders are running in parallel and whether the total number is equal the number of available CPU cores. The rule triggers if number is much smaller or larger than the number of available cores. If too small, it might lead to low GPU utilization. If too large, it might impact other compute intensive operations on CPU.Change the number of data loader processes.113041min_threshold:70
max_threshold:200
CPUBottleneckChecks if the CPU utilization is high and the GPU utilization is low. It might indicate CPU bottlenecks, where the GPUs are waiting for data to arrive from the CPUs. The rule evaluates the CPU and GPU utilization rates, and triggers the issue if the time spent on the CPU bottlenecks exceeds a threshold percent of the total training time. The default threshold is 50 percent.Consider increasing the number of data loaders or applying data pre-fetching.05337threshold:50
cpu_threshold:90
gpu_threshold:10
patience:1000
StepOutlierDetects outliers in step duration. The step duration for forward and backward pass should be roughly the same throughout the training. If there are significant outliers, it may indicate a system stall or bottleneck issues.Check if there are any bottlenecks (CPU, I/O) correlated to the step outliers.00threshold:3
mode:None
n_outliers:10
stddev:3
IOBottleneckChecks if the data I/O wait time is high and the GPU utilization is low. It might indicate IO bottlenecks where GPU is waiting for data to arrive from storage. The rule evaluates the I/O and GPU utilization rates and triggers the issue if the time spent on the IO bottlenecks exceeds a threshold percent of the total training time. The default threshold is 50 percent.Pre-fetch data or choose different file formats, such as binary formats that improve I/O performance.05337threshold:50
io_threshold:50
gpu_threshold:10
patience:1000
MaxInitializationTimeChecks if the time spent on initialization exceeds a threshold percent of the total training time. The rule waits until the first step of training loop starts. The initialization can take longer if downloading the entire dataset from Amazon S3 in File mode. The default threshold is 20 minutes.Initialization takes too long. If using File mode, consider switching to Pipe mode in case you are using TensorFlow framework.00threshold:20
GPUMemoryIncreaseMeasures the average GPU memory footprint and triggers if there is a large increase.Choose a larger instance type with more memory if footprint is close to maximum available memory.02660increase:5
patience:1000
window:10
LoadBalancingDetects workload balancing issues across GPUs. Workload imbalance can occur in training jobs with data parallelism. The gradients are accumulated on a primary GPU, and this GPU might be overused with regard to other GPUs, resulting in reducing the efficiency of data parallelization.Choose a different distributed training strategy or a different distributed training framework.02660threshold:0.2
patience:1000
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "files = glob.glob('/opt/ml/processing/output/rule/profiler-output/profiler-reports/*json')\n", + "summary = {}\n", + "for i in files:\n", + " rule_name = i.split('/')[-1].replace('.json','')\n", + " if rule_name == \"OverallSystemUsage\" or rule_name == \"OverallFrameworkMetrics\":\n", + " continue\n", + " rule_report = json.load(open(i))\n", + " summary[rule_name] = {}\n", + " summary[rule_name]['Description'] = description[rule_name]\n", + " summary[rule_name]['Recommendation'] = recommendation[rule_name]\n", + " summary[rule_name]['Number of times rule triggered'] = rule_report['RuleTriggered'] \n", + " #summary[rule_name]['Number of violations'] = rule_report['Violations'] \n", + " summary[rule_name]['Number of datapoints'] = rule_report['Datapoints']\n", + " summary[rule_name]['Rule parameters'] = rule_report['RuleParameters']\n", + "\n", + "df = pd.DataFrame.from_dict(summary, orient='index')\n", + "df = df.sort_values(by=['Number of times rule triggered'], ascending=False)\n", + "\n", + "\n", + "display(Markdown(f\"\"\"The following table shows a profiling summary of the Debugger built-in rules. \n", + "The table is sorted by the rules that triggered the most frequently. During your training job, the {df.index[0]} rule\n", + "was the most frequently triggered. 
It processed {df.values[0,3]} datapoints and was triggered {df.values[0,2]} times.\"\"\"))\n", + "\n", + "with pd.option_context('display.colheader_justify','left'): \n", + " pretty_print(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:01.911468Z", + "iopub.status.busy": "2023-04-10T22:56:01.910975Z", + "iopub.status.idle": "2023-04-10T22:56:01.913305Z", + "shell.execute_reply": "2023-04-10T22:56:01.913673Z" + }, + "papermill": { + "duration": 0.029572, + "end_time": "2023-04-10T22:56:01.913797", + "exception": false, + "start_time": "2023-04-10T22:56:01.884225", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "## Analyzing the training loop\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "analyse_phase = \"training\"\n", + "if job_statistics and \"initialization_in_seconds\" in job_statistics:\n", + " if job_statistics[\"initialization_in_seconds\"] > job_statistics[\"training_loop_duration_in_seconds\"]:\n", + " analyse_phase = \"initialization\"\n", + " time = job_statistics[\"initialization_in_seconds\"]\n", + " perc = job_statistics[\"initialization_%\"]\n", + " display(Markdown(f\"\"\"The initialization phase took {int(time)} seconds, which is {int(perc)}%*\n", + " of the total training time. 
Since the training loop has taken the most time, \n", + " we dive deep into the events occurring during this phase\"\"\"))\n", + " display(Markdown(\"\"\"## Analyzing initialization\\n\\n\"\"\"))\n", + " time = job_statistics[\"training_loop_duration_in_seconds\"]\n", + " perc = job_statistics[\"training_loop_%\"]\n", + " display(Markdown(f\"\"\"The training loop lasted for {int(time)} seconds which is {int(perc)}% of the training job time.\n", + " Since the training loop has taken the most time, we dive deep into the events occured during this phase.\"\"\"))\n", + "if analyse_phase == 'training':\n", + " display(Markdown(\"\"\"## Analyzing the training loop\\n\\n\"\"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:01.961121Z", + "iopub.status.busy": "2023-04-10T22:56:01.960645Z", + "iopub.status.idle": "2023-04-10T22:56:01.962278Z", + "shell.execute_reply": "2023-04-10T22:56:01.962676Z" + }, + "papermill": { + "duration": 0.027442, + "end_time": "2023-04-10T22:56:01.962803", + "exception": false, + "start_time": "2023-04-10T22:56:01.935361", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "if analyse_phase == \"initialization\":\n", + " display(Markdown(\"\"\"### MaxInitializationTime\\n\\nThis rule helps to detect if the training initialization is taking too much time. \\nThe rule waits until first step is available. The rule takes the parameter `threshold` that defines how many minutes to wait for the first step to become available. 
Default is 20 minutes.\\nYou can run the rule locally in the following way:\n", + " \"\"\"))\n", + " \n", + " _ = load_report(\"MaxInitializationTime\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:02.030152Z", + "iopub.status.busy": "2023-04-10T22:56:02.019434Z", + "iopub.status.idle": "2023-04-10T22:56:02.040641Z", + "shell.execute_reply": "2023-04-10T22:56:02.040177Z" + }, + "papermill": { + "duration": 0.056378, + "end_time": "2023-04-10T22:56:02.040755", + "exception": false, + "start_time": "2023-04-10T22:56:01.984377", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "### Step duration analysis" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"d1ae4100-a182-4e40-b475-98be59dfe857\":{\"roots\":{\"references\":[{\"attributes\":{\"children\":[{\"id\":\"1288\"}]},\"id\":\"1289\",\"type\":\"Column\"},{\"attributes\":{\"text\":\"The StepOutlier rule measures step durations and checks for outliers. The rule \\n returns True if duration is larger than 3 times the standard deviation. The rule \\n also takes the parameter mode, that specifies whether steps from training or validation phase \\n should be checked. In your processing job mode was specified as None. \\n Typically the first step is taking significantly more time and to avoid the \\n rule triggering immediately, one can use n_outliers to specify the number of outliers to ignore. \\n n_outliers was set to 10.\\n The rule analysed 0 datapoints and triggered 0 times.\\n \",\"width\":900},\"id\":\"1288\",\"type\":\"Paragraph\"}],\"root_ids\":[\"1289\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"d1ae4100-a182-4e40-b475-98be59dfe857\",\"root_ids\":[\"1289\"],\"roots\":{\"1289\":\"5e726369-2766-46e2-ad8c-10f6699aee36\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + 
"metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1289" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "if analyse_phase == \"training\":\n", + " display(Markdown(\"\"\"### Step duration analysis\"\"\"))\n", + " report = load_report('StepOutlier')\n", + " if report:\n", + " parameters = report['RuleParameters']\n", + " params = report['RuleParameters'].split('\\n')\n", + " stddev = params[3].split(':')[1]\n", + " mode = params[1].split(':')[1]\n", + " n_outlier = params[2].split(':')[1]\n", + " triggered = report['RuleTriggered']\n", + " datapoints = report['Datapoints']\n", + "\n", + " text = f\"\"\"The StepOutlier rule measures step durations and checks for outliers. The rule \n", + " returns True if duration is larger than {stddev} times the standard deviation. The rule \n", + " also takes the parameter mode, that specifies whether steps from training or validation phase \n", + " should be checked. In your processing job mode was specified as {mode}. \n", + " Typically the first step is taking significantly more time and to avoid the \n", + " rule triggering immediately, one can use n_outliers to specify the number of outliers to ignore. 
\n", + " n_outliers was set to {n_outlier}.\n", + " The rule analysed {datapoints} datapoints and triggered {triggered} times.\n", + " \"\"\"\n", + "\n", + " paragraph = Paragraph(text=text, width=900)\n", + " show(column(paragraph))\n", + "\n", + " if report and len(report['Details']['step_details']) > 0:\n", + " for node_id in report['Details']['step_details']:\n", + " tmp = report['RuleParameters'].split('threshold:')\n", + " threshold = tmp[1].split('\\n')[0]\n", + " n_outliers = report['Details']['step_details'][node_id]['number_of_outliers']\n", + " mean = report['Details']['step_details'][node_id]['step_stats']['mean']\n", + " stddev = report['Details']['step_details'][node_id]['stddev']\n", + " phase = report['Details']['step_details'][node_id]['phase']\n", + " display(Markdown(f\"\"\"**Step durations on node {node_id}:**\"\"\"))\n", + " display(Markdown(f\"\"\"The following table is a summary of the statistics of step durations measured on node {node_id}.\n", + " The rule has analyzed the step duration from {phase} phase.\n", + " The average step duration on node {node_id} was {round(mean, 2)}s. \n", + " The rule detected {n_outliers} outliers, where step duration was larger than {threshold} times the standard deviation of {stddev}s\n", + " \\n\"\"\"))\n", + " step_stats_df = pd.DataFrame.from_dict(report['Details']['step_details'][node_id]['step_stats'], orient='index').T\n", + " step_stats_df.index = ['Step Durations in [s]']\n", + " pretty_print(step_stats_df)\n", + "\n", + " display(Markdown(f\"\"\"The following histogram shows the step durations measured on the different nodes. 
\n", + " You can turn on or turn off the visualization of histograms by selecting or unselecting the labels in the legend.\"\"\"))\n", + "\n", + " plot = figure(plot_height=450, \n", + " plot_width=850, \n", + " title=f\"\"\"Step durations\"\"\") \n", + "\n", + " colors = bokeh.palettes.viridis(len(report['Details']['step_details']))\n", + "\n", + " for index, node_id in enumerate(report['Details']['step_details']):\n", + " probs = report['Details']['step_details'][node_id]['probs']\n", + " binedges = report['Details']['step_details'][node_id]['binedges']\n", + "\n", + " plot.quad( top=probs,\n", + " bottom=0,\n", + " left=binedges[:-1],\n", + " right=binedges[1:],\n", + " line_color=\"white\",\n", + " fill_color=colors[index],\n", + " fill_alpha=0.7,\n", + " legend=node_id)\n", + "\n", + " plot.add_layout(Legend(), 'right') \n", + " plot.y_range.start = 0\n", + " plot.xaxis.axis_label = f\"\"\"Step durations in [s]\"\"\"\n", + " plot.yaxis.axis_label = \"Occurrences\"\n", + " plot.grid.grid_line_color = \"white\"\n", + " plot.legend.click_policy=\"hide\"\n", + " plot.legend.location = 'center_right'\n", + " show(plot)\n", + "\n", + " if report['RuleTriggered'] > 0:\n", + "\n", + " text=f\"\"\"To get a better understanding of what may have caused those outliers,\n", + " we correlate the timestamps of step outliers with other framework metrics that happened at the same time.\n", + " The left chart shows how much time was spent in the different framework\n", + " metrics aggregated by event phase. The chart on the right shows the histogram of normal step durations (without\n", + " outliers). The following chart shows how much time was spent in the different \n", + " framework metrics when step outliers occurred. 
In this chart framework metrics are not aggregated byphase.\"\"\"\n", + " plots = []\n", + " if 'phase' in report['Details']:\n", + " text = f\"\"\"{text} The chart (in the middle) shows whether step outliers mainly happened during TRAIN or EVAL phase.\n", + " \"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['phase'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"The ratio between the time spent on the TRAIN/EVAL phase\")\n", + " plots.append(plot)\n", + "\n", + " if 'forward_backward' in report['Details'] and len(report['Details']['forward_backward']) > 0:\n", + "\n", + " event = max(report['Details']['forward_backward'], key=report['Details']['forward_backward'].get)\n", + " perc = report['Details']['forward_backward'][event]\n", + "\n", + " text = f\"\"\"{text} The pie chart on the right shows a detailed breakdown. \n", + " It shows that {int(perc)}% of the training time was spent on event \"{event}\".\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['forward_backward'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"The Ratio between forward and backward pass\") \n", + " plots.append(plot)\n", + "\n", + " if len(plots) > 0:\n", + " paragraph = Paragraph(text=text, width=900)\n", + " show(column(paragraph, row(plots)))\n", + "\n", + " plots = []\n", + " text = \"\"\n", + " if 'ratio' in report['Details'] and len(report['Details']['ratio']) > 0:\n", + "\n", + " key = list(report['Details']['ratio'].keys())[0]\n", + " ratio = report['Details']['ratio'][key]\n", + "\n", + " text = f\"\"\"The following pie chart shows a breakdown of the CPU/GPU operators executed during the step outliers. 
\n", + " It shows that {int(ratio)}% of the training time was spent on executing operators in \"{key}\".\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['ratio'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"The ratio between CPU/GPU operators\")\n", + " plots.append(plot)\n", + "\n", + "\n", + " if 'general' in report['Details'] and len(report['Details']['general']) > 0:\n", + "\n", + " event = max(report['Details']['general'], key=report['Details']['general'].get)\n", + " perc = report['Details']['general'][event]\n", + "\n", + " plot = create_piechart(report['Details']['general'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"General metrics recorded in framework \")\n", + " plots.append(plot)\n", + "\n", + " if len(plots) > 0:\n", + " paragraph = Paragraph(text=text, width=900)\n", + " show(column(paragraph, row(plots)))\n", + "\n", + " plots = []\n", + " text = \"\"\n", + " if 'horovod' in report['Details'] and len(report['Details']['horovod']) > 0:\n", + "\n", + " event = max(report['Details']['horovod'], key=report['Details']['horovod'].get)\n", + " perc = report['Details']['horovod'][event]\n", + " text = f\"\"\"The following pie chart shows a detailed breakdown of the Horovod metrics that have been\n", + " recorded when step outliers happened. 
The most expensive function was {event} with {int(perc)}%\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['horovod'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"General metrics recorded in framework \")\n", + "\n", + " paragraph = Paragraph(text=text, width=900)\n", + " show(column(paragraph, row(plot))) " + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:02.103161Z", + "iopub.status.busy": "2023-04-10T22:56:02.102585Z", + "iopub.status.idle": "2023-04-10T22:56:02.220635Z", + "shell.execute_reply": "2023-04-10T22:56:02.221011Z" + }, + "papermill": { + "duration": 0.157386, + "end_time": "2023-04-10T22:56:02.221147", + "exception": false, + "start_time": "2023-04-10T22:56:02.063761", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "### GPU utilization analysis\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Usage per GPU** \n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"94fdbb65-9f05-41d4-90b1-4b659c8054a9\":{\"roots\":{\"references\":[{\"attributes\":{\"text\":\"The LowGPUUtilization rule checks for a low and fluctuating GPU usage. If the GPU usage is \\n consistently low, it might be caused by bottlenecks or a small batch size. If usage is heavily \\n fluctuating, it can be due to bottlenecks or blocking calls. The rule computed the 95th and 5th \\n percentile of GPU utilization on 500 continuous datapoints and found 28 cases where \\n p95 was above 70% and p5 was below 10%. If p95 is high and p5 is low,\\n it might indicate that the GPU usage is highly fluctuating. If both values are very low, \\n it would mean that the machine is underutilized. During initialization, the GPU usage is likely zero, \\n so the rule skipped the first 1000 data points.\\n The rule analysed 2660 datapoints and triggered 28 times.\",\"width\":800},\"id\":\"1321\",\"type\":\"Paragraph\"}],\"root_ids\":[\"1321\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"94fdbb65-9f05-41d4-90b1-4b659c8054a9\",\"root_ids\":[\"1321\"],\"roots\":{\"1321\":\"06c5c195-f988-403a-bcca-b2aa9f90d7b4\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" 
+ ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1321" + } + }, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"62b1b33c-ade3-4a56-abcb-0322f4979221\":{\"roots\":{\"references\":[{\"attributes\":{\"text\":\"Your training job is underutilizing the instance. You may want to consider\\n to either switch to a smaller instance type or to increase the batch size. \\n The last time that the LowGPUUtilization rule was triggered in your training job was on 04/10/2023 at 22:54:00.\\n The following boxplots are a snapshot from the timestamps. \\n They show the utilization per GPU (without outliers).\\n To get a better understanding of the workloads throughout the whole training,\\n you can check the workload histogram in the next section.\",\"width\":800},\"id\":\"1353\",\"type\":\"Paragraph\"}],\"root_ids\":[\"1353\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"62b1b33c-ade3-4a56-abcb-0322f4979221\",\"root_ids\":[\"1353\"],\"roots\":{\"1353\":\"055bce5c-0f2d-48a5-bdc9-38ebd2aa9e3c\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1353" + } + }, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**GPU 
utilization of gpu0 on node algo-1:**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"08f4e6b0-4670-4cf1-a953-c5fac34d634d\":{\"roots\":{\"references\":[{\"attributes\":{\"text\":\" The max utilization of gpu0 on node algo-1 was 56.0% and the 95th percentile was only 47.0%. \\n gpu0 on node algo-1 is underutilized and the 5th percentile was only 0.0%\",\"width\":900},\"id\":\"1443\",\"type\":\"Paragraph\"}],\"root_ids\":[\"1443\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"08f4e6b0-4670-4cf1-a953-c5fac34d634d\",\"root_ids\":[\"1443\"],\"roots\":{\"1443\":\"f7e04964-c5fa-4186-98dd-951f22a12b66\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1443" + } + }, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"93f676fb-0bca-4f74-b839-346a35fdc0e5\":{\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"1396\"}],\"center\":[{\"id\":\"1399\"},{\"id\":\"1403\"}],\"left\":[{\"id\":\"1400\"}],\"plot_height\":350,\"plot_width\":1000,\"renderers\":[{\"id\":\"1416\"},{\"id\":\"1421\"},{\"id\":\"1426\"},{\"id\":\"1431\"},{\"id\":\"1436\"},{\"id\":\"1441\"}],\"title\":{\"id\":\"1386\"},\"toolbar\":{\"id\":\"1408\"},\"x_range\":{\"id\":\"1388\"},\"x_scale\":{\"id\":\"1392\"},\"y_range\":{\"id\":\"1390\"},\"y_scale\":{\"id\":\"1394\"}},\"id\":\"1385\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":53.0}},\"id\":\"1439\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"1520\",\"type\":\"Selection\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"1518\"},\"selection_policy\":{\"id\":\"1519\"}},\"id\":\"1428\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"end\":17},\"id\":\"1388\",\"type\":\"Range1d\"},{\"attributes\":{\"axis_label\":\"Utilization in 
%\",\"formatter\":{\"id\":\"1509\"},\"ticker\":{\"id\":\"1401\"}},\"id\":\"1400\",\"type\":\"LinearAxis\"},{\"attributes\":{\"data_source\":{\"id\":\"1438\"},\"glyph\":{\"id\":\"1439\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1440\"},\"selection_glyph\":null,\"view\":{\"id\":\"1442\"}},\"id\":\"1441\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"axis\":{\"id\":\"1396\"},\"grid_line_color\":null,\"grid_line_width\":0,\"ticker\":null},\"id\":\"1399\",\"type\":\"Grid\"},{\"attributes\":{\"source\":{\"id\":\"1423\"}},\"id\":\"1427\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1521\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"1520\"},\"selection_policy\":{\"id\":\"1521\"}},\"id\":\"1433\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1394\",\"type\":\"LinearScale\"},{\"attributes\":{\"source\":{\"id\":\"1438\"}},\"id\":\"1442\",\"type\":\"CDSView\"},{\"attributes\":{\"data_source\":{\"id\":\"1423\"},\"glyph\":{\"id\":\"1424\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1425\"},\"selection_glyph\":null,\"view\":{\"id\":\"1427\"}},\"id\":\"1426\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1519\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"bottom\":{\"value\":28.0},\"fill_color\":{\"value\":\"#440154\"},\"top\":{\"value\":28.0},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"1429\",\"type\":\"VBar\"},{\"attributes\":{},\"id\":\"1517\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1401\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1518\",\"type\":\"Selection\"},{\"attributes\":{\"formatter\":{\"id\":\"1511\"},\"major_label_overrides\":{\"1\":\"gpu0\"},\"major_label_text_font_size\":\"10px\",\"ticker\":{\"id\":\"1475\"}},\"id\":\"1396\",\"type\":\"LinearAxis\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\"
:{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":53.0}},\"id\":\"1440\",\"type\":\"Rect\"},{\"attributes\":{\"bottom\":{\"value\":38.0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#FDE725\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":28.0},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"1425\",\"type\":\"VBar\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":13.0}},\"id\":\"1435\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"1516\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1392\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"1522\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1515\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":53.0},\"y1\":{\"value\":38.0}},\"id\":\"1415\",\"type\":\"Segment\"},{\"attributes\":{\"source\":{\"id\":\"1428\"}},\"id\":\"1432\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1509\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1405\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1404\"},{\"id\":\"1405\"},{\"id\":\"1406\"},{\"id\":\"1407\"}]},\"id\":\"1408\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"1406\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"1523\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data_source\":{\"id\":\"1428\"},\"glyph\":{\"id\":\"1429\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1430\"},\"selection_glyph\":null,\"view\":{\"id\":\"1432\"}},\"id\":\"1431\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"axis\":{\"id\"
:\"1400\"},\"dimension\":1,\"grid_line_color\":\"white\",\"grid_line_width\":0,\"ticker\":null},\"id\":\"1403\",\"type\":\"Grid\"},{\"attributes\":{\"source\":{\"id\":\"1413\"}},\"id\":\"1417\",\"type\":\"CDSView\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":13.0}},\"id\":\"1434\",\"type\":\"Rect\"},{\"attributes\":{\"data_source\":{\"id\":\"1413\"},\"glyph\":{\"id\":\"1414\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1415\"},\"selection_glyph\":null,\"view\":{\"id\":\"1417\"}},\"id\":\"1416\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"ticks\":[0,1]},\"id\":\"1475\",\"type\":\"FixedTicker\"},{\"attributes\":{\"text\":\"Node algo-1\"},\"id\":\"1386\",\"type\":\"Title\"},{\"attributes\":{\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":13.0},\"y1\":{\"value\":28.0}},\"id\":\"1419\",\"type\":\"Segment\"},{\"attributes\":{\"bottom\":{\"value\":28.0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#440154\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":28.0},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"1430\",\"type\":\"VBar\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"1514\"},\"selection_policy\":{\"id\":\"1515\"}},\"id\":\"1418\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"1522\"},\"selection_policy\":{\"id\":\"1523\"}},\"id\":\"1438\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1511\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1390\",\"type\":\"DataRange1d\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":13.0},\"y1\":{\"value\":28.0}},\"id\":\"1420\",\"type\":\"Segment\"},{\"attributes\":{\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":53.0},\"y1\":{\"value\":38.0}},\"id\":\"1414\",\"type\":\"Segment\"
},{\"attributes\":{\"source\":{\"id\":\"1433\"}},\"id\":\"1437\",\"type\":\"CDSView\"},{\"attributes\":{\"callback\":null},\"id\":\"1404\",\"type\":\"HoverTool\"},{\"attributes\":{},\"id\":\"1513\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"1512\"},\"selection_policy\":{\"id\":\"1513\"}},\"id\":\"1413\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1512\",\"type\":\"Selection\"},{\"attributes\":{\"bottom\":{\"value\":38.0},\"fill_color\":{\"value\":\"#FDE725\"},\"top\":{\"value\":28.0},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"1424\",\"type\":\"VBar\"},{\"attributes\":{\"source\":{\"id\":\"1418\"}},\"id\":\"1422\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1407\",\"type\":\"PanTool\"},{\"attributes\":{\"data_source\":{\"id\":\"1433\"},\"glyph\":{\"id\":\"1434\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1435\"},\"selection_glyph\":null,\"view\":{\"id\":\"1437\"}},\"id\":\"1436\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"1516\"},\"selection_policy\":{\"id\":\"1517\"}},\"id\":\"1423\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1514\",\"type\":\"Selection\"},{\"attributes\":{\"data_source\":{\"id\":\"1418\"},\"glyph\":{\"id\":\"1419\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1420\"},\"selection_glyph\":null,\"view\":{\"id\":\"1422\"}},\"id\":\"1421\",\"type\":\"GlyphRenderer\"}],\"root_ids\":[\"1385\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"93f676fb-0bca-4f74-b839-346a35fdc0e5\",\"root_ids\":[\"1385\"],\"roots\":{\"1385\":\"dffd2d3c-030b-4a9e-9434-c691ea52f7fe\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + 
" if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1385" + } + }, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**GPU utilization of gpu0 on node algo-2:**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"fa3b8514-cf5f-4fdc-b7a6-2fb7260de363\":{\"roots\":{\"references\":[{\"attributes\":{\"text\":\" The max utilization of gpu0 on node algo-2 was 58.0% and the 95th percentile was only 51.0%. \\n gpu0 on node algo-2 is underutilized and the 5th percentile was only 0.0% The difference between 5th percentile 0.0% and 95th percentile 51.0% is quite \\n significant, which means that utilization on gpu0 is fluctuating quite a lot.\\n\",\"width\":900},\"id\":\"1662\",\"type\":\"Paragraph\"}],\"root_ids\":[\"1662\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"fa3b8514-cf5f-4fdc-b7a6-2fb7260de363\",\"root_ids\":[\"1662\"],\"roots\":{\"1662\":\"c4acddc9-48f8-4bf3-93bc-d8e435538784\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1662" + } + }, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"3c1ec19a-944c-4d6b-9547-0441234c590c\":{\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"1615\"}],\"center\":[{\"id\":\"1618\"},{\"id\":\"1622\"}],\"left\":[{\"id\":\"1619\"}],\"plot_height\":350,\"plot_width\":1000,\"renderers\":[{\"id\":\"1635\"},{\"id\":\"1640\"},{\"id\":\"1645\"},{\"id\":\"1650\"},{\"id\":\"1655\"},{\"id\":\"1660\"}],\"title\":{\"id\":\"1605\"},\"toolbar\":{\"id\":\"1627\"},\"x_range\":{\"id\":\"1607\"},\"x_scale\":{\"id\":\"1611\"},\"y_range\":{\"id\":\"1609\"},\"y_scale\":{\"id\":\"1613\"}},\"id\":\"1604\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"1625\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"1771\",\"type\":\"Selection\"},{\"attributes\":{\"bottom\":{\"value\":39.25},\"fill_color\":{\"value\":\"#FDE725\"},\"top\":{\"value\":29.0},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"1643\",\"type\":\"VBar\"},{\"attributes\":{},\"id\":\"1624\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"source\":{\"id\":\"1637\"}},\"id\":\"1641\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1760\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1611\",\"type\":\"LinearScale\"},{\"attributes\":{\"source\":{\"id\":\"1652\"}},\"id\":\"1656\",\"type\":\"CDSView\"},{\"attributes\":{\"callback\":null},\"id\":\"1623\",\"type\":\"HoverTool\"},{\"attributes\":{},\"id\":\"1766\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1609\",\"type\":\"DataRange1d\"},{\"attributes\":{\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":11.125},\"y1\":{\"value\":28.0}},\"id\":\"1638\",\"type\":\"Segment\"},{\"attributes\":{},\"id\":\"1626\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"1774\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data_source\":{\"id\":\"1642\"},\"g
lyph\":{\"id\":\"1643\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1644\"},\"selection_glyph\":null,\"view\":{\"id\":\"1646\"}},\"id\":\"1645\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1620\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1765\",\"type\":\"Selection\"},{\"attributes\":{\"source\":{\"id\":\"1642\"}},\"id\":\"1646\",\"type\":\"CDSView\"},{\"attributes\":{\"bottom\":{\"value\":29.0},\"fill_color\":{\"value\":\"#440154\"},\"top\":{\"value\":28.0},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"1648\",\"type\":\"VBar\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":11.125}},\"id\":\"1653\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"1763\",\"type\":\"Selection\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1623\"},{\"id\":\"1624\"},{\"id\":\"1625\"},{\"id\":\"1626\"}]},\"id\":\"1627\",\"type\":\"Toolbar\"},{\"attributes\":{\"formatter\":{\"id\":\"1762\"},\"major_label_overrides\":{\"1\":\"gpu0\"},\"major_label_text_font_size\":\"10px\",\"ticker\":{\"id\":\"1710\"}},\"id\":\"1615\",\"type\":\"LinearAxis\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"1763\"},\"selection_policy\":{\"id\":\"1764\"}},\"id\":\"1632\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"1767\"},\"selection_policy\":{\"id\":\"1768\"}},\"id\":\"1642\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":11.125}},\"id\":\"1654\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"1613\",\"type\":\"LinearScale
\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"1765\"},\"selection_policy\":{\"id\":\"1766\"}},\"id\":\"1637\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"bottom\":{\"value\":29.0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#440154\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":28.0},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"1649\",\"type\":\"VBar\"},{\"attributes\":{},\"id\":\"1762\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"data_source\":{\"id\":\"1637\"},\"glyph\":{\"id\":\"1638\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1639\"},\"selection_glyph\":null,\"view\":{\"id\":\"1641\"}},\"id\":\"1640\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":56.125}},\"id\":\"1658\",\"type\":\"Rect\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"1771\"},\"selection_policy\":{\"id\":\"1772\"}},\"id\":\"1652\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"1773\"},\"selection_policy\":{\"id\":\"1774\"}},\"id\":\"1657\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"data_source\":{\"id\":\"1647\"},\"glyph\":{\"id\":\"1648\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1649\"},\"selection_glyph\":null,\"view\":{\"id\":\"1651\"}},\"id\":\"1650\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"end\":17},\"id\":\"1607\",\"type\":\"Range1d\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":11.125},\"y1\":{\"value\":28.0}},\"id\":\"1639\",\"type\":\"Segment\"},{\"attributes\":{\"bottom\":{\"value\":39.25},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#FDE725\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":29.0},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"1644\",\"type
\":\"VBar\"},{\"attributes\":{\"source\":{\"id\":\"1647\"}},\"id\":\"1651\",\"type\":\"CDSView\"},{\"attributes\":{\"ticks\":[0,1]},\"id\":\"1710\",\"type\":\"FixedTicker\"},{\"attributes\":{\"data_source\":{\"id\":\"1657\"},\"glyph\":{\"id\":\"1658\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1659\"},\"selection_glyph\":null,\"view\":{\"id\":\"1661\"}},\"id\":\"1660\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"axis\":{\"id\":\"1619\"},\"dimension\":1,\"grid_line_color\":\"white\",\"grid_line_width\":0,\"ticker\":null},\"id\":\"1622\",\"type\":\"Grid\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"1769\"},\"selection_policy\":{\"id\":\"1770\"}},\"id\":\"1647\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"axis\":{\"id\":\"1615\"},\"grid_line_color\":null,\"grid_line_width\":0,\"ticker\":null},\"id\":\"1618\",\"type\":\"Grid\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":56.125},\"y1\":{\"value\":39.25}},\"id\":\"1634\",\"type\":\"Segment\"},{\"attributes\":{},\"id\":\"1770\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1768\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"axis_label\":\"Utilization in 
%\",\"formatter\":{\"id\":\"1760\"},\"ticker\":{\"id\":\"1620\"}},\"id\":\"1619\",\"type\":\"LinearAxis\"},{\"attributes\":{\"source\":{\"id\":\"1632\"}},\"id\":\"1636\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1767\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1769\",\"type\":\"Selection\"},{\"attributes\":{\"data_source\":{\"id\":\"1652\"},\"glyph\":{\"id\":\"1653\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1654\"},\"selection_glyph\":null,\"view\":{\"id\":\"1656\"}},\"id\":\"1655\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1764\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1772\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data_source\":{\"id\":\"1632\"},\"glyph\":{\"id\":\"1633\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1634\"},\"selection_glyph\":null,\"view\":{\"id\":\"1636\"}},\"id\":\"1635\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"source\":{\"id\":\"1657\"}},\"id\":\"1661\",\"type\":\"CDSView\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":56.125}},\"id\":\"1659\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"1773\",\"type\":\"Selection\"},{\"attributes\":{\"text\":\"Node algo-2\"},\"id\":\"1605\",\"type\":\"Title\"},{\"attributes\":{\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":56.125},\"y1\":{\"value\":39.25}},\"id\":\"1633\",\"type\":\"Segment\"}],\"root_ids\":[\"1604\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"3c1ec19a-944c-4d6b-9547-0441234c590c\",\"root_ids\":[\"1604\"],\"roots\":{\"1604\":\"5fc8b594-6ff9-411f-9826-174796e532bb\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) 
{\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1604" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "if analyse_phase == \"training\":\n", + " display(Markdown(\"\"\"### GPU utilization analysis\\n\\n\"\"\"))\n", + " display(Markdown(\"\"\"**Usage per GPU** \\n\\n\"\"\"))\n", + " report = load_report('LowGPUUtilization')\n", + " if report:\n", + " params = report['RuleParameters'].split('\\n')\n", + " threshold_p95 = params[0].split(':')[1]\n", + " threshold_p5 = params[1].split(':')[1]\n", + " window = params[2].split(':')[1]\n", + " patience = params[3].split(':')[1]\n", + " violations = report['Violations']\n", + " triggered = report['RuleTriggered']\n", + " datapoints = report['Datapoints']\n", + " \n", + " text=Paragraph(text=f\"\"\"The LowGPUUtilization rule checks for a low and fluctuating GPU usage. If the GPU usage is \n", + " consistently low, it might be caused by bottlenecks or a small batch size. If usage is heavily \n", + " fluctuating, it can be due to bottlenecks or blocking calls. The rule computed the 95th and 5th \n", + " percentile of GPU utilization on {window} continuous datapoints and found {violations} cases where \n", + " p95 was above {threshold_p95}% and p5 was below {threshold_p5}%. If p95 is high and p5 is low,\n", + " it might indicate that the GPU usage is highly fluctuating. 
If both values are very low, \n", + " it would mean that the machine is underutilized. During initialization, the GPU usage is likely zero, \n", + " so the rule skipped the first {patience} data points.\n", + " The rule analysed {datapoints} datapoints and triggered {triggered} times.\"\"\", width=800)\n", + " show(text)\n", + "\n", + " \n", + " if len(report['Details']) > 0:\n", + " \n", + " timestamp = us_since_epoch_to_human_readable_time(report['Details']['last_timestamp'])\n", + " date = datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S:%f')\n", + " day = date.date().strftime(\"%m/%d/%Y\")\n", + " hour = date.time().strftime(\"%H:%M:%S\")\n", + " text = Paragraph(text=f\"\"\"Your training job is underutilizing the instance. You may want to consider\n", + " to either switch to a smaller instance type or to increase the batch size. \n", + " The last time that the LowGPUUtilization rule was triggered in your training job was on {day} at {hour}.\n", + " The following boxplots are a snapshot from the timestamps. 
\n", + " They show the utilization per GPU (without outliers).\n", + " To get a better understanding of the workloads throughout the whole training,\n", + " you can check the workload histogram in the next section.\"\"\", width=800)\n", + " show(text)\n", + " \n", + " del report['Details']['last_timestamp']\n", + " \n", + " for node_id in report['Details']:\n", + " \n", + " plot = figure(plot_height=350, \n", + " plot_width=1000,\n", + " toolbar_location='right',\n", + " tools=\"hover,wheel_zoom,reset,pan\", \n", + " title=f\"Node {node_id}\",\n", + " x_range=(0,17),\n", + " )\n", + " \n", + " for index, key in enumerate(report['Details'][node_id]):\n", + " display(Markdown(f\"\"\"**GPU utilization of {key} on node {node_id}:**\"\"\"))\n", + " text = \"\"\n", + " gpu_max = report['Details'][node_id][key]['gpu_max']\n", + " p_95 = report['Details'][node_id][key]['gpu_95']\n", + " p_5 = report['Details'][node_id][key]['gpu_5']\n", + " text = f\"\"\"{text} The max utilization of {key} on node {node_id} was {gpu_max}%\"\"\"\n", + " if p_95 < int(threshold_p95): \n", + " text = f\"\"\"{text} and the 95th percentile was only {p_95}%. 
\n", + " {key} on node {node_id} is underutilized\"\"\"\n", + " if p_5 < int(threshold_p5): \n", + " text = f\"\"\"{text} and the 5th percentile was only {p_5}%\"\"\"\n", + " if p_95 - p_5 > 50:\n", + " text = f\"\"\"{text} The difference between 5th percentile {p_5}% and 95th percentile {p_95}% is quite \n", + " significant, which means that utilization on {key} is fluctuating quite a lot.\\n\"\"\"\n", + " \n", + " upper = report['Details'][node_id][key]['upper']\n", + " lower = report['Details'][node_id][key]['lower']\n", + " p75 = report['Details'][node_id][key]['p75']\n", + " p25 = report['Details'][node_id][key]['p25']\n", + " p50 = report['Details'][node_id][key]['p50']\n", + "\n", + " plot.segment(index+1, upper, index+1, p75, line_color=\"black\")\n", + " plot.segment(index+1, lower, index+1, p25, line_color=\"black\")\n", + "\n", + " plot.vbar(index+1, 0.7, p50, p75, fill_color=\"#FDE725\", line_color=\"black\")\n", + " plot.vbar(index+1, 0.7, p25, p50, fill_color=\"#440154\", line_color=\"black\")\n", + "\n", + " plot.rect(index+1, lower, 0.2, 0.01, line_color=\"black\")\n", + " plot.rect(index+1, upper, 0.2, 0.01, line_color=\"black\")\n", + "\n", + " plot.xaxis.major_label_overrides[index+1] = key\n", + " plot.xgrid.grid_line_color = None\n", + " plot.ygrid.grid_line_color = \"white\"\n", + " plot.grid.grid_line_width = 0\n", + "\n", + " plot.xaxis.major_label_text_font_size=\"10px\"\n", + " text=Paragraph(text=f\"\"\"{text}\"\"\", width=900)\n", + " show(text)\n", + " plot.yaxis.axis_label = \"Utilization in %\"\n", + " plot.xaxis.ticker = np.arange(index+2)\n", + " \n", + " show(plot)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:02.287605Z", + "iopub.status.busy": "2023-04-10T22:56:02.287037Z", + "iopub.status.idle": "2023-04-10T22:56:02.385070Z", + "shell.execute_reply": "2023-04-10T22:56:02.385447Z" + }, + "papermill": { + "duration": 0.13713, + "end_time": 
"2023-04-10T22:56:02.385584", + "exception": false, + "start_time": "2023-04-10T22:56:02.248454", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "**Workload balancing**\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"9fc5d011-6b09-4458-a181-69ea020ad203\":{\"roots\":{\"references\":[{\"attributes\":{\"text\":\"The LoadBalancing rule helps to detect issues in workload balancing \\n between multiple GPUs. \\n It computes a histogram of GPU utilization values for each GPU and compares then the \\n similarity between histograms. The rule checked if the distance of histograms is larger than the \\n threshold of 0.2.\\n During initialization utilization is likely zero, so the rule skipped the first 1000 data points.\\n \",\"width\":900},\"id\":\"1855\",\"type\":\"Paragraph\"}],\"root_ids\":[\"1855\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"9fc5d011-6b09-4458-a181-69ea020ad203\",\"root_ids\":[\"1855\"],\"roots\":{\"1855\":\"03c36bf4-ddfe-4d04-b57f-6291d4dca229\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1855" + } + }, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"adf4d98e-bc68-4150-b6f2-e3798a489b40\":{\"roots\":{\"references\":[{\"attributes\":{\"children\":[{\"id\":\"1967\"},{\"id\":\"1919\"}]},\"id\":\"1968\",\"type\":\"Column\"},{\"attributes\":{\"start\":0},\"id\":\"1924\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"1931\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis_label\":\"Utilization\",\"formatter\":{\"id\":\"1960\"},\"ticker\":{\"id\":\"1931\"}},\"id\":\"1930\",\"type\":\"LinearAxis\"},{\"attributes\":{\"text\":\"Workloads on node algo-1\"},\"id\":\"1920\",\"type\":\"Title\"},{\"attributes\":{\"axis\":{\"id\":\"1934\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1937\",\"type\":\"Grid\"},{\"attributes\":{\"axis\":{\"id\":\"1930\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"1933\",\"type\":\"Grid\"},{\"attributes\":{\"end\":100,\"start\":-1},\"id\":\"1922\",\"type\":\"Range1d\"},{\"attributes\":{},\"id\":\"1935\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis_label\":\"Occurrences\",\"formatter\":{\"id\":\"1958\"},\"ticker\":{\"id\":\"1935\"}},\"id\":\"1934\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"1926\",\"type\":\"LinearScale\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#440154\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1954\",\"type\":\"Quad\"},{\"attributes\":{\"below\":[{\"id\":\"1930\"}],\"center\":[{\"id\":\"1933\"},{\"id\":\"1937\"},{\"id\":\"1965\"}],\"left\":[{\"id\":\"1934\"}],\"plot_height\":450,\"plot_width\":850,\"renderers\":[{\"id\":\"1955\"}],\"title\":{\"id\":\"1920\"},\"toolbar\":{\"id\":\"1945\"},\"x_range\":{\"id\":\"1922\"},\"x_scal
e\":{\"id\":\"1926\"},\"y_range\":{\"id\":\"1924\"},\"y_scale\":{\"id\":\"1928\"}},\"id\":\"1919\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"data_source\":{\"id\":\"1952\"},\"glyph\":{\"id\":\"1953\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1954\"},\"selection_glyph\":null,\"view\":{\"id\":\"1956\"}},\"id\":\"1955\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.8},\"fill_color\":{\"value\":\"#440154\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"1953\",\"type\":\"Quad\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1938\"},{\"id\":\"1939\"},{\"id\":\"1940\"},{\"id\":\"1941\"},{\"id\":\"1942\"},{\"id\":\"1943\"}]},\"id\":\"1945\",\"type\":\"Toolbar\"},{\"attributes\":{\"label\":{\"value\":\"gpu0\"},\"renderers\":[{\"id\":\"1955\"}]},\"id\":\"1966\",\"type\":\"LegendItem\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1944\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[530,3,4,2,1,0,0,4,0,0,1,1,24,24,937,99,119,115,120,114,125,100,110,76,71,50,19,10,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"1962\"},\"selection_policy\":{\"id\":\"1963\"}},\"id\":\"1952\",\"type\":\"ColumnDataSo
urce\"},{\"attributes\":{},\"id\":\"1938\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"1958\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1928\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"1939\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"text\":\"The following histogram shows the workload per GPU on node algo-1. \\n You can enable/disable the visualization of a workload by clicking on the label in the legend.\\n \"},\"id\":\"1967\",\"type\":\"Paragraph\"},{\"attributes\":{\"overlay\":{\"id\":\"1944\"}},\"id\":\"1940\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"1941\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"1962\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1960\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1942\",\"type\":\"ResetTool\"},{\"attributes\":{\"source\":{\"id\":\"1952\"}},\"id\":\"1956\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1943\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"1963\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"click_policy\":\"hide\",\"items\":[{\"id\":\"1966\"}]},\"id\":\"1965\",\"type\":\"Legend\"}],\"root_ids\":[\"1968\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"adf4d98e-bc68-4150-b6f2-e3798a489b40\",\"root_ids\":[\"1968\"],\"roots\":{\"1968\":\"3e54a45f-5960-4a5a-a42b-4de755cf4e47\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + 
" }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1968" + } + }, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"5ff507cc-ad97-4e7b-a472-20a37aaf4081\":{\"roots\":{\"references\":[{\"attributes\":{\"children\":[{\"id\":\"2128\"},{\"id\":\"2080\"}]},\"id\":\"2129\",\"type\":\"Column\"},{\"attributes\":{},\"id\":\"2102\",\"type\":\"SaveTool\"},{\"attributes\":{\"end\":100,\"start\":-1},\"id\":\"2083\",\"type\":\"Range1d\"},{\"attributes\":{},\"id\":\"2099\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"2103\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"2104\",\"type\":\"HelpTool\"},{\"attributes\":{\"text\":\"Workloads on node algo-2\"},\"id\":\"2081\",\"type\":\"Title\"},{\"attributes\":{\"label\":{\"value\":\"gpu0\"},\"renderers\":[{\"id\":\"2116\"}]},\"id\":\"2127\",\"type\":\"LegendItem\"},{\"attributes\":{\"text\":\"The following histogram shows the workload per GPU on node algo-2. 
\\n You can enable/disable the visualization of a workload by clicking on the label in the legend.\\n \"},\"id\":\"2128\",\"type\":\"Paragraph\"},{\"attributes\":{},\"id\":\"2089\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2119\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"2124\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"start\":0},\"id\":\"2085\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"2087\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2121\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"2105\",\"type\":\"BoxAnnotation\"},{\"attributes\":{},\"id\":\"2123\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"2096\",\"type\":\"BasicTicker\"},{\"attributes\":{\"below\":[{\"id\":\"2091\"}],\"center\":[{\"id\":\"2094\"},{\"id\":\"2098\"},{\"id\":\"2126\"}],\"left\":[{\"id\":\"2095\"}],\"plot_height\":450,\"plot_width\":850,\"renderers\":[{\"id\":\"2116\"}],\"title\":{\"id\":\"2081\"},\"toolbar\":{\"id\":\"2106\"},\"x_range\":{\"id\":\"2083\"},\"x_scale\":{\"id\":\"2087\"},\"y_range\":{\"id\":\"2085\"},\"y_scale\":{\"id\":\"2089\"}},\"id\":\"2080\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"axis_label\":\"Utilization\",\"formatter\":{\"id\":\"2121\"},\"ticker\":{\"id\":\"2092\"}},\"id\":\"2091\",\"type\":\"LinearAxis\"},{\"attributes\":{\"source\":{\"id\":\"2113\"}},\"id\":\"2117\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"2100\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"click_policy\":\"hide\",\"items\":[{\"id\":\"2127\"}]},\"id\":\"2126\",\"type\":\"Legend\"},{\"attributes\":{\"data\":{\"left\":[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,
84,86,88,90,92,94,96],\"right\":[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98],\"top\":[556,1,2,5,1,0,0,0,0,0,0,2,7,24,860,105,117,88,122,105,114,119,103,106,87,78,42,11,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},\"selected\":{\"id\":\"2123\"},\"selection_policy\":{\"id\":\"2124\"}},\"id\":\"2113\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"axis_label\":\"Occurrences\",\"formatter\":{\"id\":\"2119\"},\"ticker\":{\"id\":\"2096\"}},\"id\":\"2095\",\"type\":\"LinearAxis\"},{\"attributes\":{\"axis\":{\"id\":\"2095\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2098\",\"type\":\"Grid\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#440154\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2115\",\"type\":\"Quad\"},{\"attributes\":{\"axis\":{\"id\":\"2091\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2094\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"2092\",\"type\":\"BasicTicker\"},{\"attributes\":{\"overlay\":{\"id\":\"2105\"}},\"id\":\"2101\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.8},\"fill_color\":{\"value\":\"#440154\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2114\",\"type\":\"Quad\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2099\"},{\"id\":\"2100\"},{\"id\":\"2101\"},{\"id\":\"2102\"},{\"id\":\"2103\"},{\"id\":\"2104\"}]},\"id\":\"2106\",\"type\":\"Toolbar\"},{\"attributes\":{\"data_source\":{\"id\":\"2113\"},\"glyph\":{\"id\":\"2114\"},\"hover_glyph\":null,\"muted_glyph\":n
ull,\"nonselection_glyph\":{\"id\":\"2115\"},\"selection_glyph\":null,\"view\":{\"id\":\"2117\"}},\"id\":\"2116\",\"type\":\"GlyphRenderer\"}],\"root_ids\":[\"2129\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"5ff507cc-ad97-4e7b-a472-20a37aaf4081\",\"root_ids\":[\"2129\"],\"roots\":{\"2129\":\"35126f56-c6ab-44d7-a563-257e9b0b368e\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "2129" + } + }, + "output_type": "display_data" + } + ], + "source": [ + " \n", + "if analyse_phase == \"training\": \n", + " display(Markdown(\"\"\"**Workload balancing**\\n\\n\"\"\")) \n", + " report = load_report('LoadBalancing')\n", + " if report:\n", + " params = report['RuleParameters'].split('\\n')\n", + " threshold = params[0].split(':')[1]\n", + " patience = params[1].split(':')[1]\n", + " triggered = report['RuleTriggered']\n", + " datapoints = report['Datapoints']\n", + " \n", + " paragraph = Paragraph(text=f\"\"\"The LoadBalancing rule helps to detect issues in workload balancing \n", + " between multiple GPUs. \n", + " It computes a histogram of GPU utilization values for each GPU and compares then the \n", + " similarity between histograms. 
The rule checked if the distance of histograms is larger than the \n", + " threshold of {threshold}.\n", + " During initialization utilization is likely zero, so the rule skipped the first {patience} data points.\n", + " \"\"\", width=900)\n", + " show(paragraph)\n", + " \n", + " if len(report['Details']) > 0:\n", + " for node_id in report['Details']: \n", + " \n", + " \n", + " text = f\"\"\"The following histogram shows the workload per GPU on node {node_id}. \n", + " You can enable/disable the visualization of a workload by clicking on the label in the legend.\n", + " \"\"\"\n", + " if len(report['Details']) == 1 and len(report['Details'][node_id]['workloads']) == 1:\n", + " text = f\"\"\"{text} Your training job only used one GPU so there is no workload balancing issue.\"\"\"\n", + " \n", + " plot = figure(plot_height=450, \n", + " plot_width=850, \n", + " x_range=(-1,100),\n", + " title=f\"\"\"Workloads on node {node_id}\"\"\")\n", + " \n", + " colors = bokeh.palettes.viridis(len(report['Details'][node_id]['workloads']))\n", + " \n", + " for index, gpu_id2 in enumerate(report['Details'][node_id]['workloads']):\n", + " probs = report['Details'][node_id]['workloads'][gpu_id2]\n", + " plot.quad( top=probs,\n", + " bottom=0,\n", + " left=np.arange(0,98,2),\n", + " right=np.arange(2,100,2),\n", + " line_color=\"white\",\n", + " fill_color=colors[index],\n", + " fill_alpha=0.8,\n", + " legend=gpu_id2 )\n", + "\n", + " plot.y_range.start = 0\n", + " plot.xaxis.axis_label = f\"\"\"Utilization\"\"\"\n", + " plot.yaxis.axis_label = \"Occurrences\"\n", + " plot.grid.grid_line_color = \"white\"\n", + " plot.legend.click_policy=\"hide\"\n", + " \n", + " paragraph = Paragraph(text=text)\n", + " show(column(paragraph, plot))\n", + " \n", + " if \"distances\" in report['Details'][node_id]:\n", + " text = f\"\"\"The rule identified workload balancing issues on node {node_id} \n", + " where workloads differed by more than threshold {threshold}. 
\n", + " \"\"\"\n", + " for index, gpu_id2 in enumerate(report['Details'][node_id]['distances']):\n", + " for gpu_id1 in report['Details'][node_id]['distances'][gpu_id2]:\n", + " distance = round(report['Details'][node_id]['distances'][gpu_id2][gpu_id1], 2)\n", + " text = f\"\"\"{text} The difference of workload between {gpu_id2} and {gpu_id1} is: {distance}.\"\"\"\n", + "\n", + " paragraph = Paragraph(text=f\"\"\"{text}\"\"\", width=900)\n", + " show(column(paragraph))" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:02.458547Z", + "iopub.status.busy": "2023-04-10T22:56:02.457951Z", + "iopub.status.idle": "2023-04-10T22:56:02.535535Z", + "shell.execute_reply": "2023-04-10T22:56:02.535911Z" + }, + "papermill": { + "duration": 0.12002, + "end_time": "2023-04-10T22:56:02.536048", + "exception": false, + "start_time": "2023-04-10T22:56:02.416028", + "status": "completed" + }, + "scrolled": true, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "### Dataloading analysis\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"9d45ef55-a396-40ef-907c-0c683189a230\":{\"roots\":{\"references\":[{\"attributes\":{\"text\":\"The number of dataloader workers can greatly affect the overall performance \\n of your training job. The rule analyzed the number of dataloading processes that have been running in \\n parallel on the training instance and compares it against the total number of cores. \\n The rule checked if the number of processes is smaller than 70% or larger than \\n 200% the total number of cores. Having too few dataloader workers can slowdown data preprocessing and lead to GPU \\n underutilization. Having too many dataloader workers may hurt the\\n overall performance if you are running other compute intensive tasks on the CPU.\\n The rule analysed 13041 datapoints and triggered 1 times.\",\"width\":900},\"id\":\"2249\",\"type\":\"Paragraph\"}],\"root_ids\":[\"2249\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"9d45ef55-a396-40ef-907c-0c683189a230\",\"root_ids\":[\"2249\"],\"roots\":{\"2249\":\"72274a2b-55c8-464f-a7f6-0767179a6ae3\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + 
"application/vnd.bokehjs_exec.v0+json": { + "id": "2249" + } + }, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"28f73abe-dfab-4bc0-a5f8-31a58ea57824\":{\"roots\":{\"references\":[{\"attributes\":{\"text\":\" Your training instance provided 4 CPU cores, however your training job only \\n ran on average 2 dataloader workers in parallel. We recommend you to increase the number of\\n dataloader workers. Using pinned memory also improves performance because it enables fast data transfer to CUDA-enabled GPUs.\\n The rule detected that your training job was not using pinned memory. \\n In case of using PyTorch Dataloader, you can enable this by setting pin_memory=True.\",\"width\":900},\"id\":\"2329\",\"type\":\"Paragraph\"}],\"root_ids\":[\"2329\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"28f73abe-dfab-4bc0-a5f8-31a58ea57824\",\"root_ids\":[\"2329\"],\"roots\":{\"2329\":\"6e3f7e2e-3ad7-4633-9be6-f9cac4693923\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "2329" + } + }, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"92c64635-717a-41be-b343-91f45a2294cd\":{\"roots\":{\"references\":[{\"attributes\":{\"children\":[{\"id\":\"2450\"},{\"id\":\"2409\"}]},\"id\":\"2451\",\"type\":\"Column\"},{\"attributes\":{\"start\":0},\"id\":\"2412\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"2416\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2414\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2419\",\"type\":\"BasicTicker\"},{\"attributes\":{\"data_source\":{\"id\":\"2435\"},\"glyph\":{\"id\":\"2436\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2437\"},\"selection_glyph\":null,\"view\":{\"id\":\"2439\"}},\"id\":\"2438\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"axis\":{\"id\":\"2418\"},\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2421\",\"type\":\"Grid\"},{\"attributes\":{\"text\":\"\"},\"id\":\"2440\",\"type\":\"Title\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#440154\"},\"left\":{\"field\":\"left\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2437\",\"type\":\"Quad\"},{\"attributes\":{\"axis_label\":\"Occurrences\",\"formatter\":{\"id\":\"2442\"},\"ticker\":{\"id\":\"2423\"}},\"id\":\"2422\",\"type\":\"LinearAxis\"},{\"attributes\":{\"axis\":{\"id\":\"2422\"},\"dimension\":1,\"grid_line_color\":\"white\",\"ticker\":null},\"id\":\"2425\",\"type\":\"Grid\"},{\"attributes\":{\"source\":{\"id\":\"2435\"}},\"id\":\"2439\",\"type\":\"CDSView\"},{\"attributes\":{\"data\":{\"left\":[0.016137,0.018631309999999998,0.021125619999999998,0.023619929999999997,0.026114239999999997,0.028608549999999996,0.03110286,0.033597169999999996,0.036091479999999995,0.038585789999999995
,0.041080099999999994,0.043574409999999994,0.04606872,0.04856303,0.05105734,0.05355165,0.05604596,0.05854027,0.06103458,0.06352889,0.0660232,0.06851751,0.07101182,0.07350613,0.07600044,0.07849475,0.08098906,0.08348337,0.08597768,0.08847199,0.0909663,0.09346061,0.09595492,0.09844923,0.10094354,0.10343785,0.10593216,0.10842647,0.11092078,0.11341509,0.1159094,0.11840371,0.12089802,0.12339233,0.12588664,0.12838095,0.13087526,0.13336957,0.13586388,0.13835819,0.1408525,0.14334681,0.14584112,0.14833543,0.15082974,0.15332405,0.15581836,0.15831267,0.16080698,0.16330129,0.1657956,0.16828991,0.17078422,0.17327853,0.17577283999999999,0.17826714999999999,0.18076145999999998,0.18325576999999998,0.18575007999999998,0.18824438999999998,0.19073869999999998,0.19323300999999998,0.19572731999999998,0.19822162999999998,0.20071593999999998,0.20321024999999998,0.20570455999999998,0.20819886999999998,0.21069317999999998,0.21318748999999998,0.21568179999999998,0.21817610999999998,0.22067041999999998,0.22316472999999998,0.22565903999999998,0.22815334999999998,0.23064765999999998,0.23314196999999998,0.23563627999999998,0.23813058999999998,0.24062489999999997,0.24311920999999997,0.24561351999999997,0.24810782999999997,0.25060214,0.25309645,0.25559076000000003,0.25808507,0.26057938,0.26307369],\"right\":[0.018631309999999998,0.021125619999999998,0.023619929999999997,0.026114239999999997,0.028608549999999996,0.03110286,0.033597169999999996,0.036091479999999995,0.038585789999999995,0.041080099999999994,0.043574409999999994,0.04606872,0.04856303,0.05105734,0.05355165,0.05604596,0.05854027,0.06103458,0.06352889,0.0660232,0.06851751,0.07101182,0.07350613,0.07600044,0.07849475,0.08098906,0.08348337,0.08597768,0.08847199,0.0909663,0.09346061,0.09595492,0.09844923,0.10094354,0.10343785,0.10593216,0.10842647,0.11092078,0.11341509,0.1159094,0.11840371,0.12089802,0.12339233,0.12588664,0.12838095,0.13087526,0.13336957,0.13586388,0.13835819,0.1408525,0.14334681,0.14584112,0.14833543,0.15082974,0.15332405,0.
15581836,0.15831267,0.16080698,0.16330129,0.1657956,0.16828991,0.17078422,0.17327853,0.17577283999999999,0.17826714999999999,0.18076145999999998,0.18325576999999998,0.18575007999999998,0.18824438999999998,0.19073869999999998,0.19323300999999998,0.19572731999999998,0.19822162999999998,0.20071593999999998,0.20321024999999998,0.20570455999999998,0.20819886999999998,0.21069317999999998,0.21318748999999998,0.21568179999999998,0.21817610999999998,0.22067041999999998,0.22316472999999998,0.22565903999999998,0.22815334999999998,0.23064765999999998,0.23314196999999998,0.23563627999999998,0.23813058999999998,0.24062489999999997,0.24311920999999997,0.24561351999999997,0.24810782999999997,0.25060214,0.25309645,0.25559076000000003,0.25808507,0.26057938,0.26307369,0.265568],\"top\":[16,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,19,45,50,61,107,153,209,258,284,361,439,523,613,667,780,844,846,900,845,786,776,697,567,498,383,312,255,190,146,115,64,57,47,23,21,12,7,10,8,2,5,3,2,4,1,1,4,1,1,1,2,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1]},\"selected\":{\"id\":\"2446\"},\"selection_policy\":{\"id\":\"2447\"}},\"id\":\"2435\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"bottom\":{\"value\":0},\"fill_alpha\":{\"value\":0.8},\"fill_color\":{\"value\":\"#440154\"},\"left\":{\"field\":\"left\"},\"line_color\":{\"value\":\"white\"},\"right\":{\"field\":\"right\"},\"top\":{\"field\":\"top\"}},\"id\":\"2436\",\"type\":\"Quad\"},{\"attributes\":{\"below\":[{\"id\":\"2418\"}],\"center\":[{\"id\":\"2421\"},{\"id\":\"2425\"},{\"id\":\"2448\"}],\"left\":[{\"id\":\"2422\"}],\"plot_height\":450,\"plot_width\":850,\"renderers\":[{\"id\":\"2438\"}],\"title\":{\"id\":\"2440\"},\"toolbar\":{\"id\":\"2430\"},\"x_range\":{\"id\":\"2410\"},\"x_scale\":{\"id\":\"2414\"},\"y_range\":{\"id\":\"2412\"},\"y_scale\":{\"id\":\"2416\"}},\"id\":\"2409\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"text\":\"The following histogram shows the distribution of dataloading times that have 
been measured throughout your training job. The median dataloading time was 0.1041s. \\n The 95th percentile was 0.1293s and the 25th percentile was 0.0939s\",\"width\":900},\"id\":\"2450\",\"type\":\"Paragraph\"},{\"attributes\":{},\"id\":\"2442\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"2427\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"callback\":null},\"id\":\"2426\",\"type\":\"HoverTool\"},{\"attributes\":{},\"id\":\"2428\",\"type\":\"ResetTool\"},{\"attributes\":{\"end\":0.265568,\"start\":0.016137},\"id\":\"2410\",\"type\":\"Range1d\"},{\"attributes\":{\"label\":{\"value\":\"Dataloading events\"},\"renderers\":[{\"id\":\"2438\"}]},\"id\":\"2449\",\"type\":\"LegendItem\"},{\"attributes\":{},\"id\":\"2429\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"2444\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"2423\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"2446\",\"type\":\"Selection\"},{\"attributes\":{\"axis_label\":\"Dataloading in [s]\",\"formatter\":{\"id\":\"2444\"},\"ticker\":{\"id\":\"2419\"}},\"id\":\"2418\",\"type\":\"LinearAxis\"},{\"attributes\":{\"click_policy\":\"hide\",\"items\":[{\"id\":\"2449\"}]},\"id\":\"2448\",\"type\":\"Legend\"},{\"attributes\":{},\"id\":\"2447\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2426\"},{\"id\":\"2427\"},{\"id\":\"2428\"},{\"id\":\"2429\"}]},\"id\":\"2430\",\"type\":\"Toolbar\"}],\"root_ids\":[\"2451\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"92c64635-717a-41be-b343-91f45a2294cd\",\"root_ids\":[\"2451\"],\"roots\":{\"2451\":\"537e7850-a802-48e2-b545-1df757f9245b\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var 
attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "2451" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "if analyse_phase == \"training\":\n", + " display(Markdown(\"\"\"### Dataloading analysis\\n\\n\"\"\"))\n", + " report = load_report('Dataloader')\n", + " if report:\n", + " params = report['RuleParameters'].split(\"\\n\")\n", + " min_threshold = params[0].split(':')[1]\n", + " max_threshold = params[1].split(':')[1]\n", + " triggered = report['RuleTriggered']\n", + " datapoints = report['Datapoints']\n", + " \n", + " text=f\"\"\"The number of dataloader workers can greatly affect the overall performance \n", + " of your training job. The rule analyzed the number of dataloading processes that have been running in \n", + " parallel on the training instance and compares it against the total number of cores. \n", + " The rule checked if the number of processes is smaller than {min_threshold}% or larger than \n", + " {max_threshold}% the total number of cores. Having too few dataloader workers can slowdown data preprocessing and lead to GPU \n", + " underutilization. 
Having too many dataloader workers may hurt the\n", + " overall performance if you are running other compute intensive tasks on the CPU.\n", + " The rule analysed {datapoints} datapoints and triggered {triggered} times.\"\"\"\n", + " \n", + " paragraph = Paragraph(text=f\"{text}\", width=900)\n", + " show(paragraph)\n", + " text = \"\"\n", + " if 'cores' in report['Details']:\n", + " cores = int(report['Details']['cores'])\n", + " dataloaders = report['Details']['dataloaders']\n", + " if dataloaders < cores: \n", + " text=f\"\"\"{text} Your training instance provided {cores} CPU cores, however your training job only \n", + " ran on average {dataloaders} dataloader workers in parallel. We recommend you to increase the number of\n", + " dataloader workers.\"\"\"\n", + " if dataloaders > cores:\n", + " text=f\"\"\"{text} Your training instance provided {cores} CPU cores, however your training job ran \n", + " on average {dataloaders} dataloader workers. We recommed you to decrease the number of dataloader\n", + " workers.\"\"\"\n", + " if 'pin_memory' in report['Details'] and report['Details']['pin_memory'] == False:\n", + " text=f\"\"\"{text} Using pinned memory also improves performance because it enables fast data transfer to CUDA-enabled GPUs.\n", + " The rule detected that your training job was not using pinned memory. \n", + " In case of using PyTorch Dataloader, you can enable this by setting pin_memory=True.\"\"\"\n", + " \n", + " if 'prefetch' in report['Details'] and report['Details']['prefetch'] == False:\n", + " text=f\"\"\"{text} It appears that your training job did not perform any data pre-fetching. 
Pre-fetching can improve your\n", + " data input pipeline as it produces the data ahead of time.\"\"\"\n", + " paragraph = Paragraph(text=f\"{text}\", width=900)\n", + " show(paragraph)\n", + " \n", + " colors=bokeh.palettes.viridis(10)\n", + " if \"dataloading_time\" in report['Details']:\n", + " median = round(report['Details'][\"dataloading_time\"]['p50'],4)\n", + " p95 = round(report['Details'][\"dataloading_time\"]['p95'],4)\n", + " p25 = round(report['Details'][\"dataloading_time\"]['p25'],4)\n", + " binedges = report['Details'][\"dataloading_time\"]['binedges']\n", + " probs = report['Details'][\"dataloading_time\"]['probs']\n", + " text=f\"\"\"The following histogram shows the distribution of dataloading times that have been measured throughout your training job. The median dataloading time was {median}s. \n", + " The 95th percentile was {p95}s and the 25th percentile was {p25}s\"\"\"\n", + "\n", + " plot = figure(plot_height=450, \n", + " plot_width=850,\n", + " toolbar_location='right',\n", + " tools=\"hover,wheel_zoom,reset,pan\",\n", + " x_range=(binedges[0], binedges[-1])\n", + " )\n", + " \n", + " plot.quad( top=probs,\n", + " bottom=0,\n", + " left=binedges[:-1],\n", + " right=binedges[1:],\n", + " line_color=\"white\",\n", + " fill_color=colors[0],\n", + " fill_alpha=0.8,\n", + " legend=\"Dataloading events\" )\n", + "\n", + " plot.y_range.start = 0\n", + " plot.xaxis.axis_label = f\"\"\"Dataloading in [s]\"\"\"\n", + " plot.yaxis.axis_label = \"Occurrences\"\n", + " plot.grid.grid_line_color = \"white\"\n", + " plot.legend.click_policy=\"hide\"\n", + "\n", + " paragraph = Paragraph(text=f\"{text}\", width=900)\n", + " show(column(paragraph, plot))" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:02.613417Z", + "iopub.status.busy": "2023-04-10T22:56:02.612267Z", + "iopub.status.idle": "2023-04-10T22:56:02.863938Z", + "shell.execute_reply": 
"2023-04-10T22:56:02.864317Z" + }, + "papermill": { + "duration": 0.296817, + "end_time": "2023-04-10T22:56:02.864458", + "exception": false, + "start_time": "2023-04-10T22:56:02.567641", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + " ### Batch size" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"832b00c2-9495-4f5f-b39a-cc2e623f9b07\":{\"roots\":{\"references\":[{\"attributes\":{\"text\":\"The BatchSize rule helps to detect if GPU is underutilized because of the batch size being \\n too small. To detect this the rule analyzes the GPU memory footprint, CPU and GPU utilization. The rule checked if the 95th percentile of CPU utilization is below cpu_threshold_p95 of \\n 70%, the 95th percentile of GPU utilization is below gpu_threshold_p95 of 70% and the 95th percentile of memory footprint below gpu_memory_threshold_p95 of 70%. In your training job this happened 28 times. The rule skipped the first 1000 datapoints. The rule computed the percentiles over window size of 500 continuous datapoints.\\n\\n The rule analysed 2659 datapoints and triggered 28 times.\\n \",\"width\":800},\"id\":\"2579\",\"type\":\"Paragraph\"}],\"root_ids\":[\"2579\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"832b00c2-9495-4f5f-b39a-cc2e623f9b07\",\"root_ids\":[\"2579\"],\"roots\":{\"2579\":\"7f23a11d-4f9e-4271-9d55-1ff80622b963\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + 
"application/vnd.bokehjs_exec.v0+json": { + "id": "2579" + } + }, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"af7d4e13-e717-4c22-86c6-092057325b5b\":{\"roots\":{\"references\":[{\"attributes\":{\"text\":\"Your training job is underutilizing the instance. You may want to consider\\n either switch to a smaller instance type or to increase the batch size. \\n The last time the BatchSize rule triggered in your training job was on 04/10/2023 at 22:32:00.\\n The following boxplots are a snapshot from the timestamps. They the total \\n CPU utilization, the GPU utilization, and the GPU memory usage per GPU (without outliers).\",\"width\":800},\"id\":\"2667\",\"type\":\"Paragraph\"}],\"root_ids\":[\"2667\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"af7d4e13-e717-4c22-86c6-092057325b5b\",\"root_ids\":[\"2667\"],\"roots\":{\"2667\":\"cef63a93-079c-473a-a941-5647f67287e4\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "2667" + } + }, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"052de5d2-daf2-42ba-b110-65e9a124e0cb\":{\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"2766\"}],\"center\":[{\"id\":\"2769\"},{\"id\":\"2773\"}],\"left\":[{\"id\":\"2770\"}],\"plot_height\":350,\"plot_width\":1000,\"renderers\":[{\"id\":\"2786\"},{\"id\":\"2791\"},{\"id\":\"2796\"},{\"id\":\"2801\"},{\"id\":\"2806\"},{\"id\":\"2811\"},{\"id\":\"2816\"},{\"id\":\"2821\"},{\"id\":\"2826\"},{\"id\":\"2831\"},{\"id\":\"2836\"},{\"id\":\"2841\"},{\"id\":\"2846\"},{\"id\":\"2851\"},{\"id\":\"2856\"},{\"id\":\"2861\"},{\"id\":\"2866\"},{\"id\":\"2871\"}],\"title\":{\"id\":\"2756\"},\"toolbar\":{\"id\":\"2778\"},\"x_range\":{\"id\":\"2758\"},\"x_scale\":{\"id\":\"2762\"},\"y_range\":{\"id\":\"2760\"},\"y_scale\":{\"id\":\"2764\"}},\"id\":\"2755\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"source\":{\"id\":\"2838\"}},\"id\":\"2842\",\"type\":\"CDSView\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":3},\"x1\":{\"value\":3},\"y0\":{\"value\":28.0},\"y1\":{\"value\":28.0}},\"id\":\"2845\",\"type\":\"Segment\"},{\"attributes\":{\"x0\":{\"value\":3},\"x1\":{\"value\":3},\"y0\":{\"value\":28.0},\"y1\":{\"value\":28.0}},\"id\":\"2844\",\"type\":\"Segment\"},{\"attributes\":{},\"id\":\"2969\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2990\"},\"selection_policy\":{\"id\":\"2991\"}},\"id\":\"2843\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"2967\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"x0\":{\"value\":3},\"x1\":{\"value\":3},\"y0\":{\"value\":20.0},\"y1\":{\"value\":20.0}},\"id\":\"2849\",\"type\":\"Segment\"},{\"attributes\":{},\"id\":\"2966\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"2968\",\"type\":\"Selection\"},{\"attributes\":{\"source\":{\"id
\":\"2843\"}},\"id\":\"2847\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"2971\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"2970\",\"type\":\"Selection\"},{\"attributes\":{\"data_source\":{\"id\":\"2843\"},\"glyph\":{\"id\":\"2844\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2845\"},\"selection_glyph\":null,\"view\":{\"id\":\"2847\"}},\"id\":\"2846\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2992\"},\"selection_policy\":{\"id\":\"2993\"}},\"id\":\"2848\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"ticks\":[0,1,2,3]},\"id\":\"2873\",\"type\":\"FixedTicker\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":3},\"x1\":{\"value\":3},\"y0\":{\"value\":20.0},\"y1\":{\"value\":20.0}},\"id\":\"2850\",\"type\":\"Segment\"},{\"attributes\":{},\"id\":\"2972\",\"type\":\"Selection\"},{\"attributes\":{\"bottom\":{\"value\":28.0},\"fill_color\":{\"value\":\"#FDE725\"},\"top\":{\"value\":21.0},\"width\":{\"value\":0.7},\"x\":{\"value\":3}},\"id\":\"2854\",\"type\":\"VBar\"},{\"attributes\":{},\"id\":\"2973\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"2974\",\"type\":\"Selection\"},{\"attributes\":{\"source\":{\"id\":\"2848\"}},\"id\":\"2852\",\"type\":\"CDSView\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2996\"},\"selection_policy\":{\"id\":\"2997\"}},\"id\":\"2858\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"2975\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data_source\":{\"id\":\"2848\"},\"glyph\":{\"id\":\"2849\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2850\"},\"selection_glyph\":null,\"view\":{\"id\":\"2852\"}},\"id\":\"2851\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"2976\",\"type\":\"Selection\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2994\"},\"selection_policy\":{\"id\":\"2995\"}},\"id\":\"2853\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"
2977\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"source\":{\"id\":\"2853\"}},\"id\":\"2857\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"2978\",\"type\":\"Selection\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2998\"},\"selection_policy\":{\"id\":\"2999\"}},\"id\":\"2863\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"2979\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data_source\":{\"id\":\"2853\"},\"glyph\":{\"id\":\"2854\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2855\"},\"selection_glyph\":null,\"view\":{\"id\":\"2857\"}},\"id\":\"2856\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"bottom\":{\"value\":21.0},\"fill_color\":{\"value\":\"#440154\"},\"top\":{\"value\":20.0},\"width\":{\"value\":0.7},\"x\":{\"value\":3}},\"id\":\"2859\",\"type\":\"VBar\"},{\"attributes\":{},\"id\":\"2980\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"2981\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"bottom\":{\"value\":28.0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#FDE725\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":21.0},\"width\":{\"value\":0.7},\"x\":{\"value\":3}},\"id\":\"2855\",\"type\":\"VBar\"},{\"attributes\":{},\"id\":\"2982\",\"type\":\"Selection\"},{\"attributes\":{\"source\":{\"id\":\"2858\"}},\"id\":\"2862\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"2983\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"2984\",\"type\":\"Selection\"},{\"attributes\":{\"data_source\":{\"id\":\"2858\"},\"glyph\":{\"id\":\"2859\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2860\"},\"selection_glyph\":null,\"view\":{\"id\":\"2862\"}},\"id\":\"2861\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":3},\"y\":{\"value\":20.0}},\"id\":\"2864\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"29
85\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"2986\",\"type\":\"Selection\"},{\"attributes\":{\"bottom\":{\"value\":21.0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#440154\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":20.0},\"width\":{\"value\":0.7},\"x\":{\"value\":3}},\"id\":\"2860\",\"type\":\"VBar\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3000\"},\"selection_policy\":{\"id\":\"3001\"}},\"id\":\"2868\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"2987\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"2965\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"source\":{\"id\":\"2863\"}},\"id\":\"2867\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"2988\",\"type\":\"Selection\"},{\"attributes\":{\"text\":\"Node algo-1\"},\"id\":\"2756\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"2989\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":3},\"y\":{\"value\":28.0}},\"id\":\"2869\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"2990\",\"type\":\"Selection\"},{\"attributes\":{\"data_source\":{\"id\":\"2863\"},\"glyph\":{\"id\":\"2864\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2865\"},\"selection_glyph\":null,\"view\":{\"id\":\"2867\"}},\"id\":\"2866\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"2991\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":3},\"y\":{\"value\":20.0}},\"id\":\"2865\",\"type\":\"Rect\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"da
ta\",\"value\":0.2},\"x\":{\"value\":3},\"y\":{\"value\":28.0}},\"id\":\"2870\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"2992\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"2993\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"2994\",\"type\":\"Selection\"},{\"attributes\":{\"source\":{\"id\":\"2868\"}},\"id\":\"2872\",\"type\":\"CDSView\"},{\"attributes\":{\"formatter\":{\"id\":\"2965\"},\"major_label_overrides\":{\"1\":\"cpu\",\"2\":\"gpu0\",\"3\":\"gpu0_memory\"},\"major_label_text_font_size\":\"10px\",\"ticker\":{\"id\":\"2873\"}},\"id\":\"2766\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"2995\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"2771\",\"type\":\"BasicTicker\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":46.10718749999998}},\"id\":\"2810\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"2996\",\"type\":\"Selection\"},{\"attributes\":{\"axis\":{\"id\":\"2770\"},\"dimension\":1,\"grid_line_color\":\"white\",\"grid_line_width\":0,\"ticker\":null},\"id\":\"2773\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"2997\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"2998\",\"type\":\"Selection\"},{\"attributes\":{\"data_source\":{\"id\":\"2808\"},\"glyph\":{\"id\":\"2809\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2810\"},\"selection_glyph\":null,\"view\":{\"id\":\"2812\"}},\"id\":\"2811\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2966\"},\"selection_policy\":{\"id\":\"2967\"}},\"id\":\"2783\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"2999\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"source\":{\"id\":\"2808\"}},\"id\":\"2812\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"3000\",\"type\":\"Selection\"},{\"attri
butes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":2},\"x1\":{\"value\":2},\"y0\":{\"value\":53.0},\"y1\":{\"value\":38.0}},\"id\":\"2815\",\"type\":\"Segment\"},{\"attributes\":{\"callback\":null},\"id\":\"2774\",\"type\":\"HoverTool\"},{\"attributes\":{\"x0\":{\"value\":2},\"x1\":{\"value\":2},\"y0\":{\"value\":13.0},\"y1\":{\"value\":28.0}},\"id\":\"2819\",\"type\":\"Segment\"},{\"attributes\":{},\"id\":\"3001\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2978\"},\"selection_policy\":{\"id\":\"2979\"}},\"id\":\"2813\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"2775\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"data_source\":{\"id\":\"2783\"},\"glyph\":{\"id\":\"2784\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2785\"},\"selection_glyph\":null,\"view\":{\"id\":\"2787\"}},\"id\":\"2786\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"axis_label\":\"Utilization in %\",\"formatter\":{\"id\":\"2963\"},\"ticker\":{\"id\":\"2771\"}},\"id\":\"2770\",\"type\":\"LinearAxis\"},{\"attributes\":{\"axis\":{\"id\":\"2766\"},\"grid_line_color\":null,\"grid_line_width\":0,\"ticker\":null},\"id\":\"2769\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"2776\",\"type\":\"ResetTool\"},{\"attributes\":{\"source\":{\"id\":\"2813\"}},\"id\":\"2817\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"2777\",\"type\":\"PanTool\"},{\"attributes\":{\"data_source\":{\"id\":\"2813\"},\"glyph\":{\"id\":\"2814\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2815\"},\"selection_glyph\":null,\"view\":{\"id\":\"2817\"}},\"id\":\"2816\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"source\":{\"id\":\"2783\"}},\"id\":\"2787\",\"type\":\"CDSView\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2980\"},\"selection_policy\":{\"id\":\"2981\"}},\"id\":\"2818\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"bottom\":{\"value\":41.439375},\"fill_color\":{\"value\":\"#FDE725
\"},\"top\":{\"value\":39.379999999999995},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"2794\",\"type\":\"VBar\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":2},\"x1\":{\"value\":2},\"y0\":{\"value\":13.0},\"y1\":{\"value\":28.0}},\"id\":\"2820\",\"type\":\"Segment\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2970\"},\"selection_policy\":{\"id\":\"2971\"}},\"id\":\"2793\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"bottom\":{\"value\":38.0},\"fill_color\":{\"value\":\"#FDE725\"},\"top\":{\"value\":28.0},\"width\":{\"value\":0.7},\"x\":{\"value\":2}},\"id\":\"2824\",\"type\":\"VBar\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":33.65968750000002},\"y1\":{\"value\":38.32750000000001}},\"id\":\"2790\",\"type\":\"Segment\"},{\"attributes\":{},\"id\":\"2760\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"2762\",\"type\":\"LinearScale\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":46.10718749999998},\"y1\":{\"value\":41.439375}},\"id\":\"2785\",\"type\":\"Segment\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2972\"},\"selection_policy\":{\"id\":\"2973\"}},\"id\":\"2798\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"source\":{\"id\":\"2818\"}},\"id\":\"2822\",\"type\":\"CDSView\"},{\"attributes\":{\"data_source\":{\"id\":\"2818\"},\"glyph\":{\"id\":\"2819\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2820\"},\"selection_glyph\":null,\"view\":{\"id\":\"2822\"}},\"id\":\"2821\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2982\"},\"selection_policy\":{\"id\":\"2983\"}},\"id\":\"2823\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"data_source\":{\"id\":\"2788\"},\"glyph\":{\"id\":\"2789\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2790\"},\"selection_glyph\":null,\"view\":{\"id\
":\"2792\"}},\"id\":\"2791\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2968\"},\"selection_policy\":{\"id\":\"2969\"}},\"id\":\"2788\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"source\":{\"id\":\"2823\"}},\"id\":\"2827\",\"type\":\"CDSView\"},{\"attributes\":{\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":33.65968750000002},\"y1\":{\"value\":38.32750000000001}},\"id\":\"2789\",\"type\":\"Segment\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2986\"},\"selection_policy\":{\"id\":\"2987\"}},\"id\":\"2833\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"data_source\":{\"id\":\"2793\"},\"glyph\":{\"id\":\"2794\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2795\"},\"selection_glyph\":null,\"view\":{\"id\":\"2797\"}},\"id\":\"2796\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"2764\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"2963\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"data_source\":{\"id\":\"2823\"},\"glyph\":{\"id\":\"2824\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2825\"},\"selection_glyph\":null,\"view\":{\"id\":\"2827\"}},\"id\":\"2826\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"source\":{\"id\":\"2788\"}},\"id\":\"2792\",\"type\":\"CDSView\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"2774\"},{\"id\":\"2775\"},{\"id\":\"2776\"},{\"id\":\"2777\"}]},\"id\":\"2778\",\"type\":\"Toolbar\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2984\"},\"selection_policy\":{\"id\":\"2985\"}},\"id\":\"2828\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"end\":20},\"id\":\"2758\",\"type\":\"Range1d\"},{\"attributes\":{\"bottom\":{\"value\":39.379999999999995},\"fill_color\":{\"value\":\"#440154\"},\"top\":{\"value\":38.32750000000001},\"width\":{\"value\":0.7},\"x\":{\"value\":1
}},\"id\":\"2799\",\"type\":\"VBar\"},{\"attributes\":{\"bottom\":{\"value\":38.0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#FDE725\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":28.0},\"width\":{\"value\":0.7},\"x\":{\"value\":2}},\"id\":\"2825\",\"type\":\"VBar\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2974\"},\"selection_policy\":{\"id\":\"2975\"}},\"id\":\"2803\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":46.10718749999998},\"y1\":{\"value\":41.439375}},\"id\":\"2784\",\"type\":\"Segment\"},{\"attributes\":{\"source\":{\"id\":\"2793\"}},\"id\":\"2797\",\"type\":\"CDSView\"},{\"attributes\":{\"source\":{\"id\":\"2828\"}},\"id\":\"2832\",\"type\":\"CDSView\"},{\"attributes\":{\"bottom\":{\"value\":41.439375},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#FDE725\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":39.379999999999995},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"2795\",\"type\":\"VBar\"},{\"attributes\":{\"data_source\":{\"id\":\"2828\"},\"glyph\":{\"id\":\"2829\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2830\"},\"selection_glyph\":null,\"view\":{\"id\":\"2832\"}},\"id\":\"2831\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"source\":{\"id\":\"2798\"}},\"id\":\"2802\",\"type\":\"CDSView\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":2},\"y\":{\"value\":13.0}},\"id\":\"2834\",\"type\":\"Rect\"},{\"attributes\":{\"data_source\":{\"id\":\"2798\"},\"glyph\":{\"id\":\"2799\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2800\"},\"selection_glyph\":null,\"view\":{\"id\":\"2802\"}},\"id\":\"2801\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"bottom\":{\"value\":28.0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#440154\"},\"line_alph
a\":{\"value\":0.1},\"top\":{\"value\":28.0},\"width\":{\"value\":0.7},\"x\":{\"value\":2}},\"id\":\"2830\",\"type\":\"VBar\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":33.65968750000002}},\"id\":\"2804\",\"type\":\"Rect\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2988\"},\"selection_policy\":{\"id\":\"2989\"}},\"id\":\"2838\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"source\":{\"id\":\"2833\"}},\"id\":\"2837\",\"type\":\"CDSView\"},{\"attributes\":{\"bottom\":{\"value\":39.379999999999995},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#440154\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":38.32750000000001},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"2800\",\"type\":\"VBar\"},{\"attributes\":{\"source\":{\"id\":\"2803\"}},\"id\":\"2807\",\"type\":\"CDSView\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":2},\"y\":{\"value\":53.0}},\"id\":\"2839\",\"type\":\"Rect\"},{\"attributes\":{\"data_source\":{\"id\":\"2833\"},\"glyph\":{\"id\":\"2834\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2835\"},\"selection_glyph\":null,\"view\":{\"id\":\"2837\"}},\"id\":\"2836\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"x0\":{\"value\":2},\"x1\":{\"value\":2},\"y0\":{\"value\":53.0},\"y1\":{\"value\":38.0}},\"id\":\"2814\",\"type\":\"Segment\"},{\"attributes\":{\"bottom\":{\"value\":28.0},\"fill_color\":{\"value\":\"#440154\"},\"top\":{\"value\":28.0},\"width\":{\"value\":0.7},\"x\":{\"value\":2}},\"id\":\"2829\",\"type\":\"VBar\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"2976\"},\"selection_policy\":{\"id\":\"2977\"}},\"id\":\"2808\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"
value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":2},\"y\":{\"value\":13.0}},\"id\":\"2835\",\"type\":\"Rect\"},{\"attributes\":{\"data_source\":{\"id\":\"2868\"},\"glyph\":{\"id\":\"2869\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2870\"},\"selection_glyph\":null,\"view\":{\"id\":\"2872\"}},\"id\":\"2871\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":46.10718749999998}},\"id\":\"2809\",\"type\":\"Rect\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":2},\"y\":{\"value\":53.0}},\"id\":\"2840\",\"type\":\"Rect\"},{\"attributes\":{\"data_source\":{\"id\":\"2803\"},\"glyph\":{\"id\":\"2804\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2805\"},\"selection_glyph\":null,\"view\":{\"id\":\"2807\"}},\"id\":\"2806\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data_source\":{\"id\":\"2838\"},\"glyph\":{\"id\":\"2839\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"2840\"},\"selection_glyph\":null,\"view\":{\"id\":\"2842\"}},\"id\":\"2841\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":33.65968750000002}},\"id\":\"2805\",\"type\":\"Rect\"}],\"root_ids\":[\"2755\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = 
[{\"docid\":\"052de5d2-daf2-42ba-b110-65e9a124e0cb\",\"root_ids\":[\"2755\"],\"roots\":{\"2755\":\"b2f4c494-9b72-4ae4-be49-79f53fee1418\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "2755" + } + }, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"9d32758d-420f-415b-be2f-3f1e04e24df2\":{\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"3213\"}],\"center\":[{\"id\":\"3216\"},{\"id\":\"3220\"}],\"left\":[{\"id\":\"3217\"}],\"plot_height\":350,\"plot_width\":1000,\"renderers\":[{\"id\":\"3233\"},{\"id\":\"3238\"},{\"id\":\"3243\"},{\"id\":\"3248\"},{\"id\":\"3253\"},{\"id\":\"3258\"},{\"id\":\"3263\"},{\"id\":\"3268\"},{\"id\":\"3273\"},{\"id\":\"3278\"},{\"id\":\"3283\"},{\"id\":\"3288\"},{\"id\":\"3293\"},{\"id\":\"3298\"},{\"id\":\"3303\"},{\"id\":\"3308\"},{\"id\":\"3313\"},{\"id\":\"3318\"}],\"title\":{\"id\":\"3203\"},\"toolbar\":{\"id\":\"3225\"},\"x_range\":{\"id\":\"3205\"},\"x_scale\":{\"id\":\"3209\"},\"y_range\":{\"id\":\"3207\"},\"y_scale\":{\"id\":\"3211\"}},\"id\":\"3202\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3483\"},\"selection_policy\":{\"id\":\"3484\"}},\"id\":\"3305\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3463\",\"type\":\"Selection\"},{\"attributes\":{\"bottom\":{\"value\":28.0},\"fill_color\":{\"value\":\"#FDE725\"},\"top\":{\"value\":21.0},\"width\":{\"value\":0.7},\"x\":{\"value\":3}},\"id\":\"3301\",\"type\":\"VBar\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3481\"},\"selection_policy\":{\"id\":\"3482\"}},\"id\":\"3300\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3464\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"source\":{\"id\":\"3300\"}},\"id\":\"3304\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"3465\",\"type\":\"Selection\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3485\"},\"selection_policy\":{\"id\":\"3486\"}},\"id\":\"3310\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3467\",\"type\":\"Selection\"},{\"attributes\":{\"dat
a_source\":{\"id\":\"3300\"},\"glyph\":{\"id\":\"3301\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3302\"},\"selection_glyph\":null,\"view\":{\"id\":\"3304\"}},\"id\":\"3303\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"3466\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"bottom\":{\"value\":21.0},\"fill_color\":{\"value\":\"#440154\"},\"top\":{\"value\":20.0},\"width\":{\"value\":0.7},\"x\":{\"value\":3}},\"id\":\"3306\",\"type\":\"VBar\"},{\"attributes\":{},\"id\":\"3468\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"bottom\":{\"value\":28.0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#FDE725\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":21.0},\"width\":{\"value\":0.7},\"x\":{\"value\":3}},\"id\":\"3302\",\"type\":\"VBar\"},{\"attributes\":{},\"id\":\"3469\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"3470\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"source\":{\"id\":\"3305\"}},\"id\":\"3309\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"3471\",\"type\":\"Selection\"},{\"attributes\":{\"data_source\":{\"id\":\"3305\"},\"glyph\":{\"id\":\"3306\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3307\"},\"selection_glyph\":null,\"view\":{\"id\":\"3309\"}},\"id\":\"3308\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"3478\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":3},\"y\":{\"value\":20.0}},\"id\":\"3311\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"3473\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"3472\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"bottom\":{\"value\":21.0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#440154\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":20.0},\"width\":{\"value\":0.7},\"x\":{\"value\":3}},\"id\":\"3307\",\"type\":\"VBar\"
},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3487\"},\"selection_policy\":{\"id\":\"3488\"}},\"id\":\"3315\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"source\":{\"id\":\"3310\"}},\"id\":\"3314\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"3474\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3477\"},\"selection_policy\":{\"id\":\"3478\"}},\"id\":\"3290\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3475\",\"type\":\"Selection\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":3},\"y\":{\"value\":28.0}},\"id\":\"3316\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"3476\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data_source\":{\"id\":\"3310\"},\"glyph\":{\"id\":\"3311\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3312\"},\"selection_glyph\":null,\"view\":{\"id\":\"3314\"}},\"id\":\"3313\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"3479\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"3477\",\"type\":\"Selection\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":3},\"y\":{\"value\":20.0}},\"id\":\"3312\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"3480\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"source\":{\"id\":\"3315\"}},\"id\":\"3319\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"3482\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"3452\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"3481\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"3457\",\"type\":\"Selection\"},{\"attributes\":{\"data_source\":{\"id\":\"3315\"},\"glyph\":{\"id\":\"3316\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyp
h\":{\"id\":\"3317\"},\"selection_glyph\":null,\"view\":{\"id\":\"3319\"}},\"id\":\"3318\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"3483\",\"type\":\"Selection\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":3},\"y\":{\"value\":28.0}},\"id\":\"3317\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"3484\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"3454\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"3455\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"3485\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"3453\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"3488\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"3486\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"3456\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"3487\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"3458\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"3459\",\"type\":\"Selection\"},{\"attributes\":{\"ticks\":[0,1,2,3]},\"id\":\"3320\",\"type\":\"FixedTicker\"},{\"attributes\":{},\"id\":\"3460\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"3461\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"3462\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":3},\"x1\":{\"value\":3},\"y0\":{\"value\":28.0},\"y1\":{\"value\":28.0}},\"id\":\"3292\",\"type\":\"Segment\"},{\"attributes\":{\"data_source\":{\"id\":\"3290\"},\"glyph\":{\"id\":\"3291\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3292\"},\"selection_glyph\":null,\"view\":{\"id\":\"3294\"}},\"id\":\"3293\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":2},\"x1\":{\"value\":2},\"y0\":{\"value\":56.125},\"y1\":{\"value\":39.25}},\"id\":\"
3262\",\"type\":\"Segment\"},{\"attributes\":{\"x0\":{\"value\":2},\"x1\":{\"value\":2},\"y0\":{\"value\":11.125},\"y1\":{\"value\":28.0}},\"id\":\"3266\",\"type\":\"Segment\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3465\"},\"selection_policy\":{\"id\":\"3466\"}},\"id\":\"3260\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"source\":{\"id\":\"3230\"}},\"id\":\"3234\",\"type\":\"CDSView\"},{\"attributes\":{\"data_source\":{\"id\":\"3255\"},\"glyph\":{\"id\":\"3256\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3257\"},\"selection_glyph\":null,\"view\":{\"id\":\"3259\"}},\"id\":\"3258\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"source\":{\"id\":\"3290\"}},\"id\":\"3294\",\"type\":\"CDSView\"},{\"attributes\":{\"source\":{\"id\":\"3260\"}},\"id\":\"3264\",\"type\":\"CDSView\"},{\"attributes\":{\"data_source\":{\"id\":\"3230\"},\"glyph\":{\"id\":\"3231\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3232\"},\"selection_glyph\":null,\"view\":{\"id\":\"3234\"}},\"id\":\"3233\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data_source\":{\"id\":\"3260\"},\"glyph\":{\"id\":\"3261\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3262\"},\"selection_glyph\":null,\"view\":{\"id\":\"3264\"}},\"id\":\"3263\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3467\"},\"selection_policy\":{\"id\":\"3468\"}},\"id\":\"3265\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":32.76437499999998},\"y1\":{\"value\":38.11749999999999}},\"id\":\"3236\",\"type\":\"Segment\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":2},\"x1\":{\"value\":2},\"y0\":{\"value\":11.125},\"y1\":{\"value\":28.0}},\"id\":\"3267\",\"type\":\"Segment\"},{\"attributes\":{\"x0\":{\"value\":3},\"x1\":{\"value\":3},\"y0\":{\"value\":20.0},\"y1\":{\"value\":20.0}},\"id\":\"3296\",\"type\":\"Segment\"},
{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3455\"},\"selection_policy\":{\"id\":\"3456\"}},\"id\":\"3235\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3479\"},\"selection_policy\":{\"id\":\"3480\"}},\"id\":\"3295\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":32.76437499999998},\"y1\":{\"value\":38.11749999999999}},\"id\":\"3237\",\"type\":\"Segment\"},{\"attributes\":{\"bottom\":{\"value\":39.25},\"fill_color\":{\"value\":\"#FDE725\"},\"top\":{\"value\":29.0},\"width\":{\"value\":0.7},\"x\":{\"value\":2}},\"id\":\"3271\",\"type\":\"VBar\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"3221\"},{\"id\":\"3222\"},{\"id\":\"3223\"},{\"id\":\"3224\"}]},\"id\":\"3225\",\"type\":\"Toolbar\"},{\"attributes\":{\"bottom\":{\"value\":41.68625},\"fill_color\":{\"value\":\"#FDE725\"},\"top\":{\"value\":39.2675},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"3241\",\"type\":\"VBar\"},{\"attributes\":{\"x0\":{\"value\":3},\"x1\":{\"value\":3},\"y0\":{\"value\":28.0},\"y1\":{\"value\":28.0}},\"id\":\"3291\",\"type\":\"Segment\"},{\"attributes\":{\"source\":{\"id\":\"3265\"}},\"id\":\"3269\",\"type\":\"CDSView\"},{\"attributes\":{\"source\":{\"id\":\"3235\"}},\"id\":\"3239\",\"type\":\"CDSView\"},{\"attributes\":{\"data_source\":{\"id\":\"3265\"},\"glyph\":{\"id\":\"3266\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3267\"},\"selection_glyph\":null,\"view\":{\"id\":\"3269\"}},\"id\":\"3268\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3469\"},\"selection_policy\":{\"id\":\"3470\"}},\"id\":\"3270\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":3},\"x1\":{\"value\":3},\"y0\":{\"value\":20.0},\"y1\":{\"v
alue\":20.0}},\"id\":\"3297\",\"type\":\"Segment\"},{\"attributes\":{\"data_source\":{\"id\":\"3235\"},\"glyph\":{\"id\":\"3236\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3237\"},\"selection_glyph\":null,\"view\":{\"id\":\"3239\"}},\"id\":\"3238\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3457\"},\"selection_policy\":{\"id\":\"3458\"}},\"id\":\"3240\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3224\",\"type\":\"PanTool\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3461\"},\"selection_policy\":{\"id\":\"3462\"}},\"id\":\"3250\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"source\":{\"id\":\"3270\"}},\"id\":\"3274\",\"type\":\"CDSView\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3473\"},\"selection_policy\":{\"id\":\"3474\"}},\"id\":\"3280\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"3211\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"3450\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"data_source\":{\"id\":\"3270\"},\"glyph\":{\"id\":\"3271\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3272\"},\"selection_glyph\":null,\"view\":{\"id\":\"3274\"}},\"id\":\"3273\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"bottom\":{\"value\":39.2675},\"fill_color\":{\"value\":\"#440154\"},\"top\":{\"value\":38.11749999999999},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"3246\",\"type\":\"VBar\"},{\"attributes\":{},\"id\":\"3223\",\"type\":\"ResetTool\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3471\"},\"selection_policy\":{\"id\":\"3472\"}},\"id\":\"3275\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"data_source\":{\"id\":\"3240\"},\"glyph\":{\"id\":\"3241\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3242\"},\"selection_glyph\":null,\"view\":{\"id\":\"3244\"}},\"id\":\"3243\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"x0\":{\"value\":2},\"x1\":{
\"value\":2},\"y0\":{\"value\":56.125},\"y1\":{\"value\":39.25}},\"id\":\"3261\",\"type\":\"Segment\"},{\"attributes\":{\"source\":{\"id\":\"3240\"}},\"id\":\"3244\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"3222\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"bottom\":{\"value\":39.25},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#FDE725\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":29.0},\"width\":{\"value\":0.7},\"x\":{\"value\":2}},\"id\":\"3272\",\"type\":\"VBar\"},{\"attributes\":{},\"id\":\"3218\",\"type\":\"BasicTicker\"},{\"attributes\":{\"formatter\":{\"id\":\"3452\"},\"major_label_overrides\":{\"1\":\"cpu\",\"2\":\"gpu0\",\"3\":\"gpu0_memory\"},\"major_label_text_font_size\":\"10px\",\"ticker\":{\"id\":\"3320\"}},\"id\":\"3213\",\"type\":\"LinearAxis\"},{\"attributes\":{\"bottom\":{\"value\":41.68625},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#FDE725\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":39.2675},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"3242\",\"type\":\"VBar\"},{\"attributes\":{\"source\":{\"id\":\"3275\"}},\"id\":\"3279\",\"type\":\"CDSView\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3459\"},\"selection_policy\":{\"id\":\"3460\"}},\"id\":\"3245\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"axis_label\":\"Utilization in 
%\",\"formatter\":{\"id\":\"3450\"},\"ticker\":{\"id\":\"3218\"}},\"id\":\"3217\",\"type\":\"LinearAxis\"},{\"attributes\":{\"data_source\":{\"id\":\"3275\"},\"glyph\":{\"id\":\"3276\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3277\"},\"selection_glyph\":null,\"view\":{\"id\":\"3279\"}},\"id\":\"3278\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"axis\":{\"id\":\"3213\"},\"grid_line_color\":null,\"grid_line_width\":0,\"ticker\":null},\"id\":\"3216\",\"type\":\"Grid\"},{\"attributes\":{\"source\":{\"id\":\"3245\"}},\"id\":\"3249\",\"type\":\"CDSView\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":2},\"y\":{\"value\":11.125}},\"id\":\"3281\",\"type\":\"Rect\"},{\"attributes\":{\"data_source\":{\"id\":\"3245\"},\"glyph\":{\"id\":\"3246\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3247\"},\"selection_glyph\":null,\"view\":{\"id\":\"3249\"}},\"id\":\"3248\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"axis\":{\"id\":\"3217\"},\"dimension\":1,\"grid_line_color\":\"white\",\"grid_line_width\":0,\"ticker\":null},\"id\":\"3220\",\"type\":\"Grid\"},{\"attributes\":{\"bottom\":{\"value\":29.0},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#440154\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":28.0},\"width\":{\"value\":0.7},\"x\":{\"value\":2}},\"id\":\"3277\",\"type\":\"VBar\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":32.76437499999998}},\"id\":\"3251\",\"type\":\"Rect\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3475\"},\"selection_policy\":{\"id\":\"3476\"}},\"id\":\"3285\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3453\"},\"selection_policy\":{\"id\":\"3454\"}},\"id\":\"3230\
",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"source\":{\"id\":\"3280\"}},\"id\":\"3284\",\"type\":\"CDSView\"},{\"attributes\":{\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":47.039375000000014},\"y1\":{\"value\":41.68625}},\"id\":\"3231\",\"type\":\"Segment\"},{\"attributes\":{\"bottom\":{\"value\":39.2675},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#440154\"},\"line_alpha\":{\"value\":0.1},\"top\":{\"value\":38.11749999999999},\"width\":{\"value\":0.7},\"x\":{\"value\":1}},\"id\":\"3247\",\"type\":\"VBar\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"x0\":{\"value\":1},\"x1\":{\"value\":1},\"y0\":{\"value\":47.039375000000014},\"y1\":{\"value\":41.68625}},\"id\":\"3232\",\"type\":\"Segment\"},{\"attributes\":{\"data\":{},\"selected\":{\"id\":\"3463\"},\"selection_policy\":{\"id\":\"3464\"}},\"id\":\"3255\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"source\":{\"id\":\"3250\"}},\"id\":\"3254\",\"type\":\"CDSView\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":2},\"y\":{\"value\":56.125}},\"id\":\"3286\",\"type\":\"Rect\"},{\"attributes\":{\"callback\":null},\"id\":\"3221\",\"type\":\"HoverTool\"},{\"attributes\":{\"data_source\":{\"id\":\"3280\"},\"glyph\":{\"id\":\"3281\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3282\"},\"selection_glyph\":null,\"view\":{\"id\":\"3284\"}},\"id\":\"3283\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"bottom\":{\"value\":29.0},\"fill_color\":{\"value\":\"#440154\"},\"top\":{\"value\":28.0},\"width\":{\"value\":0.7},\"x\":{\"value\":2}},\"id\":\"3276\",\"type\":\"VBar\"},{\"attributes\":{\"end\":20},\"id\":\"3205\",\"type\":\"Range1d\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"val
ue\":0.2},\"x\":{\"value\":2},\"y\":{\"value\":11.125}},\"id\":\"3282\",\"type\":\"Rect\"},{\"attributes\":{\"source\":{\"id\":\"3255\"}},\"id\":\"3259\",\"type\":\"CDSView\"},{\"attributes\":{\"data_source\":{\"id\":\"3250\"},\"glyph\":{\"id\":\"3251\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3252\"},\"selection_glyph\":null,\"view\":{\"id\":\"3254\"}},\"id\":\"3253\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"3209\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"3207\",\"type\":\"DataRange1d\"},{\"attributes\":{\"source\":{\"id\":\"3285\"}},\"id\":\"3289\",\"type\":\"CDSView\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":32.76437499999998}},\"id\":\"3252\",\"type\":\"Rect\"},{\"attributes\":{\"source\":{\"id\":\"3295\"}},\"id\":\"3299\",\"type\":\"CDSView\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":47.039375000000014}},\"id\":\"3256\",\"type\":\"Rect\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":1},\"y\":{\"value\":47.039375000000014}},\"id\":\"3257\",\"type\":\"Rect\"},{\"attributes\":{\"data_source\":{\"id\":\"3285\"},\"glyph\":{\"id\":\"3286\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3287\"},\"selection_glyph\":null,\"view\":{\"id\":\"3289\"}},\"id\":\"3288\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"text\":\"Node 
algo-2\"},\"id\":\"3203\",\"type\":\"Title\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"units\":\"data\",\"value\":0.01},\"line_alpha\":{\"value\":0.1},\"width\":{\"units\":\"data\",\"value\":0.2},\"x\":{\"value\":2},\"y\":{\"value\":56.125}},\"id\":\"3287\",\"type\":\"Rect\"},{\"attributes\":{\"data_source\":{\"id\":\"3295\"},\"glyph\":{\"id\":\"3296\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"3297\"},\"selection_glyph\":null,\"view\":{\"id\":\"3299\"}},\"id\":\"3298\",\"type\":\"GlyphRenderer\"}],\"root_ids\":[\"3202\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"9d32758d-420f-415b-be2f-3f1e04e24df2\",\"root_ids\":[\"3202\"],\"roots\":{\"3202\":\"1a41be28-be8d-4cf0-8e1e-639f5bd34be5\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "3202" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "if analyse_phase == \"training\":\n", + " display(Markdown(\"\"\" ### Batch size\"\"\"))\n", + " report = load_report('BatchSize')\n", + " if report:\n", + " params = report['RuleParameters'].split('\\n')\n", + " cpu_threshold_p95 = int(params[0].split(':')[1])\n", + " gpu_threshold_p95 = int(params[1].split(':')[1])\n", + " 
gpu_memory_threshold_p95 = int(params[2].split(':')[1])\n", + " patience = int(params[3].split(':')[1])\n", + " window = int(params[4].split(':')[1])\n", + " violations = report['Violations']\n", + " triggered = report['RuleTriggered']\n", + " datapoints = report['Datapoints']\n", + " \n", + " text = Paragraph(text=f\"\"\"The BatchSize rule helps to detect if GPU is underutilized because of the batch size being \n", + " too small. To detect this the rule analyzes the GPU memory footprint, CPU and GPU utilization. The rule checked if the 95th percentile of CPU utilization is below cpu_threshold_p95 of \n", + " {cpu_threshold_p95}%, the 95th percentile of GPU utilization is below gpu_threshold_p95 of {gpu_threshold_p95}% and the 95th percentile of memory footprint \\\n", + " below gpu_memory_threshold_p95 of {gpu_memory_threshold_p95}%. In your training job this happened {violations} times. \\\n", + " The rule skipped the first {patience} datapoints. The rule computed the percentiles over window size of {window} continuous datapoints.\\n\n", + " The rule analysed {datapoints} datapoints and triggered {triggered} times.\n", + " \"\"\", width=800)\n", + " show(text)\n", + " if len(report['Details']) >0: \n", + " timestamp = us_since_epoch_to_human_readable_time(report['Details']['last_timestamp'])\n", + " date = datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S:%f')\n", + " day = date.date().strftime(\"%m/%d/%Y\")\n", + " hour = date.time().strftime(\"%H:%M:%S\")\n", + " del report['Details']['last_timestamp']\n", + " text = Paragraph(text=f\"\"\"Your training job is underutilizing the instance. You may want to consider\n", + " either switch to a smaller instance type or to increase the batch size. \n", + " The last time the BatchSize rule triggered in your training job was on {day} at {hour}.\n", + " The following boxplots are a snapshot from the timestamps. 
They the total \n", + " CPU utilization, the GPU utilization, and the GPU memory usage per GPU (without outliers).\"\"\", \n", + " width=800)\n", + " show(text)\n", + "\n", + " for node_id in report['Details']:\n", + " xmax = max(20, len(report['Details'][node_id]))\n", + " \n", + " plot = figure(plot_height=350, \n", + " plot_width=1000,\n", + " toolbar_location='right',\n", + " tools=\"hover,wheel_zoom,reset,pan\", \n", + " title=f\"Node {node_id}\",\n", + " x_range=(0,xmax)\n", + " )\n", + " \n", + " for index, key in enumerate(report['Details'][node_id]):\n", + " upper = report['Details'][node_id][key]['upper']\n", + " lower = report['Details'][node_id][key]['lower']\n", + " p75 = report['Details'][node_id][key]['p75']\n", + " p25 = report['Details'][node_id][key]['p25']\n", + " p50 = report['Details'][node_id][key]['p50']\n", + "\n", + " plot.segment(index+1, upper, index+1, p75, line_color=\"black\")\n", + " plot.segment(index+1, lower, index+1, p25, line_color=\"black\")\n", + "\n", + " plot.vbar(index+1, 0.7, p50, p75, fill_color=\"#FDE725\", line_color=\"black\")\n", + " plot.vbar(index+1, 0.7, p25, p50, fill_color=\"#440154\", line_color=\"black\")\n", + "\n", + " plot.rect(index+1, lower, 0.2, 0.01, line_color=\"black\")\n", + " plot.rect(index+1, upper, 0.2, 0.01, line_color=\"black\")\n", + "\n", + " plot.xaxis.major_label_overrides[index+1] = key\n", + " plot.xgrid.grid_line_color = None\n", + " plot.ygrid.grid_line_color = \"white\"\n", + " plot.grid.grid_line_width = 0\n", + "\n", + " plot.xaxis.major_label_text_font_size=\"10px\"\n", + " plot.xaxis.ticker = np.arange(index+2)\n", + " plot.yaxis.axis_label = \"Utilization in %\"\n", + " show(plot)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:02.952514Z", + "iopub.status.busy": "2023-04-10T22:56:02.936324Z", + "iopub.status.idle": "2023-04-10T22:56:03.048663Z", + "shell.execute_reply": 
"2023-04-10T22:56:03.049038Z" + }, + "papermill": { + "duration": 0.149871, + "end_time": "2023-04-10T22:56:03.049178", + "exception": false, + "start_time": "2023-04-10T22:56:02.899307", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "### CPU bottlenecks\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"2ee6b2ea-ec1b-40e1-8779-dd51a9c9b129\":{\"roots\":{\"references\":[{\"attributes\":{\"text\":\"The CPUBottleneck rule checked when the CPU utilization was above cpu_threshold of 90% \\n and GPU utilization was below gpu_threshold of 10%. \\n During initialization utilization is likely to be zero, so the rule skipped the first 1000 datapoints.\\n With this configuration the rule found 447 CPU bottlenecks which is 8% of the total time. This is below the threshold of 50%\\n The rule analysed 5337 data points and triggered 0 times.\",\"width\":900},\"id\":\"3689\",\"type\":\"Paragraph\"}],\"root_ids\":[\"3689\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"2ee6b2ea-ec1b-40e1-8779-dd51a9c9b129\",\"root_ids\":[\"3689\"],\"roots\":{\"3689\":\"e44ac58e-0f34-4b30-a50c-8bd021d088e7\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "3689" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "if analyse_phase == \"training\": \n", + " display(Markdown(\"\"\"### CPU bottlenecks\\n\\n\"\"\"))\n", + "\n", + " report = 
load_report('CPUBottleneck')\n", + " if report:\n", + " params = report['RuleParameters'].split('\\n')\n", + " threshold = int(params[0].split(':')[1])\n", + " cpu_threshold = int(params[1].split(':')[1])\n", + " gpu_threshold = int(params[2].split(':')[1])\n", + " patience = int(params[3].split(':')[1])\n", + " violations = report['Violations']\n", + " triggered = report['RuleTriggered']\n", + " datapoints = report['Datapoints']\n", + " \n", + " if report['Violations'] > 0:\n", + " perc = int(report['Violations']/report['Datapoints']*100)\n", + " else:\n", + " perc = 0\n", + " if perc < threshold:\n", + " string = 'below'\n", + " else:\n", + " string = 'above'\n", + " text = f\"\"\"The CPUBottleneck rule checked when the CPU utilization was above cpu_threshold of {cpu_threshold}% \n", + " and GPU utilization was below gpu_threshold of {gpu_threshold}%. \n", + " During initialization utilization is likely to be zero, so the rule skipped the first {patience} datapoints.\n", + " With this configuration the rule found {violations} CPU bottlenecks which is {perc}% of the total time. 
This is {string} the threshold of {threshold}%\n", + " The rule analysed {datapoints} data points and triggered {triggered} times.\"\"\"\n", + " \n", + " paragraph = Paragraph(text=text, width=900)\n", + " show(paragraph)\n", + " if report:\n", + "\n", + " plots = []\n", + " text = \"\"\n", + " if report['RuleTriggered'] > 0:\n", + "\n", + " low_gpu = report['Details']['low_gpu_utilization']\n", + " cpu_bottleneck = {}\n", + " cpu_bottleneck[\"GPU usage above threshold\"] = report[\"Datapoints\"] - report[\"Details\"][\"low_gpu_utilization\"]\n", + " cpu_bottleneck[\"GPU usage below threshold\"] = report[\"Details\"][\"low_gpu_utilization\"] - len(report[\"Details\"])\n", + " cpu_bottleneck[\"Low GPU usage due to CPU bottlenecks\"] = len(report[\"Details\"][\"bottlenecks\"])\n", + "\n", + " n_bottlenecks = round(len(report['Details']['bottlenecks'])/datapoints * 100, 2)\n", + " text = f\"\"\"The following chart (left) shows how many datapoints were below the gpu_threshold of {gpu_threshold}%\n", + " and how many of those datapoints were likely caused by a CPU bottleneck. The rule found {low_gpu} out of {datapoints} datapoints which had a GPU utilization \n", + " below {gpu_threshold}%. Out of those datapoints {n_bottlenecks}% were likely caused by CPU bottlenecks. 
\n", + " \"\"\"\n", + "\n", + " plot = create_piechart(cpu_bottleneck, \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"Low GPU usage caused by CPU bottlenecks\")\n", + "\n", + " plots.append(plot)\n", + "\n", + " if 'phase' in report['Details']:\n", + " text = f\"\"\"{text} The chart (in the middle) shows whether CPU bottlenecks mainly \n", + " happened during train/validation phase.\n", + " \"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['phase'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"The ratio between time spent on TRAIN/EVAL phase\")\n", + " plots.append(plot)\n", + "\n", + " if 'forward_backward' in report['Details'] and len(report['Details']['forward_backward']) > 0:\n", + "\n", + " event = max(report['Details']['forward_backward'], key=report['Details']['forward_backward'].get)\n", + " perc = report['Details']['forward_backward'][event]\n", + "\n", + " text = f\"\"\"{text} The pie charts on the right shows a more detailed breakdown. \n", + " It shows that {int(perc)}% of the training time was spent on event {event}\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['forward_backward'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"The ratio between forward and backward pass\") \n", + " plots.append(plot)\n", + "\n", + " if len(plots) > 0:\n", + " paragraph = Paragraph(text=text, width=900)\n", + " show(column(paragraph, row(plots)))\n", + "\n", + " plots = []\n", + " text = \"\"\n", + " if 'ratio' in report['Details'] and len(report['Details']['ratio']) > 0:\n", + "\n", + " key = list(report['Details']['ratio'].keys())[0]\n", + " ratio = report['Details']['ratio'][key]\n", + "\n", + " text = f\"\"\"The following pie chart shows a breakdown of the CPU/GPU operators that happened during CPU bottlenecks. 
\n", + " It shows that {int(ratio)}% of the training time was spent on executing operators in \"{key}\".\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['ratio'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"The ratio between CPU/GPU operators\")\n", + " plots.append(plot)\n", + "\n", + "\n", + " if 'general' in report['Details'] and len(report['Details']['general']) > 0:\n", + "\n", + " event = max(report['Details']['general'], key=report['Details']['general'].get)\n", + " perc = report['Details']['general'][event]\n", + " \n", + " plot = create_piechart(report['Details']['general'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"General metrics recorded in framework \")\n", + " plots.append(plot)\n", + "\n", + " if len(plots) > 0:\n", + " paragraph = Paragraph(text=text, width=900)\n", + " show(column(paragraph, row(plots)))\n", + "\n", + " plots = []\n", + " text = \"\"\n", + " if 'horovod' in report['Details'] and len(report['Details']['horovod']) > 0:\n", + "\n", + " event = max(report['Details']['horovod'], key=report['Details']['horovod'].get)\n", + " perc = report['Details']['horovod'][event]\n", + " text = f\"\"\"The following pie chart shows a detailed breakdown of the Horovod metrics \n", + " that have been recorded when the CPU bottleneck happened. 
The most expensive function was \n", + " {event} with {int(perc)}%\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['horovod'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"General metrics recorded in framework \")\n", + "\n", + " paragraph = Paragraph(text=text, width=900)\n", + " show(column(paragraph, row(plot)))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:03.140429Z", + "iopub.status.busy": "2023-04-10T22:56:03.122267Z", + "iopub.status.idle": "2023-04-10T22:56:03.180197Z", + "shell.execute_reply": "2023-04-10T22:56:03.180599Z" + }, + "papermill": { + "duration": 0.096336, + "end_time": "2023-04-10T22:56:03.180737", + "exception": false, + "start_time": "2023-04-10T22:56:03.084401", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "### I/O bottlenecks\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"fbbeed78-ff04-4efc-adbb-c96fdc3f8252\":{\"roots\":{\"references\":[{\"attributes\":{\"text\":\"The IOBottleneck rule checked when I/O wait time was above io_threshold of 50% \\n and GPU utilization was below gpu_threshold of 10. During initialization utilization is likely to be zero, so the rule skipped the first 1000 datapoints. \\n With this configuration the rule found 18 I/O bottlenecks which is 0% of the total time. This is below the threshold of 50%.\\n The rule analysed 5337 datapoints and triggered 0 times.\",\"width\":900},\"id\":\"3857\",\"type\":\"Paragraph\"}],\"root_ids\":[\"3857\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"fbbeed78-ff04-4efc-adbb-c96fdc3f8252\",\"root_ids\":[\"3857\"],\"roots\":{\"3857\":\"6970ac6e-449d-4b6a-9b65-84857b82c90d\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "3857" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "if analyse_phase == \"training\": \n", + " display(Markdown(\"\"\"### I/O bottlenecks\\n\\n\"\"\"))\n", + "\n", + " report = 
load_report('IOBottleneck')\n", + " if report:\n", + " params = report['RuleParameters'].split('\\n')\n", + " threshold = int(params[0].split(':')[1])\n", + " io_threshold = int(params[1].split(':')[1])\n", + " gpu_threshold = int(params[2].split(':')[1])\n", + " patience = int(params[3].split(':')[1])\n", + " violations = report['Violations']\n", + " triggered = report['RuleTriggered']\n", + " datapoints = report['Datapoints']\n", + " \n", + " if report['Violations'] > 0:\n", + " perc = int(report['Violations']/report['Datapoints']*100)\n", + " else:\n", + " perc = 0\n", + " if perc < threshold:\n", + " string = 'below'\n", + " else:\n", + " string = 'above'\n", + " text = f\"\"\"The IOBottleneck rule checked when I/O wait time was above io_threshold of {io_threshold}% \n", + " and GPU utilization was below gpu_threshold of {gpu_threshold}. During initialization utilization is likely to be zero, so the rule skipped the first {patience} datapoints. \n", + " With this configuration the rule found {violations} I/O bottlenecks which is {perc}% of the total time. 
This is {string} the threshold of {threshold}%.\n", + " The rule analysed {datapoints} datapoints and triggered {triggered} times.\"\"\"\n", + " paragraph = Paragraph(text=text, width=900)\n", + " show(paragraph)\n", + " \n", + " if report:\n", + "\n", + " plots = []\n", + " text = \"\"\n", + " if report['RuleTriggered'] > 0:\n", + "\n", + " low_gpu = report['Details']['low_gpu_utilization']\n", + " cpu_bottleneck = {}\n", + " cpu_bottleneck[\"GPU usage above threshold\"] = report[\"Datapoints\"] - report[\"Details\"][\"low_gpu_utilization\"]\n", + " cpu_bottleneck[\"GPU usage below threshold\"] = report[\"Details\"][\"low_gpu_utilization\"] - len(report[\"Details\"])\n", + " cpu_bottleneck[\"Low GPU usage due to I/O bottlenecks\"] = len(report[\"Details\"][\"bottlenecks\"])\n", + "\n", + " n_bottlenecks = round(len(report['Details']['bottlenecks'])/datapoints * 100, 2)\n", + " text = f\"\"\"The following chart (left) shows how many datapoints were below the gpu_threshold of {gpu_threshold}%\n", + " and how many of those datapoints were likely caused by a I/O bottleneck. The rule found {low_gpu} out of {datapoints} datapoints which had a GPU utilization \n", + " below {gpu_threshold}%. Out of those datapoints {n_bottlenecks}% were likely caused by I/O bottlenecks. 
\n", + " \"\"\"\n", + "\n", + " plot = create_piechart(cpu_bottleneck, \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"Low GPU usage caused by I/O bottlenecks\")\n", + "\n", + " plots.append(plot)\n", + "\n", + " if 'phase' in report['Details']:\n", + " text = f\"\"\"{text} The chart (in the middle) shows whether I/O bottlenecks mainly happened during the training or validation phase.\n", + " \"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['phase'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"The ratio between the time spent on the TRAIN/EVAL phase\")\n", + " plots.append(plot)\n", + "\n", + " if 'forward_backward' in report['Details'] and len(report['Details']['forward_backward']) > 0:\n", + "\n", + " event = max(report['Details']['forward_backward'], key=report['Details']['forward_backward'].get)\n", + " perc = report['Details']['forward_backward'][event]\n", + "\n", + " text = f\"\"\"{text} The pie charts on the right shows a more detailed breakdown. \n", + " It shows that {int(perc)}% of the training time was spent on event \"{event}\".\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['forward_backward'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"The ratio between forward and backward pass\") \n", + " plots.append(plot)\n", + "\n", + " if len(plots) > 0:\n", + " paragraph = Paragraph(text=text, width=900)\n", + " show(column(paragraph, row(plots)))\n", + "\n", + " plots = []\n", + " text = \"\"\n", + " if 'ratio' in report['Details'] and len(report['Details']['ratio']) > 0:\n", + "\n", + " key = list(report['Details']['ratio'].keys())[0]\n", + " ratio = report['Details']['ratio'][key]\n", + "\n", + " text = f\"\"\"The following pie chart shows a breakdown of the CPU/GPU operators that happened \n", + " during I/O bottlenecks. 
It shows that {int(ratio)}% of the training time was spent on executing operators in \"{key}\".\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['ratio'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"Ratio between CPU/GPU operators\")\n", + " plots.append(plot)\n", + "\n", + "\n", + " if 'general' in report['Details'] and len(report['Details']['general']) > 0:\n", + "\n", + " event = max(report['Details']['general'], key=report['Details']['general'].get)\n", + " perc = report['Details']['general'][event]\n", + "\n", + " plot = create_piechart(report['Details']['general'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"General metrics recorded in framework \")\n", + " plots.append(plot)\n", + "\n", + " if len(plots) > 0:\n", + " paragraph = Paragraph(text=text, width=900)\n", + " show(column(paragraph, row(plots)))\n", + "\n", + " plots = []\n", + " text = \"\"\n", + " if 'horovod' in report['Details'] and len(report['Details']['horovod']) > 0:\n", + "\n", + " event = max(report['Details']['horovod'], key=report['Details']['horovod'].get)\n", + " perc = report['Details']['horovod'][event]\n", + " text = f\"\"\"The following pie chart shows a detailed breakdown of the Horovod metrics that have been\n", + " recorded when I/O bottleneck happened. 
The most expensive function was {event} with {int(perc)}%\"\"\"\n", + "\n", + " plot = create_piechart(report['Details']['horovod'], \n", + " height=350,\n", + " width=600,\n", + " x1=0.2,\n", + " x2=0.6,\n", + " radius=0.3, \n", + " title=\"General metrics recorded in framework \")\n", + "\n", + " paragraph = Paragraph(text=text, width=900)\n", + " show(column(paragraph, row(plot))) \n" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "execution": { + "iopub.execute_input": "2023-04-10T22:56:03.270700Z", + "iopub.status.busy": "2023-04-10T22:56:03.256940Z", + "iopub.status.idle": "2023-04-10T22:56:03.310296Z", + "shell.execute_reply": "2023-04-10T22:56:03.310683Z" + }, + "papermill": { + "duration": 0.093559, + "end_time": "2023-04-10T22:56:03.310821", + "exception": false, + "start_time": "2023-04-10T22:56:03.217262", + "status": "completed" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "### GPU memory\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"cec4953e-2ac5-4c57-a9fa-2c20c6a0c37c\":{\"roots\":{\"references\":[{\"attributes\":{\"text\":\"The GPUMemoryIncrease rule helps to detect large increase in memory usage on GPUs. \\n The rule checked if the moving average of memory increased by more than 5.0%. \\n So if the moving average increased for instance from 10% to 16.0%, \\n the rule would have triggered. During initialization utilization is likely 0, so the rule skipped the first 1000 datapoints.\\n The moving average was computed on a window size of 10 continuous datapoints. The rule detected 0 violations\\n where the moving average between previous and current time window increased by more than 5.0%.\\n The rule analysed 2660 datapoints and triggered 0 times.\",\"width\":900},\"id\":\"4025\",\"type\":\"Paragraph\"}],\"root_ids\":[\"4025\"]},\"title\":\"Bokeh Application\",\"version\":\"2.2.3\"}};\n", + " var render_items = [{\"docid\":\"cec4953e-2ac5-4c57-a9fa-2c20c6a0c37c\",\"root_ids\":[\"4025\"],\"roots\":{\"4025\":\"2bee40d1-6a68-4d01-90c7-91d0a59a23c9\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " clearInterval(timer);\n", + " embed_document(root);\n", + " } else {\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " clearInterval(timer);\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " }\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": 
"4025" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "if analyse_phase == \"training\":\n", + " display(Markdown(\"\"\"### GPU memory\\n\\n\"\"\"))\n", + " \n", + " report = load_report('GPUMemoryIncrease')\n", + " if report:\n", + " params = report['RuleParameters'].split('\\n')\n", + " increase = float(params[0].split(':')[1])\n", + " patience = params[1].split(':')[1]\n", + " window = params[2].split(':')[1]\n", + " violations = report['Violations']\n", + " triggered = report['RuleTriggered']\n", + " datapoints = report['Datapoints']\n", + " \n", + " text=Paragraph(text=f\"\"\"The GPUMemoryIncrease rule helps to detect large increase in memory usage on GPUs. \n", + " The rule checked if the moving average of memory increased by more than {increase}%. \n", + " So if the moving average increased for instance from 10% to {11+increase}%, \n", + " the rule would have triggered. During initialization utilization is likely 0, so the rule skipped the first {patience} datapoints.\n", + " The moving average was computed on a window size of {window} continuous datapoints. The rule detected {violations} violations\n", + " where the moving average between previous and current time window increased by more than {increase}%.\n", + " The rule analysed {datapoints} datapoints and triggered {triggered} times.\"\"\",\n", + " width=900)\n", + " show(text)\n", + "\n", + " if len(report['Details']) > 0:\n", + " \n", + " timestamp = us_since_epoch_to_human_readable_time(report['Details']['last_timestamp'])\n", + " date = datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S:%f')\n", + " day = date.date().strftime(\"%m/%d/%Y\")\n", + " hour = date.time().strftime(\"%H:%M:%S\")\n", + " text = Paragraph(text=f\"\"\"Your training job triggered memory spikes. \n", + " The last time the GPUMemoryIncrease rule triggered in your training job was on {day} at {hour}.\n", + " The following boxplots are a snapshot from the timestamps. 
They show for each node and GPU the corresponding\n", + " memory utilization (without outliers).\"\"\", width=900)\n", + " show(text)\n", + " \n", + " del report['Details']['last_timestamp']\n", + " \n", + " for node_id in report['Details']:\n", + " \n", + " plot = figure(plot_height=350, \n", + " plot_width=1000,\n", + " toolbar_location='right',\n", + " tools=\"hover,wheel_zoom,reset,pan\", \n", + " title=f\"Node {node_id}\",\n", + " x_range=(0,17),\n", + " )\n", + "\n", + " for index, key in enumerate(report['Details'][node_id]):\n", + " display(Markdown(f\"\"\"**Memory utilization of {key} on node {node_id}:**\"\"\"))\n", + " text = \"\"\n", + " gpu_max = report['Details'][node_id][key]['gpu_max']\n", + " text = f\"\"\"{text} The max memory utilization of {key} on node {node_id} was {gpu_max}%.\"\"\"\n", + " \n", + " p_95 = int(report['Details'][node_id][key]['p95'])\n", + " p_5 = report['Details'][node_id][key]['p05']\n", + " if p_95 < int(50): \n", + " text = f\"\"\"{text} The 95th percentile was only {p_95}%.\"\"\"\n", + " if p_5 < int(5): \n", + " text = f\"\"\"{text} The 5th percentile was only {p_5}%.\"\"\"\n", + " if p_95 - p_5 > 50:\n", + " text = f\"\"\"{text} The difference between 5th percentile {p_5}% and 95th percentile {p_95}% is quite \n", + " significant, which means that memory utilization on {key} is fluctuating quite a lot.\"\"\"\n", + " \n", + " text = Paragraph(text=f\"\"\"{text}\"\"\", width=900)\n", + " show(text)\n", + " \n", + " upper = report['Details'][node_id][key]['upper']\n", + " lower = report['Details'][node_id][key]['lower']\n", + " p75 = report['Details'][node_id][key]['p75']\n", + " p25 = report['Details'][node_id][key]['p25']\n", + " p50 = report['Details'][node_id][key]['p50']\n", + "\n", + " plot.segment(index+1, upper, index+1, p75, line_color=\"black\")\n", + " plot.segment(index+1, lower, index+1, p25, line_color=\"black\")\n", + "\n", + " plot.vbar(index+1, 0.7, p50, p75, fill_color=\"#FDE725\", 
line_color=\"black\")\n", + " plot.vbar(index+1, 0.7, p25, p50, fill_color=\"#440154\", line_color=\"black\")\n", + "\n", + " plot.rect(index+1, lower, 0.2, 0.01, line_color=\"black\")\n", + " plot.rect(index+1, upper, 0.2, 0.01, line_color=\"black\")\n", + "\n", + " plot.xaxis.major_label_overrides[index+1] = key\n", + " plot.xgrid.grid_line_color = None\n", + " plot.ygrid.grid_line_color = \"white\"\n", + " plot.grid.grid_line_width = 0\n", + "\n", + " plot.xaxis.major_label_text_font_size=\"10px\"\n", + " plot.xaxis.ticker = np.arange(index+2)\n", + " plot.yaxis.axis_label = \"Utilization in %\"\n", + " show(plot)" + ] + } + ], + "metadata": { + "celltoolbar": "Tags", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + }, + "papermill": { + "duration": 4.342233, + "end_time": "2023-04-10T22:56:03.653906", + "environment_variables": {}, + "exception": null, + "input_path": "/opt/ml/code/profiler_report.ipynb", + "output_path": "/opt/ml/processing/output/rule/profiler-output/.sagemaker-ignore/out.tmp", + "parameters": { + "processing_job_arn": "arn:aws:sagemaker:us-east-1:598348623909:processing-job/pytorch-training-2023-04-1-profilerreport-644ed05c" + }, + "start_time": "2023-04-10T22:55:59.311673", + "version": "2.1.2" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/starter/ProfilerReports/improved/profiler-reports/BatchSize.json b/starter/ProfilerReports/improved/profiler-reports/BatchSize.json new file mode 100644 index 00000000..b0eb047a --- /dev/null +++ b/starter/ProfilerReports/improved/profiler-reports/BatchSize.json @@ -0,0 +1 @@ +{"RuleTriggered": 28, "Violations": 28, "Details": {"algo-1": {"cpu": {"p25": 
38.32750000000001, "p50": 39.379999999999995, "p75": 41.439375, "p95": 74.747875, "upper": 46.10718749999998, "lower": 33.65968750000002}, "gpu0": {"p25": 28.0, "p50": 28.0, "p75": 38.0, "p95": 48.0, "upper": 53.0, "lower": 13.0}, "gpu0_memory": {"p25": 20.0, "p50": 21.0, "p75": 28.0, "p95": 36.0, "upper": 28.0, "lower": 20.0}}, "last_timestamp": 1681165920000000, "algo-2": {"cpu": {"p25": 38.11749999999999, "p50": 39.2675, "p75": 41.68625, "p95": 74.473, "upper": 47.039375000000014, "lower": 32.76437499999998}, "gpu0": {"p25": 28.0, "p50": 29.0, "p75": 39.25, "p95": 50.0, "upper": 56.125, "lower": 11.125}, "gpu0_memory": {"p25": 20.0, "p50": 21.0, "p75": 28.0, "p95": 37.0, "upper": 28.0, "lower": 20.0}}}, "Datapoints": 2659, "RuleParameters": "cpu_threshold_p95:70\ngpu_threshold_p95:70\ngpu_memory_threshold_p95:70\npatience:1000\nwindow:500"} \ No newline at end of file diff --git a/starter/ProfilerReports/improved/profiler-reports/CPUBottleneck.json b/starter/ProfilerReports/improved/profiler-reports/CPUBottleneck.json new file mode 100644 index 00000000..77453d96 --- /dev/null +++ b/starter/ProfilerReports/improved/profiler-reports/CPUBottleneck.json @@ -0,0 +1 @@ +{"RuleTriggered": 0, "Violations": 447, "Details": {"low_gpu_utilization": 1105, "bottlenecks": {"1681165970.002247": {"GPUs": 1, "CPUs": 1}, "1681165970.502743": {"GPUs": 1, "CPUs": 2}, "1681165971.002226": {"GPUs": 1, "CPUs": 2}, "1681165971.502211": {"GPUs": 1, "CPUs": 4}, "1681165972.502242": {"GPUs": 1, "CPUs": 2}, "1681165973.502614": {"GPUs": 1, "CPUs": 2}, "1681165974.006123": {"GPUs": 1, "CPUs": 4}, "1681165974.502542": {"GPUs": 1, "CPUs": 4}, "1681165975.006108": {"GPUs": 1, "CPUs": 4}, "1681165975.50305": {"GPUs": 1, "CPUs": 4}, "1681165976.002229": {"GPUs": 1, "CPUs": 4}, "1681165976.502592": {"GPUs": 1, "CPUs": 4}, "1681165977.002214": {"GPUs": 1, "CPUs": 4}, "1681165977.502292": {"GPUs": 1, "CPUs": 4}, "1681165978.00222": {"GPUs": 1, "CPUs": 1}, "1681165978.502216": {"GPUs": 1, "CPUs": 
2}, "1681165979.002287": {"GPUs": 1, "CPUs": 4}, "1681165979.504756": {"GPUs": 1, "CPUs": 4}, "1681165970.500245": {"GPUs": 1, "CPUs": 1}, "1681165971.006104": {"GPUs": 1, "CPUs": 2}, "1681165971.500276": {"GPUs": 1, "CPUs": 2}, "1681165972.001018": {"GPUs": 1, "CPUs": 3}, "1681165972.502516": {"GPUs": 1, "CPUs": 1}, "1681165973.000275": {"GPUs": 1, "CPUs": 1}, "1681165974.00211": {"GPUs": 1, "CPUs": 2}, "1681165974.500266": {"GPUs": 1, "CPUs": 3}, "1681165975.00027": {"GPUs": 1, "CPUs": 4}, "1681165975.501564": {"GPUs": 1, "CPUs": 4}, "1681165976.000302": {"GPUs": 1, "CPUs": 4}, "1681165976.511003": {"GPUs": 1, "CPUs": 3}, "1681165977.003017": {"GPUs": 1, "CPUs": 4}, "1681165977.500511": {"GPUs": 1, "CPUs": 4}, "1681165978.001242": {"GPUs": 1, "CPUs": 4}, "1681165978.502539": {"GPUs": 1, "CPUs": 1}, "1681165979.006083": {"GPUs": 1, "CPUs": 4}, "1681165979.502067": {"GPUs": 1, "CPUs": 4}, "1681165980.002229": {"GPUs": 1, "CPUs": 4}, "1681165980.502206": {"GPUs": 1, "CPUs": 4}, "1681165981.002219": {"GPUs": 1, "CPUs": 3}, "1681165981.50222": {"GPUs": 1, "CPUs": 4}, "1681165982.002213": {"GPUs": 1, "CPUs": 4}, "1681165982.506163": {"GPUs": 1, "CPUs": 4}, "1681165983.005486": {"GPUs": 1, "CPUs": 4}, "1681165983.502244": {"GPUs": 1, "CPUs": 4}, "1681165984.002247": {"GPUs": 1, "CPUs": 4}, "1681165984.502244": {"GPUs": 1, "CPUs": 3}, "1681165991.00239": {"GPUs": 1, "CPUs": 1}, "1681165991.505602": {"GPUs": 1, "CPUs": 4}, "1681165992.002936": {"GPUs": 1, "CPUs": 3}, "1681165992.504321": {"GPUs": 1, "CPUs": 4}, "1681165993.002563": {"GPUs": 1, "CPUs": 4}, "1681165993.502249": {"GPUs": 1, "CPUs": 3}, "1681165994.005322": {"GPUs": 1, "CPUs": 4}, "1681165994.505223": {"GPUs": 1, "CPUs": 4}, "1681165995.00337": {"GPUs": 1, "CPUs": 4}, "1681165995.502417": {"GPUs": 1, "CPUs": 4}, "1681165996.005055": {"GPUs": 1, "CPUs": 4}, "1681165996.502219": {"GPUs": 1, "CPUs": 4}, "1681165997.00386": {"GPUs": 1, "CPUs": 4}, "1681165997.502216": {"GPUs": 1, "CPUs": 4}, "1681165998.002247": 
{"GPUs": 1, "CPUs": 4}, "1681165998.509222": {"GPUs": 1, "CPUs": 4}, "1681165999.009509": {"GPUs": 1, "CPUs": 4}, "1681165999.505229": {"GPUs": 1, "CPUs": 4}, "1681166000.004445": {"GPUs": 1, "CPUs": 4}, "1681166000.506006": {"GPUs": 1, "CPUs": 4}, "1681166001.002742": {"GPUs": 1, "CPUs": 4}, "1681166001.507002": {"GPUs": 1, "CPUs": 4}, "1681166002.004289": {"GPUs": 1, "CPUs": 4}, "1681166002.506079": {"GPUs": 1, "CPUs": 4}, "1681166003.002369": {"GPUs": 1, "CPUs": 4}, "1681166003.503123": {"GPUs": 1, "CPUs": 4}, "1681166004.002592": {"GPUs": 1, "CPUs": 4}, "1681166004.503247": {"GPUs": 1, "CPUs": 4}, "1681166005.015774": {"GPUs": 1, "CPUs": 4}, "1681166005.502541": {"GPUs": 1, "CPUs": 4}, "1681166006.006153": {"GPUs": 1, "CPUs": 4}, "1681166006.502206": {"GPUs": 1, "CPUs": 3}, "1681166024.502224": {"GPUs": 1, "CPUs": 1}, "1681166031.50243": {"GPUs": 1, "CPUs": 1}, "1681166036.502226": {"GPUs": 1, "CPUs": 1}, "1681165980.000554": {"GPUs": 1, "CPUs": 4}, "1681165980.500247": {"GPUs": 1, "CPUs": 4}, "1681165981.001151": {"GPUs": 1, "CPUs": 4}, "1681165981.500272": {"GPUs": 1, "CPUs": 1}, "1681165982.001762": {"GPUs": 1, "CPUs": 4}, "1681165982.502166": {"GPUs": 1, "CPUs": 3}, "1681165983.002075": {"GPUs": 1, "CPUs": 4}, "1681165983.50304": {"GPUs": 1, "CPUs": 4}, "1681165984.001249": {"GPUs": 1, "CPUs": 4}, "1681165984.500295": {"GPUs": 1, "CPUs": 4}, "1681165985.000273": {"GPUs": 1, "CPUs": 1}, "1681165995.006819": {"GPUs": 1, "CPUs": 3}, "1681165995.502045": {"GPUs": 1, "CPUs": 1}, "1681165996.006084": {"GPUs": 1, "CPUs": 3}, "1681165996.502073": {"GPUs": 1, "CPUs": 4}, "1681165997.00081": {"GPUs": 1, "CPUs": 4}, "1681165997.502068": {"GPUs": 1, "CPUs": 4}, "1681165998.000291": {"GPUs": 1, "CPUs": 1}, "1681165998.500733": {"GPUs": 1, "CPUs": 4}, "1681165999.001331": {"GPUs": 1, "CPUs": 2}, "1681165999.502153": {"GPUs": 1, "CPUs": 4}, "1681166000.000264": {"GPUs": 1, "CPUs": 4}, "1681166000.503428": {"GPUs": 1, "CPUs": 4}, "1681166001.004782": {"GPUs": 1, "CPUs": 
4}, "1681166001.500263": {"GPUs": 1, "CPUs": 4}, "1681166002.003394": {"GPUs": 1, "CPUs": 4}, "1681166002.502907": {"GPUs": 1, "CPUs": 4}, "1681166003.002062": {"GPUs": 1, "CPUs": 4}, "1681166003.500553": {"GPUs": 1, "CPUs": 4}, "1681166004.000311": {"GPUs": 1, "CPUs": 4}, "1681166004.500328": {"GPUs": 1, "CPUs": 4}, "1681166005.000431": {"GPUs": 1, "CPUs": 4}, "1681166005.50191": {"GPUs": 1, "CPUs": 4}, "1681166006.000497": {"GPUs": 1, "CPUs": 4}, "1681166006.501719": {"GPUs": 1, "CPUs": 4}, "1681166007.000264": {"GPUs": 1, "CPUs": 4}, "1681166007.502072": {"GPUs": 1, "CPUs": 4}, "1681166008.001164": {"GPUs": 1, "CPUs": 4}, "1681166008.500527": {"GPUs": 1, "CPUs": 4}, "1681166009.001538": {"GPUs": 1, "CPUs": 4}, "1681166009.500438": {"GPUs": 1, "CPUs": 4}, "1681166010.003296": {"GPUs": 1, "CPUs": 4}, "1681166010.500274": {"GPUs": 1, "CPUs": 4}, "1681166024.500263": {"GPUs": 1, "CPUs": 1}, "1681166031.500971": {"GPUs": 1, "CPUs": 1}, "1681166037.500268": {"GPUs": 1, "CPUs": 1}, "1681166038.000257": {"GPUs": 1, "CPUs": 1}, "1681166042.502237": {"GPUs": 1, "CPUs": 2}, "1681166044.502216": {"GPUs": 1, "CPUs": 1}, "1681166046.002178": {"GPUs": 1, "CPUs": 1}, "1681166053.002232": {"GPUs": 1, "CPUs": 1}, "1681166054.002217": {"GPUs": 1, "CPUs": 1}, "1681166055.502215": {"GPUs": 1, "CPUs": 1}, "1681166067.502188": {"GPUs": 1, "CPUs": 1}, "1681166068.002185": {"GPUs": 1, "CPUs": 1}, "1681166042.500246": {"GPUs": 1, "CPUs": 2}, "1681166044.000274": {"GPUs": 1, "CPUs": 1}, "1681166053.000254": {"GPUs": 1, "CPUs": 1}, "1681166054.50027": {"GPUs": 1, "CPUs": 1}, "1681166067.000265": {"GPUs": 1, "CPUs": 1}, "1681166098.500278": {"GPUs": 1, "CPUs": 1}, "1681166160.502197": {"GPUs": 1, "CPUs": 1}, "1681166166.002198": {"GPUs": 1, "CPUs": 1}, "1681166166.50223": {"GPUs": 1, "CPUs": 1}, "1681166167.00217": {"GPUs": 1, "CPUs": 1}, "1681166168.502231": {"GPUs": 1, "CPUs": 1}, "1681166169.00223": {"GPUs": 1, "CPUs": 1}, "1681166169.502229": {"GPUs": 1, "CPUs": 1}, "1681166170.002214": 
{"GPUs": 1, "CPUs": 1}, "1681166170.502234": {"GPUs": 1, "CPUs": 1}, "1681166171.002168": {"GPUs": 1, "CPUs": 1}, "1681166171.502243": {"GPUs": 1, "CPUs": 1}, "1681166172.002224": {"GPUs": 1, "CPUs": 1}, "1681166172.502158": {"GPUs": 1, "CPUs": 1}, "1681166173.002215": {"GPUs": 1, "CPUs": 1}, "1681166173.502283": {"GPUs": 1, "CPUs": 1}, "1681166174.002214": {"GPUs": 1, "CPUs": 1}, "1681166174.502179": {"GPUs": 1, "CPUs": 1}, "1681166175.502237": {"GPUs": 1, "CPUs": 1}, "1681166176.002377": {"GPUs": 1, "CPUs": 1}, "1681166176.502206": {"GPUs": 1, "CPUs": 1}, "1681166177.002187": {"GPUs": 1, "CPUs": 1}, "1681166177.502213": {"GPUs": 1, "CPUs": 1}, "1681166178.002156": {"GPUs": 1, "CPUs": 1}, "1681166178.502174": {"GPUs": 1, "CPUs": 1}, "1681166179.002227": {"GPUs": 1, "CPUs": 1}, "1681166160.000268": {"GPUs": 1, "CPUs": 1}, "1681166166.000277": {"GPUs": 1, "CPUs": 1}, "1681166166.500255": {"GPUs": 1, "CPUs": 1}, "1681166167.000273": {"GPUs": 1, "CPUs": 1}, "1681166168.500274": {"GPUs": 1, "CPUs": 1}, "1681166169.000269": {"GPUs": 1, "CPUs": 1}, "1681166169.500251": {"GPUs": 1, "CPUs": 1}, "1681166170.000311": {"GPUs": 1, "CPUs": 1}, "1681166170.500253": {"GPUs": 1, "CPUs": 1}, "1681166171.00027": {"GPUs": 1, "CPUs": 1}, "1681166171.500275": {"GPUs": 1, "CPUs": 1}, "1681166172.000257": {"GPUs": 1, "CPUs": 1}, "1681166172.500278": {"GPUs": 1, "CPUs": 1}, "1681166173.000261": {"GPUs": 1, "CPUs": 1}, "1681166173.500329": {"GPUs": 1, "CPUs": 1}, "1681166174.500267": {"GPUs": 1, "CPUs": 1}, "1681166175.500271": {"GPUs": 1, "CPUs": 1}, "1681166176.000257": {"GPUs": 1, "CPUs": 1}, "1681166176.500277": {"GPUs": 1, "CPUs": 1}, "1681166177.000246": {"GPUs": 1, "CPUs": 1}, "1681166178.000272": {"GPUs": 1, "CPUs": 1}, "1681166178.500256": {"GPUs": 1, "CPUs": 1}, "1681166179.00026": {"GPUs": 1, "CPUs": 1}, "1681166226.50221": {"GPUs": 1, "CPUs": 2}, "1681167132.500251": {"GPUs": 1, "CPUs": 1}, "1681167228.001322": {"GPUs": 1, "CPUs": 1}, "1681167228.500267": {"GPUs": 1, "CPUs": 
2}, "1681167229.001135": {"GPUs": 1, "CPUs": 1}, "1681167229.501035": {"GPUs": 1, "CPUs": 2}, "1681167230.000238": {"GPUs": 1, "CPUs": 2}, "1681167230.501585": {"GPUs": 1, "CPUs": 2}, "1681167231.000238": {"GPUs": 1, "CPUs": 2}, "1681167231.500594": {"GPUs": 1, "CPUs": 1}, "1681167232.000343": {"GPUs": 1, "CPUs": 2}, "1681167232.500286": {"GPUs": 1, "CPUs": 2}, "1681167233.000302": {"GPUs": 1, "CPUs": 1}, "1681167233.500264": {"GPUs": 1, "CPUs": 2}, "1681167234.000266": {"GPUs": 1, "CPUs": 2}, "1681167234.500255": {"GPUs": 1, "CPUs": 2}, "1681167235.000312": {"GPUs": 1, "CPUs": 1}, "1681167235.500267": {"GPUs": 1, "CPUs": 2}, "1681167236.001134": {"GPUs": 1, "CPUs": 2}, "1681167236.500266": {"GPUs": 1, "CPUs": 2}, "1681167237.001094": {"GPUs": 1, "CPUs": 2}, "1681167237.500253": {"GPUs": 1, "CPUs": 1}, "1681167238.000243": {"GPUs": 1, "CPUs": 2}, "1681167238.500249": {"GPUs": 1, "CPUs": 2}, "1681167239.000273": {"GPUs": 1, "CPUs": 2}, "1681167239.500295": {"GPUs": 1, "CPUs": 1}, "1681167242.002215": {"GPUs": 1, "CPUs": 2}, "1681167242.506153": {"GPUs": 1, "CPUs": 2}, "1681167243.003115": {"GPUs": 1, "CPUs": 2}, "1681167243.502258": {"GPUs": 1, "CPUs": 1}, "1681167244.002226": {"GPUs": 1, "CPUs": 2}, "1681167244.502212": {"GPUs": 1, "CPUs": 2}, "1681167245.002231": {"GPUs": 1, "CPUs": 1}, "1681167245.502241": {"GPUs": 1, "CPUs": 1}, "1681167246.0023": {"GPUs": 1, "CPUs": 2}, "1681167246.502264": {"GPUs": 1, "CPUs": 1}, "1681167247.002218": {"GPUs": 1, "CPUs": 2}, "1681167247.502241": {"GPUs": 1, "CPUs": 2}, "1681167248.002244": {"GPUs": 1, "CPUs": 1}, "1681167248.502217": {"GPUs": 1, "CPUs": 2}, "1681167249.002245": {"GPUs": 1, "CPUs": 2}, "1681167249.50227": {"GPUs": 1, "CPUs": 1}, "1681167250.002251": {"GPUs": 1, "CPUs": 1}, "1681167250.502224": {"GPUs": 1, "CPUs": 2}, "1681167251.002168": {"GPUs": 1, "CPUs": 2}, "1681167251.502248": {"GPUs": 1, "CPUs": 2}, "1681167252.002257": {"GPUs": 1, "CPUs": 2}, "1681167252.502177": {"GPUs": 1, "CPUs": 1}, 
"1681167253.002207": {"GPUs": 1, "CPUs": 2}, "1681167253.502186": {"GPUs": 1, "CPUs": 2}, "1681167254.002231": {"GPUs": 1, "CPUs": 2}, "1681167254.502231": {"GPUs": 1, "CPUs": 2}, "1681167255.002219": {"GPUs": 1, "CPUs": 2}, "1681167255.502248": {"GPUs": 1, "CPUs": 2}, "1681167256.002176": {"GPUs": 1, "CPUs": 2}, "1681167256.502207": {"GPUs": 1, "CPUs": 1}, "1681167257.002197": {"GPUs": 1, "CPUs": 2}, "1681167258.002217": {"GPUs": 1, "CPUs": 1}, "1681167258.502247": {"GPUs": 1, "CPUs": 2}, "1681167259.002221": {"GPUs": 1, "CPUs": 2}, "1681167259.502234": {"GPUs": 1, "CPUs": 2}, "1681167260.002252": {"GPUs": 1, "CPUs": 2}, "1681167260.502222": {"GPUs": 1, "CPUs": 1}, "1681167261.002235": {"GPUs": 1, "CPUs": 2}, "1681167261.502244": {"GPUs": 1, "CPUs": 2}, "1681167262.002226": {"GPUs": 1, "CPUs": 2}, "1681167262.502239": {"GPUs": 1, "CPUs": 1}, "1681167263.002212": {"GPUs": 1, "CPUs": 2}, "1681167263.502197": {"GPUs": 1, "CPUs": 2}, "1681167264.002198": {"GPUs": 1, "CPUs": 2}, "1681167264.502214": {"GPUs": 1, "CPUs": 2}, "1681167265.002214": {"GPUs": 1, "CPUs": 2}, "1681167265.502189": {"GPUs": 1, "CPUs": 1}, "1681167266.002211": {"GPUs": 1, "CPUs": 2}, "1681167266.502214": {"GPUs": 1, "CPUs": 2}, "1681167267.00223": {"GPUs": 1, "CPUs": 2}, "1681167267.502259": {"GPUs": 1, "CPUs": 2}, "1681167268.002236": {"GPUs": 1, "CPUs": 1}, "1681167268.502196": {"GPUs": 1, "CPUs": 2}, "1681167269.002218": {"GPUs": 1, "CPUs": 2}, "1681167269.502157": {"GPUs": 1, "CPUs": 2}, "1681167270.502223": {"GPUs": 1, "CPUs": 1}, "1681167271.00222": {"GPUs": 1, "CPUs": 2}, "1681167271.502263": {"GPUs": 1, "CPUs": 1}, "1681167272.002245": {"GPUs": 1, "CPUs": 2}, "1681167272.50224": {"GPUs": 1, "CPUs": 2}, "1681167273.002218": {"GPUs": 1, "CPUs": 1}, "1681167273.502236": {"GPUs": 1, "CPUs": 2}, "1681167274.002233": {"GPUs": 1, "CPUs": 2}, "1681167274.502202": {"GPUs": 1, "CPUs": 1}, "1681167275.002205": {"GPUs": 1, "CPUs": 2}, "1681167275.502231": {"GPUs": 1, "CPUs": 2}, "1681167276.002188": 
{"GPUs": 1, "CPUs": 2}, "1681167276.502223": {"GPUs": 1, "CPUs": 2}, "1681167277.002211": {"GPUs": 1, "CPUs": 2}, "1681167277.502189": {"GPUs": 1, "CPUs": 1}, "1681167278.002217": {"GPUs": 1, "CPUs": 1}, "1681167278.502259": {"GPUs": 1, "CPUs": 2}, "1681167279.002248": {"GPUs": 1, "CPUs": 2}, "1681167279.5022": {"GPUs": 1, "CPUs": 2}, "1681167280.002224": {"GPUs": 1, "CPUs": 2}, "1681167280.502186": {"GPUs": 1, "CPUs": 2}, "1681167281.002271": {"GPUs": 1, "CPUs": 2}, "1681167281.502208": {"GPUs": 1, "CPUs": 2}, "1681167282.002234": {"GPUs": 1, "CPUs": 1}, "1681167282.502173": {"GPUs": 1, "CPUs": 2}, "1681167283.002239": {"GPUs": 1, "CPUs": 2}, "1681167283.502223": {"GPUs": 1, "CPUs": 2}, "1681167284.002186": {"GPUs": 1, "CPUs": 2}, "1681167284.502272": {"GPUs": 1, "CPUs": 1}, "1681167285.002245": {"GPUs": 1, "CPUs": 2}, "1681167285.502173": {"GPUs": 1, "CPUs": 2}, "1681167286.002238": {"GPUs": 1, "CPUs": 2}, "1681167286.502224": {"GPUs": 1, "CPUs": 2}, "1681167287.002217": {"GPUs": 1, "CPUs": 2}, "1681167288.002219": {"GPUs": 1, "CPUs": 2}, "1681167288.502188": {"GPUs": 1, "CPUs": 2}, "1681167289.002217": {"GPUs": 1, "CPUs": 2}, "1681167289.502216": {"GPUs": 1, "CPUs": 2}, "1681167290.002218": {"GPUs": 1, "CPUs": 2}, "1681167290.502169": {"GPUs": 1, "CPUs": 2}, "1681167291.002233": {"GPUs": 1, "CPUs": 2}, "1681167291.502244": {"GPUs": 1, "CPUs": 1}, "1681167292.002238": {"GPUs": 1, "CPUs": 2}, "1681167292.50227": {"GPUs": 1, "CPUs": 2}, "1681167293.002163": {"GPUs": 1, "CPUs": 1}, "1681167293.50221": {"GPUs": 1, "CPUs": 2}, "1681167294.002197": {"GPUs": 1, "CPUs": 2}, "1681167294.502238": {"GPUs": 1, "CPUs": 2}, "1681167295.002218": {"GPUs": 1, "CPUs": 1}, "1681167295.502234": {"GPUs": 1, "CPUs": 2}, "1681167296.002209": {"GPUs": 1, "CPUs": 2}, "1681167296.502224": {"GPUs": 1, "CPUs": 2}, "1681167297.002267": {"GPUs": 1, "CPUs": 2}, "1681167297.502252": {"GPUs": 1, "CPUs": 2}, "1681167298.002197": {"GPUs": 1, "CPUs": 1}, "1681167298.502221": {"GPUs": 1, "CPUs": 1}, 
"1681167299.002227": {"GPUs": 1, "CPUs": 2}, "1681167299.502216": {"GPUs": 1, "CPUs": 1}, "1681167240.000226": {"GPUs": 1, "CPUs": 2}, "1681167240.501124": {"GPUs": 1, "CPUs": 1}, "1681167241.001197": {"GPUs": 1, "CPUs": 1}, "1681167241.500327": {"GPUs": 1, "CPUs": 1}, "1681167242.000314": {"GPUs": 1, "CPUs": 3}, "1681167242.500256": {"GPUs": 1, "CPUs": 1}, "1681167243.001717": {"GPUs": 1, "CPUs": 1}, "1681167243.500853": {"GPUs": 1, "CPUs": 2}, "1681167244.000279": {"GPUs": 1, "CPUs": 2}, "1681167244.500278": {"GPUs": 1, "CPUs": 1}, "1681167245.000505": {"GPUs": 1, "CPUs": 2}, "1681167245.500214": {"GPUs": 1, "CPUs": 2}, "1681167246.000276": {"GPUs": 1, "CPUs": 1}, "1681167246.500277": {"GPUs": 1, "CPUs": 1}, "1681167247.000225": {"GPUs": 1, "CPUs": 2}, "1681167247.500263": {"GPUs": 1, "CPUs": 2}, "1681167248.00026": {"GPUs": 1, "CPUs": 2}, "1681167248.500293": {"GPUs": 1, "CPUs": 1}, "1681167249.000278": {"GPUs": 1, "CPUs": 2}, "1681167249.501152": {"GPUs": 1, "CPUs": 1}, "1681167250.000283": {"GPUs": 1, "CPUs": 2}, "1681167250.500282": {"GPUs": 1, "CPUs": 1}, "1681167251.000276": {"GPUs": 1, "CPUs": 2}, "1681167251.500271": {"GPUs": 1, "CPUs": 1}, "1681167252.000259": {"GPUs": 1, "CPUs": 2}, "1681167252.500244": {"GPUs": 1, "CPUs": 2}, "1681167253.000274": {"GPUs": 1, "CPUs": 2}, "1681167253.501103": {"GPUs": 1, "CPUs": 1}, "1681167254.001073": {"GPUs": 1, "CPUs": 1}, "1681167254.501256": {"GPUs": 1, "CPUs": 1}, "1681167255.000301": {"GPUs": 1, "CPUs": 2}, "1681167255.500287": {"GPUs": 1, "CPUs": 2}, "1681167256.00108": {"GPUs": 1, "CPUs": 1}, "1681167256.500258": {"GPUs": 1, "CPUs": 2}, "1681167257.001154": {"GPUs": 1, "CPUs": 2}, "1681167258.000267": {"GPUs": 1, "CPUs": 1}, "1681167258.500249": {"GPUs": 1, "CPUs": 2}, "1681167259.000226": {"GPUs": 1, "CPUs": 2}, "1681167259.500233": {"GPUs": 1, "CPUs": 2}, "1681167260.00025": {"GPUs": 1, "CPUs": 2}, "1681167260.500266": {"GPUs": 1, "CPUs": 2}, "1681167261.000864": {"GPUs": 1, "CPUs": 2}, "1681167262.001218": 
{"GPUs": 1, "CPUs": 2}, "1681167262.50029": {"GPUs": 1, "CPUs": 2}, "1681167263.000298": {"GPUs": 1, "CPUs": 2}, "1681167263.500325": {"GPUs": 1, "CPUs": 1}, "1681167264.000285": {"GPUs": 1, "CPUs": 1}, "1681167264.501127": {"GPUs": 1, "CPUs": 1}, "1681167265.000256": {"GPUs": 1, "CPUs": 2}, "1681167265.500242": {"GPUs": 1, "CPUs": 2}, "1681167266.000258": {"GPUs": 1, "CPUs": 2}, "1681167266.500262": {"GPUs": 1, "CPUs": 1}, "1681167267.000262": {"GPUs": 1, "CPUs": 2}, "1681167267.500255": {"GPUs": 1, "CPUs": 2}, "1681167268.000291": {"GPUs": 1, "CPUs": 2}, "1681167268.500258": {"GPUs": 1, "CPUs": 2}, "1681167269.000244": {"GPUs": 1, "CPUs": 2}, "1681167269.500275": {"GPUs": 1, "CPUs": 2}, "1681167270.000264": {"GPUs": 1, "CPUs": 1}, "1681167270.501066": {"GPUs": 1, "CPUs": 1}, "1681167271.000282": {"GPUs": 1, "CPUs": 1}, "1681167271.500271": {"GPUs": 1, "CPUs": 2}, "1681167272.00027": {"GPUs": 1, "CPUs": 2}, "1681167272.500261": {"GPUs": 1, "CPUs": 2}, "1681167273.000259": {"GPUs": 1, "CPUs": 2}, "1681167273.500214": {"GPUs": 1, "CPUs": 2}, "1681167274.000252": {"GPUs": 1, "CPUs": 2}, "1681167274.50028": {"GPUs": 1, "CPUs": 1}, "1681167275.000345": {"GPUs": 1, "CPUs": 2}, "1681167275.500295": {"GPUs": 1, "CPUs": 2}, "1681167276.000308": {"GPUs": 1, "CPUs": 1}, "1681167276.500279": {"GPUs": 1, "CPUs": 2}, "1681167277.00028": {"GPUs": 1, "CPUs": 2}, "1681167277.500291": {"GPUs": 1, "CPUs": 1}, "1681167278.000324": {"GPUs": 1, "CPUs": 2}, "1681167278.500253": {"GPUs": 1, "CPUs": 1}, "1681167279.00105": {"GPUs": 1, "CPUs": 2}, "1681167279.500287": {"GPUs": 1, "CPUs": 2}, "1681167280.000277": {"GPUs": 1, "CPUs": 1}, "1681167280.500295": {"GPUs": 1, "CPUs": 2}, "1681167281.000288": {"GPUs": 1, "CPUs": 2}, "1681167281.50278": {"GPUs": 1, "CPUs": 1}, "1681167282.000249": {"GPUs": 1, "CPUs": 2}, "1681167282.500263": {"GPUs": 1, "CPUs": 2}, "1681167283.000255": {"GPUs": 1, "CPUs": 2}, "1681167283.500337": {"GPUs": 1, "CPUs": 1}, "1681167284.000289": {"GPUs": 1, "CPUs": 1}, 
"1681167284.500307": {"GPUs": 1, "CPUs": 1}, "1681167285.000275": {"GPUs": 1, "CPUs": 1}, "1681167285.500252": {"GPUs": 1, "CPUs": 2}, "1681167286.000291": {"GPUs": 1, "CPUs": 2}, "1681167286.50032": {"GPUs": 1, "CPUs": 1}, "1681167287.000307": {"GPUs": 1, "CPUs": 2}, "1681167287.500252": {"GPUs": 1, "CPUs": 1}, "1681167288.00026": {"GPUs": 1, "CPUs": 1}, "1681167288.501178": {"GPUs": 1, "CPUs": 2}, "1681167289.000216": {"GPUs": 1, "CPUs": 2}, "1681167289.500288": {"GPUs": 1, "CPUs": 2}, "1681167290.000297": {"GPUs": 1, "CPUs": 2}, "1681167290.500274": {"GPUs": 1, "CPUs": 2}, "1681167291.000404": {"GPUs": 1, "CPUs": 2}, "1681167291.500265": {"GPUs": 1, "CPUs": 1}, "1681167292.001157": {"GPUs": 1, "CPUs": 1}, "1681167292.50117": {"GPUs": 1, "CPUs": 1}, "1681167293.001059": {"GPUs": 1, "CPUs": 2}, "1681167293.500246": {"GPUs": 1, "CPUs": 2}, "1681167294.000247": {"GPUs": 1, "CPUs": 2}, "1681167294.500249": {"GPUs": 1, "CPUs": 2}, "1681167295.000321": {"GPUs": 1, "CPUs": 1}, "1681167295.50029": {"GPUs": 1, "CPUs": 1}, "1681167296.000706": {"GPUs": 1, "CPUs": 2}, "1681167296.500282": {"GPUs": 1, "CPUs": 1}, "1681167297.000258": {"GPUs": 1, "CPUs": 2}, "1681167297.500296": {"GPUs": 1, "CPUs": 1}, "1681167298.001753": {"GPUs": 1, "CPUs": 1}, "1681167298.501175": {"GPUs": 1, "CPUs": 1}, "1681167299.001176": {"GPUs": 1, "CPUs": 2}, "1681167299.500277": {"GPUs": 1, "CPUs": 1}}}, "Datapoints": 5337, "RuleParameters": "threshold:50\ncpu_threshold:90\ngpu_threshold:10\npatience:1000"} \ No newline at end of file diff --git a/starter/ProfilerReports/improved/profiler-reports/Dataloader.json b/starter/ProfilerReports/improved/profiler-reports/Dataloader.json new file mode 100644 index 00000000..ac63f3ed --- /dev/null +++ b/starter/ProfilerReports/improved/profiler-reports/Dataloader.json @@ -0,0 +1 @@ +{"RuleTriggered": 1, "Violations": 19, "Details": {"pin_memory": false, "num_workers": 0, "cores": 4, "dataloaders": 2, "dataloading_time": {"p25": 0.093944, "p50": 0.104068, 
"p95": 0.129339, "probs": [16, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 19, 45, 50, 61, 107, 153, 209, 258, 284, 361, 439, 523, 613, 667, 780, 844, 846, 900, 845, 786, 776, 697, 567, 498, 383, 312, 255, 190, 146, 115, 64, 57, 47, 23, 21, 12, 7, 10, 8, 2, 5, 3, 2, 4, 1, 1, 4, 1, 1, 1, 2, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], "binedges": [0.016137, 0.018631309999999998, 0.021125619999999998, 0.023619929999999997, 0.026114239999999997, 0.028608549999999996, 0.03110286, 0.033597169999999996, 0.036091479999999995, 0.038585789999999995, 0.041080099999999994, 0.043574409999999994, 0.04606872, 0.04856303, 0.05105734, 0.05355165, 0.05604596, 0.05854027, 0.06103458, 0.06352889, 0.0660232, 0.06851751, 0.07101182, 0.07350613, 0.07600044, 0.07849475, 0.08098906, 0.08348337, 0.08597768, 0.08847199, 0.0909663, 0.09346061, 0.09595492, 0.09844923, 0.10094354, 0.10343785, 0.10593216, 0.10842647, 0.11092078, 0.11341509, 0.1159094, 0.11840371, 0.12089802, 0.12339233, 0.12588664, 0.12838095, 0.13087526, 0.13336957, 0.13586388, 0.13835819, 0.1408525, 0.14334681, 0.14584112, 0.14833543, 0.15082974, 0.15332405, 0.15581836, 0.15831267, 0.16080698, 0.16330129, 0.1657956, 0.16828991, 0.17078422, 0.17327853, 0.17577283999999999, 0.17826714999999999, 0.18076145999999998, 0.18325576999999998, 0.18575007999999998, 0.18824438999999998, 0.19073869999999998, 0.19323300999999998, 0.19572731999999998, 0.19822162999999998, 0.20071593999999998, 0.20321024999999998, 0.20570455999999998, 0.20819886999999998, 0.21069317999999998, 0.21318748999999998, 0.21568179999999998, 0.21817610999999998, 0.22067041999999998, 0.22316472999999998, 0.22565903999999998, 0.22815334999999998, 0.23064765999999998, 0.23314196999999998, 0.23563627999999998, 0.23813058999999998, 0.24062489999999997, 0.24311920999999997, 0.24561351999999997, 0.24810782999999997, 0.25060214, 0.25309645, 0.25559076000000003, 0.25808507, 0.26057938, 0.26307369, 0.265568]}}, 
"Datapoints": 13041, "RuleParameters": "min_threshold:70\nmax_threshold:200"} \ No newline at end of file diff --git a/starter/ProfilerReports/improved/profiler-reports/GPUMemoryIncrease.json b/starter/ProfilerReports/improved/profiler-reports/GPUMemoryIncrease.json new file mode 100644 index 00000000..d229a4d9 --- /dev/null +++ b/starter/ProfilerReports/improved/profiler-reports/GPUMemoryIncrease.json @@ -0,0 +1 @@ +{"RuleTriggered": 0, "Violations": 0, "Details": {}, "Datapoints": 2660, "RuleParameters": "increase:5\npatience:1000\nwindow:10"} \ No newline at end of file diff --git a/starter/ProfilerReports/improved/profiler-reports/IOBottleneck.json b/starter/ProfilerReports/improved/profiler-reports/IOBottleneck.json new file mode 100644 index 00000000..443eab1e --- /dev/null +++ b/starter/ProfilerReports/improved/profiler-reports/IOBottleneck.json @@ -0,0 +1 @@ +{"RuleTriggered": 0, "Violations": 18, "Details": {"low_gpu_utilization": 1105, "bottlenecks": {"1681165978.502539": {"GPUs": 1, "CPUs": 1}, "1681166018.502198": {"GPUs": 1, "CPUs": 1}, "1681166142.002224": {"GPUs": 1, "CPUs": 1}, "1681166155.002197": {"GPUs": 1, "CPUs": 1}, "1681166157.0022": {"GPUs": 1, "CPUs": 1}, "1681166159.002208": {"GPUs": 1, "CPUs": 1}, "1681166159.502192": {"GPUs": 1, "CPUs": 1}, "1681166154.500253": {"GPUs": 1, "CPUs": 1}, "1681166156.500272": {"GPUs": 1, "CPUs": 1}, "1681166157.000248": {"GPUs": 1, "CPUs": 1}, "1681166157.500248": {"GPUs": 1, "CPUs": 1}, "1681166160.002206": {"GPUs": 1, "CPUs": 1}, "1681166160.502197": {"GPUs": 1, "CPUs": 1}, "1681166163.502207": {"GPUs": 1, "CPUs": 1}, "1681166160.000268": {"GPUs": 1, "CPUs": 1}, "1681166161.000239": {"GPUs": 1, "CPUs": 1}, "1681166163.000248": {"GPUs": 1, "CPUs": 1}, "1681166163.500276": {"GPUs": 1, "CPUs": 1}}}, "Datapoints": 5337, "RuleParameters": "threshold:50\nio_threshold:50\ngpu_threshold:10\npatience:1000"} \ No newline at end of file diff --git a/starter/ProfilerReports/improved/profiler-reports/LoadBalancing.json 
b/starter/ProfilerReports/improved/profiler-reports/LoadBalancing.json new file mode 100644 index 00000000..962d88bf --- /dev/null +++ b/starter/ProfilerReports/improved/profiler-reports/LoadBalancing.json @@ -0,0 +1 @@ +{"RuleTriggered": 0, "Violations": 0, "Details": {"algo-1": {"workloads": {"gpu0": [530, 3, 4, 2, 1, 0, 0, 4, 0, 0, 1, 1, 24, 24, 937, 99, 119, 115, 120, 114, 125, 100, 110, 76, 71, 50, 19, 10, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}}, "algo-2": {"workloads": {"gpu0": [556, 1, 2, 5, 1, 0, 0, 0, 0, 0, 0, 2, 7, 24, 860, 105, 117, 88, 122, 105, 114, 119, 103, 106, 87, 78, 42, 11, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}}}, "Datapoints": 2660, "RuleParameters": "threshold:0.2\npatience:1000"} \ No newline at end of file diff --git a/starter/ProfilerReports/improved/profiler-reports/LowGPUUtilization.json b/starter/ProfilerReports/improved/profiler-reports/LowGPUUtilization.json new file mode 100644 index 00000000..cbfe1e2a --- /dev/null +++ b/starter/ProfilerReports/improved/profiler-reports/LowGPUUtilization.json @@ -0,0 +1 @@ +{"RuleTriggered": 28, "Violations": 28, "Details": {"algo-1": {"gpu0": {"gpu_max": 56.0, "gpu_95": 47.0, "gpu_5": 0.0, "p25": 28.0, "p50": 28.0, "p75": 38.0, "p95": 48.0, "upper": 53.0, "lower": 13.0}}, "last_timestamp": 1681167240000000, "algo-2": {"gpu0": {"gpu_max": 58.0, "gpu_95": 51.0, "gpu_5": 0.0, "p25": 28.0, "p50": 29.0, "p75": 39.25, "p95": 50.0, "upper": 56.125, "lower": 11.125}}}, "Datapoints": 2660, "RuleParameters": "threshold_p95:70\nthreshold_p5:10\nwindow:500\npatience:1000"} \ No newline at end of file diff --git a/starter/ProfilerReports/improved/profiler-reports/MaxInitializationTime.json b/starter/ProfilerReports/improved/profiler-reports/MaxInitializationTime.json new file mode 100644 index 00000000..f1eb1aea --- /dev/null +++ b/starter/ProfilerReports/improved/profiler-reports/MaxInitializationTime.json @@ -0,0 +1 @@ +{"RuleTriggered": 0, "Violations": 0, 
"Details": {"step_num": {}, "job_start": 1681165969.502138, "job_end": 1681167299.503495}, "Datapoints": 0, "RuleParameters": "threshold:20"} \ No newline at end of file diff --git a/starter/ProfilerReports/improved/profiler-reports/OverallFrameworkMetrics.json b/starter/ProfilerReports/improved/profiler-reports/OverallFrameworkMetrics.json new file mode 100644 index 00000000..4b9fc278 --- /dev/null +++ b/starter/ProfilerReports/improved/profiler-reports/OverallFrameworkMetrics.json @@ -0,0 +1 @@ +{"RuleTriggered": 0, "Violations": 0, "Details": {"ratio": {}, "phase": {}, "phase_time": {}, "general": {"DataLoaderIterInitialize": 0.0010734443129493596, "DataLoaderIter": 99.99892655568705}}, "Datapoints": 0, "RuleParameters": ""} \ No newline at end of file diff --git a/starter/ProfilerReports/improved/profiler-reports/OverallSystemUsage.json b/starter/ProfilerReports/improved/profiler-reports/OverallSystemUsage.json new file mode 100644 index 00000000..bddfaa04 --- /dev/null +++ b/starter/ProfilerReports/improved/profiler-reports/OverallSystemUsage.json @@ -0,0 +1 @@ +{"RuleTriggered": 0, "Violations": 0, "Details": {"Network": {"algo-1": {"max": 93824135.4, "p99": 0, "p95": 0, "p50": 0, "min": 0}, "algo-2": {"max": 100261086.28, "p99": 0, "p95": 0, "p50": 0, "min": 0}}, "GPU": {"algo-1": {"max": 56.0, "p99": 52.0, "p95": 48.0, "p50": 28.0, "min": 0}, "algo-2": {"max": 58.0, "p99": 53.0, "p95": 50.0, "p50": 29.0, "min": 0}}, "CPU": {"algo-1": {"max": 100.0, "p99": 96.93, "p95": 74.75, "p50": 39.38, "min": 0}, "algo-2": {"max": 99.49, "p99": 96.97, "p95": 74.47, "p50": 39.27, "min": 0}}, "CPU memory": {"algo-1": {"max": 32.9, "p99": 31.77, "p95": 30.05, "p50": 29.68, "min": 4.45}, "algo-2": {"max": 32.84, "p99": 31.68, "p95": 30.87, "p50": 29.76, "min": 4.34}}, "GPU memory": {"algo-1": {"max": 42.0, "p99": 39.0, "p95": 36.0, "p50": 21.0, "min": 0}, "algo-2": {"max": 41.0, "p99": 39.0, "p95": 37.0, "p50": 21.0, "min": 0}}, "I/O": {"algo-1": {"max": 40.01, "p99": 
24.78, "p95": 15.06, "p50": 0, "min": 0}, "algo-2": {"max": 42.88, "p99": 25.03, "p95": 14.49, "p50": 0, "min": 0}}}, "Datapoints": 2660, "RuleParameters": ""} \ No newline at end of file diff --git a/starter/ProfilerReports/improved/profiler-reports/StepOutlier.json b/starter/ProfilerReports/improved/profiler-reports/StepOutlier.json new file mode 100644 index 00000000..e024a30e --- /dev/null +++ b/starter/ProfilerReports/improved/profiler-reports/StepOutlier.json @@ -0,0 +1 @@ +{"RuleTriggered": 0, "Violations": 0, "Details": {"step_details": {}}, "Datapoints": 0, "RuleParameters": "threshold:3\nmode:None\nn_outliers:10\nstddev:3"} \ No newline at end of file