From e86d0c6d38d65158836c275e2d7b6a8f8e92f026 Mon Sep 17 00:00:00 2001 From: Felipe Vieira Frujeri <55607890+microsoft-fevieira@users.noreply.github.com> Date: Mon, 6 Nov 2023 13:28:08 -0800 Subject: [PATCH 01/58] Refactor deepspeed-chat into a python package. (#731) Co-authored-by: Ammar Ahmad Awan Co-authored-by: Michael Santacroce Co-authored-by: Lev Kurilenko Co-authored-by: Lev Kurilenko <113481193+lekurile@users.noreply.github.com> --- applications/DeepSpeed-Chat/.gitignore | 137 ++++++++++++++++++ applications/DeepSpeed-Chat/README.md | 15 +- .../rlhf}/ppo_trainer.py | 7 +- .../rlhf}/rlhf_engine.py | 8 +- .../utils/data/data_utils.py | 2 +- .../utils/data/raw_datasets.py | 0 .../{training => dschat}/utils/ds_utils.py | 0 .../utils/model/model_utils.py | 5 +- .../utils/model/reward_model.py | 0 .../{training => dschat}/utils/module/lora.py | 0 .../{training => dschat}/utils/perf.py | 0 .../{training => dschat}/utils/utils.py | 0 .../DeepSpeed-Chat/{train.py => e2e_rlhf.py} | 0 applications/DeepSpeed-Chat/setup.py | 32 ++++ .../{training => }/tests/test_training.py | 4 +- .../step1_supervised_finetuning/main.py | 16 +- .../prompt_eval.py | 8 +- .../step2_reward_model_finetuning/main.py | 14 +- .../step2_reward_model_finetuning/rw_eval.py | 10 +- .../training/step3_rlhf_finetuning/main.py | 19 +-- 20 files changed, 212 insertions(+), 65 deletions(-) create mode 100644 applications/DeepSpeed-Chat/.gitignore rename applications/DeepSpeed-Chat/{training/step3_rlhf_finetuning => dschat/rlhf}/ppo_trainer.py (99%) rename applications/DeepSpeed-Chat/{training/step3_rlhf_finetuning => dschat/rlhf}/rlhf_engine.py (97%) rename applications/DeepSpeed-Chat/{training => dschat}/utils/data/data_utils.py (99%) rename applications/DeepSpeed-Chat/{training => dschat}/utils/data/raw_datasets.py (100%) rename applications/DeepSpeed-Chat/{training => dschat}/utils/ds_utils.py (100%) rename applications/DeepSpeed-Chat/{training => dschat}/utils/model/model_utils.py (97%) rename applications/DeepSpeed-Chat/{training => dschat}/utils/model/reward_model.py (100%) rename applications/DeepSpeed-Chat/{training => dschat}/utils/module/lora.py (100%) rename applications/DeepSpeed-Chat/{training => dschat}/utils/perf.py (100%) rename applications/DeepSpeed-Chat/{training => dschat}/utils/utils.py (100%) rename applications/DeepSpeed-Chat/{train.py => e2e_rlhf.py} (100%) create mode 100644 applications/DeepSpeed-Chat/setup.py rename applications/DeepSpeed-Chat/{training => }/tests/test_training.py (97%) diff --git a/applications/DeepSpeed-Chat/.gitignore b/applications/DeepSpeed-Chat/.gitignore new file mode 100644 index 000000000..cbc922f8f --- /dev/null +++ b/applications/DeepSpeed-Chat/.gitignore @@ -0,0 +1,137 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +results/ +outputs/ + +.amltconfig +.test_output +*.hdf5 +*.h5 diff --git a/applications/DeepSpeed-Chat/README.md b/applications/DeepSpeed-Chat/README.md index c0a456823..ac9f3ab6a 100644 --- a/applications/DeepSpeed-Chat/README.md +++ b/applications/DeepSpeed-Chat/README.md @@ -33,20 +33,24 @@ A fast, affordable, scalable and open system framework for enabling end-to-end R ## Table of Contents +- [🐕DeepSpeed-Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales🐕](#deepspeed-chat-easy-fast-and-affordable-rlhf-training-of-chatgpt-like-models-at-all-scales) +- [Table of Contents](#table-of-contents) - [📰 Latest News 📰](#-latest-news-) -- [🚀 What is DeepSpeed Chat 🚀️](#-what-is-deepspeed-chat-) +- [🚀 What is DeepSpeed Chat 🚀](#-what-is-deepspeed-chat-) - [🧨 Capabilities 🧨](#-capabilities-) - [☕ Quick Start ☕](#-quick-start-) - [🐼 Installation](#-installation) - - [🐼 Single Script for Training 3-Step RLHF Pipeline](#-one-single-script-completes-all-three-stages-of-rlhf-training-and-generate-your-first-chatgpt-model) + - [🐼 One Single Script Completes All Three Steps of RLHF Training and Generate Your First ChatGPT Model](#-one-single-script-completes-all-three-steps-of-rlhf-training-and-generate-your-first-chatgpt-model) - [🐼 Demonstration: Individual Step Fine-Tuning](#-demonstration-individual-step-fine-tuning) - [🕐 Step 1 - Supervised Fine-Tuning](#-step-1---supervised-fine-tuning) - [🕑 Step 2 - Reward Model](#-step-2---reward-model) - [🕒 Step 3 - Reinforcement Learning with Human Feedback](#-step-3---reinforcement-learning-with-human-feedback) - - [🐼 Adding and using your own datasets in DeepSpeed-Chat](#-adding-and-using-your-own-datasets-in-deepspeed-chat) - - [🐼 Customizing RLHF training pipeline via DeepSpeed-Chat’s APIs](#-customizing-your-own-rlhf-training-pipeline-using-deepspeed-chats-rlhf-apis) - - [🐼 Serving Your Model: Plug-in and Test!](#-serving-plug-in-your-final-model-trained-by-deepspeed-chat-and-test-it-out) + - [🐼 Adding and using your own datasets in DeepSpeed-Chat](#-adding-and-using-your-own-datasets-in-deepspeed-chat) + - [🐼 Customizing your own 
RLHF training pipeline using DeepSpeed-Chat’s RLHF APIs](#-customizing-your-own-rlhf-training-pipeline-using-deepspeed-chats-rlhf-apis) + - [🐼 Serving: Plug-in your final model trained by DeepSpeed-Chat and test it out!](#-serving-plug-in-your-final-model-trained-by-deepspeed-chat-and-test-it-out) - [🔥 Training Performance Evaluation 🔥](#-training-performance-evaluation-) + - [🐲 Superior Model Scale and Low Training Cost](#-superior-model-scale-and-low-training-cost) + - [🐲 Throughput and Model Size Scalability Comparisons with Existing RLHF Systems](#-throughput-and-model-size-scalability-comparisons-with-existing-rlhf-systems) - [😽 Supported Models 😽](#-supported-models-) - [🔬 Build Pipeline Status 🔬](#-build-pipeline-status-) - [⚓ Documentation and Tutorial ⚓](#-documentation-and-tutorial-) @@ -119,6 +123,7 @@ pip install deepspeed>=0.9.0 git clone https://github.com/microsoft/DeepSpeedExamples.git cd DeepSpeedExamples/applications/DeepSpeed-Chat/ pip install -r requirements.txt +pip install -e . ``` ### 🐼 One Single Script Completes All Three Steps of RLHF Training and Generate Your First ChatGPT Model diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/ppo_trainer.py b/applications/DeepSpeed-Chat/dschat/rlhf/ppo_trainer.py similarity index 99% rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/ppo_trainer.py rename to applications/DeepSpeed-Chat/dschat/rlhf/ppo_trainer.py index 2a5056cfe..22cba6be0 100644 --- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/ppo_trainer.py +++ b/applications/DeepSpeed-Chat/dschat/rlhf/ppo_trainer.py @@ -4,17 +4,12 @@ # DeepSpeed Team import torch import torch.nn.functional as F -import sys -import os import time import deepspeed from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus from deepspeed.accelerator import get_accelerator -sys.path.append( - os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) - -from utils.utils import print_rank_0 +from dschat.utils.utils import print_rank_0 def print_all_ranks(tag, value, rank): diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/rlhf_engine.py b/applications/DeepSpeed-Chat/dschat/rlhf/rlhf_engine.py similarity index 97% rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/rlhf_engine.py rename to applications/DeepSpeed-Chat/dschat/rlhf/rlhf_engine.py index 3a192d017..5b6778cc2 100755 --- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/rlhf_engine.py +++ b/applications/DeepSpeed-Chat/dschat/rlhf/rlhf_engine.py @@ -9,10 +9,10 @@ from deepspeed.ops.adam import DeepSpeedCPUAdam from transformers import AutoModelForCausalLM, get_scheduler -from utils.ds_utils import get_train_ds_config, get_eval_ds_config -from utils.module.lora import convert_linear_layer_to_lora, only_optimize_lora_parameters, make_model_gradient_checkpointing_compatible -from utils.model.model_utils import create_hf_model, create_critic_model -from utils.utils import get_optimizer_grouped_parameters +from dschat.utils.ds_utils import get_train_ds_config, get_eval_ds_config +from dschat.utils.module.lora import convert_linear_layer_to_lora, only_optimize_lora_parameters, make_model_gradient_checkpointing_compatible +from dschat.utils.model.model_utils import create_hf_model, create_critic_model +from dschat.utils.utils import get_optimizer_grouped_parameters """ TODOs: * support HF models for critic (for debugging), must be a previously saved ckpt from step-2 diff --git 
a/applications/DeepSpeed-Chat/training/utils/data/data_utils.py b/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py similarity index 99% rename from applications/DeepSpeed-Chat/training/utils/data/data_utils.py rename to applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py index 277ebd730..744ad7927 100644 --- a/applications/DeepSpeed-Chat/training/utils/data/data_utils.py +++ b/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py @@ -14,7 +14,7 @@ import os import hashlib from itertools import chain -from . import raw_datasets +from dschat.utils.data import raw_datasets from deepspeed.accelerator import get_accelerator diff --git a/applications/DeepSpeed-Chat/training/utils/data/raw_datasets.py b/applications/DeepSpeed-Chat/dschat/utils/data/raw_datasets.py similarity index 100% rename from applications/DeepSpeed-Chat/training/utils/data/raw_datasets.py rename to applications/DeepSpeed-Chat/dschat/utils/data/raw_datasets.py diff --git a/applications/DeepSpeed-Chat/training/utils/ds_utils.py b/applications/DeepSpeed-Chat/dschat/utils/ds_utils.py similarity index 100% rename from applications/DeepSpeed-Chat/training/utils/ds_utils.py rename to applications/DeepSpeed-Chat/dschat/utils/ds_utils.py diff --git a/applications/DeepSpeed-Chat/training/utils/model/model_utils.py b/applications/DeepSpeed-Chat/dschat/utils/model/model_utils.py similarity index 97% rename from applications/DeepSpeed-Chat/training/utils/model/model_utils.py rename to applications/DeepSpeed-Chat/dschat/utils/model/model_utils.py index 27e5720ca..97d3bff15 100644 --- a/applications/DeepSpeed-Chat/training/utils/model/model_utils.py +++ b/applications/DeepSpeed-Chat/dschat/utils/model/model_utils.py @@ -12,9 +12,8 @@ from huggingface_hub import snapshot_download from transformers.deepspeed import HfDeepSpeedConfig -from .reward_model import RewardModel -from ..utils import load_state_dict_into_model -from ..utils import print_rank_0 +from dschat.utils.model.reward_model import RewardModel +from dschat.utils.utils import load_state_dict_into_model, print_rank_0 def configure_dropout(model_config, dropout): diff --git a/applications/DeepSpeed-Chat/training/utils/model/reward_model.py b/applications/DeepSpeed-Chat/dschat/utils/model/reward_model.py similarity index 100% rename from applications/DeepSpeed-Chat/training/utils/model/reward_model.py rename to applications/DeepSpeed-Chat/dschat/utils/model/reward_model.py diff --git a/applications/DeepSpeed-Chat/training/utils/module/lora.py b/applications/DeepSpeed-Chat/dschat/utils/module/lora.py similarity index 100% rename from applications/DeepSpeed-Chat/training/utils/module/lora.py rename to applications/DeepSpeed-Chat/dschat/utils/module/lora.py diff --git a/applications/DeepSpeed-Chat/training/utils/perf.py b/applications/DeepSpeed-Chat/dschat/utils/perf.py similarity index 100% rename from applications/DeepSpeed-Chat/training/utils/perf.py rename to applications/DeepSpeed-Chat/dschat/utils/perf.py diff --git a/applications/DeepSpeed-Chat/training/utils/utils.py b/applications/DeepSpeed-Chat/dschat/utils/utils.py similarity index 100% rename from applications/DeepSpeed-Chat/training/utils/utils.py rename to applications/DeepSpeed-Chat/dschat/utils/utils.py diff --git a/applications/DeepSpeed-Chat/train.py b/applications/DeepSpeed-Chat/e2e_rlhf.py similarity index 100% rename from applications/DeepSpeed-Chat/train.py rename to applications/DeepSpeed-Chat/e2e_rlhf.py diff --git a/applications/DeepSpeed-Chat/setup.py 
b/applications/DeepSpeed-Chat/setup.py new file mode 100644 index 000000000..343f5b476 --- /dev/null +++ b/applications/DeepSpeed-Chat/setup.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# setup.py: install script for deepspeed_chat +""" +to install deepspeed_chat and its dependencies for development work, +run this cmd from the root directory: + pip install -e . +""" +import setuptools + +setuptools.setup( + name="deepspeed-chat", + version="0.1", + url= + "https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat", + include_package_data=True, + packages=setuptools.find_packages(include=['dschat']), + install_requires=[ + "datasets>=2.8.0", "sentencepiece>=0.1.97", "protobuf==3.20.3", + "accelerate>=0.15.0", "torch>=1.12.0", "deepspeed>=0.9.2", + "transformers", "tensorboard" + ], + extras_require={ + "azureml": [ + "azure-ml-component", + "azureml-core", + ], + }) diff --git a/applications/DeepSpeed-Chat/training/tests/test_training.py b/applications/DeepSpeed-Chat/tests/test_training.py similarity index 97% rename from applications/DeepSpeed-Chat/training/tests/test_training.py rename to applications/DeepSpeed-Chat/tests/test_training.py index 3be4f6ff6..7ffe02972 100644 --- a/applications/DeepSpeed-Chat/training/tests/test_training.py +++ b/applications/DeepSpeed-Chat/tests/test_training.py @@ -66,7 +66,7 @@ def test_ds_chat(zero_stage, hybrid_engine, offload, lora): # cd into execution dir wd = os.getcwd() - os.chdir("../step3_rlhf_finetuning") + os.chdir("../training/step3_rlhf_finetuning") sweep_script = "training_scripts/opt/single_node/sweep/run_single.sh" # Run bash script @@ -85,3 +85,5 @@ def test_ds_chat(zero_stage, hybrid_engine, offload, lora): ), "Actor model was not saved during step 3 training." assert file_exists(f"{output_path}/critic/", "pytorch_model.bin" ), "Critic model was not saved during step 3 training." 
+ + os.chdir(wd) diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py index 0332e2e3f..c37d1f4cd 100755 --- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py +++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py @@ -4,9 +4,7 @@ # DeepSpeed Team import argparse -import os import math -import sys import torch from torch.utils.data import DataLoader, RandomSampler, SequentialSampler @@ -23,14 +21,12 @@ from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam from deepspeed import get_accelerator -sys.path.append( - os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) -from utils.data.data_utils import create_prompt_dataset -from utils.utils import print_rank_0, to_device, save_hf_format, set_random_seed, get_all_reduce_mean, get_optimizer_grouped_parameters, save_zero_three_model, load_hf_tokenizer -from utils.ds_utils import get_train_ds_config -from utils.module.lora import convert_linear_layer_to_lora, convert_lora_to_linear_layer, only_optimize_lora_parameters, make_model_gradient_checkpointing_compatible -from utils.model.model_utils import create_hf_model, causal_lm_model_to_fp32_loss -from utils.perf import print_throughput +from dschat.utils.data.data_utils import create_prompt_dataset +from dschat.utils.utils import print_rank_0, to_device, save_hf_format, set_random_seed, get_all_reduce_mean, get_optimizer_grouped_parameters, save_zero_three_model, load_hf_tokenizer +from dschat.utils.ds_utils import get_train_ds_config +from dschat.utils.module.lora import convert_linear_layer_to_lora, convert_lora_to_linear_layer, only_optimize_lora_parameters, make_model_gradient_checkpointing_compatible +from dschat.utils.model.model_utils import create_hf_model, causal_lm_model_to_fp32_loss +from dschat.utils.perf import print_throughput def parse_args(): diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/prompt_eval.py b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/prompt_eval.py index 21f79d63a..a25b0edea 100644 --- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/prompt_eval.py +++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/prompt_eval.py @@ -5,16 +5,12 @@ import argparse import logging import torch -import sys -import os from transformers import ( AutoModelForCausalLM, ) -sys.path.append( - os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) -from utils.model.model_utils import create_hf_model -from utils.utils import load_hf_tokenizer +from dschat.utils.model.model_utils import create_hf_model +from dschat.utils.utils import load_hf_tokenizer from deepspeed import get_accelerator logger = logging.getLogger(__name__) diff --git a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py index dae906173..265c1caf4 100644 --- a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py +++ b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py @@ -4,9 +4,7 @@ # DeepSpeed Team import argparse -import os import math -import sys import torch from torch.utils.data import DataLoader, RandomSampler, SequentialSampler @@ -21,13 +19,11 @@ from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam from deepspeed.accelerator import get_accelerator -sys.path.append( - 
os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) -from utils.model.model_utils import create_critic_model -from utils.data.data_utils import create_prompt_dataset, DataCollatorReward -from utils.utils import print_rank_0, to_device, save_hf_format, set_random_seed, get_all_reduce_mean, get_optimizer_grouped_parameters, save_zero_three_model, load_hf_tokenizer -from utils.ds_utils import get_train_ds_config -from utils.module.lora import convert_linear_layer_to_lora, convert_lora_to_linear_layer, only_optimize_lora_parameters, make_model_gradient_checkpointing_compatible +from dschat.utils.model.model_utils import create_critic_model +from dschat.utils.data.data_utils import create_prompt_dataset, DataCollatorReward +from dschat.utils.utils import print_rank_0, to_device, save_hf_format, set_random_seed, get_all_reduce_mean, get_optimizer_grouped_parameters, save_zero_three_model, load_hf_tokenizer +from dschat.utils.ds_utils import get_train_ds_config +from dschat.utils.module.lora import convert_linear_layer_to_lora, convert_lora_to_linear_layer, only_optimize_lora_parameters, make_model_gradient_checkpointing_compatible def parse_args(): diff --git a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/rw_eval.py b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/rw_eval.py index 28ee87d1c..23f9a66af 100644 --- a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/rw_eval.py +++ b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/rw_eval.py @@ -4,16 +4,10 @@ # DeepSpeed Team import argparse -import os import torch -import sys - -sys.path.append( - os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) -from utils.model.model_utils import create_critic_model -from utils.utils import to_device -from utils.utils import load_hf_tokenizer +from dschat.utils.model.model_utils import create_critic_model +from dschat.utils.utils import to_device, load_hf_tokenizer from deepspeed import get_accelerator diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py index 78f3249c0..a5be5671b 100644 --- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py +++ b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py @@ -33,18 +33,13 @@ import deepspeed -from ppo_trainer import DeepSpeedPPOTrainer, DeepSpeedPPOTrainerUnsupervised -from rlhf_engine import DeepSpeedRLHFEngine - -import sys - -sys.path.append( - os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) -from utils.data.data_utils import create_prompt_dataset, MiniDataset, DataCollatorRLHF, get_unsupervised_data -from utils.utils import print_rank_0, to_device, save_hf_format, set_random_seed, get_all_reduce_mean, \ - moving_average, save_zero_three_model, load_hf_tokenizer, ExponentialMovingAverage -from utils.module.lora import convert_lora_to_linear_layer -from utils.perf import print_throughput_step3 +from dschat.rlhf.ppo_trainer import DeepSpeedPPOTrainer, DeepSpeedPPOTrainerUnsupervised +from dschat.rlhf.rlhf_engine import DeepSpeedRLHFEngine +from dschat.utils.data.data_utils import create_prompt_dataset, MiniDataset, DataCollatorRLHF, get_unsupervised_data +from dschat.utils.utils import print_rank_0, to_device, save_hf_format, set_random_seed, get_all_reduce_mean, moving_average, save_zero_three_model, load_hf_tokenizer, \ + ExponentialMovingAverage +from dschat.utils.module.lora import 
convert_lora_to_linear_layer +from dschat.utils.perf import print_throughput_step3 from deepspeed.accelerator import get_accelerator writer = None From 4dbff684ca84de15bf5aef37ede20bb9e246a5de Mon Sep 17 00:00:00 2001 From: Lev Kurilenko <113481193+lekurile@users.noreply.github.com> Date: Mon, 6 Nov 2023 13:37:10 -0800 Subject: [PATCH 02/58] Update transformers dependency in deepspeed-chat install (#802) --- applications/DeepSpeed-Chat/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/DeepSpeed-Chat/setup.py b/applications/DeepSpeed-Chat/setup.py index 343f5b476..01a1ed83f 100644 --- a/applications/DeepSpeed-Chat/setup.py +++ b/applications/DeepSpeed-Chat/setup.py @@ -22,7 +22,7 @@ install_requires=[ "datasets>=2.8.0", "sentencepiece>=0.1.97", "protobuf==3.20.3", "accelerate>=0.15.0", "torch>=1.12.0", "deepspeed>=0.9.2", - "transformers", "tensorboard" + "transformers>=4.31.0,!=4.33.2", "tensorboard" ], extras_require={ "azureml": [ From ff0e25485d6fde856d890f5d860bf65edbe40424 Mon Sep 17 00:00:00 2001 From: zhou fan <1247714429@qq.com> Date: Wed, 8 Nov 2023 00:11:19 +0800 Subject: [PATCH 03/58] fix: using DistributedSampler when evaluating the reward model (#804) Co-authored-by: yi.luo --- .../training/step2_reward_model_finetuning/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py index 265c1caf4..04f178504 100644 --- a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py +++ b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py @@ -291,7 +291,6 @@ def main(): collate_fn=data_collator, sampler=train_sampler, batch_size=args.per_device_train_batch_size) - eval_sampler = SequentialSampler(eval_dataset) eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, sampler=eval_sampler, From 089baad52f0051853946b1722cb9af8e10cc65f3 Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> Date: Wed, 8 Nov 2023 08:56:52 -0800 Subject: [PATCH 04/58] Add benchmark scripts for DeepSpeed-FastGen (#805) * Merge MII benchmark (#182) Add MII Benchmark --------- Co-authored-by: Ammar Ahmad Awan Co-authored-by: Michael Wyatt * removed deprecated files * update README for DS-FastGen * add renamed file --- benchmarks/inference/mii/README.md | 32 ++ .../mii/plot_effective_throughput.py | 156 +++++++++ .../inference/mii/plot_latency_percentile.py | 110 +++++++ benchmarks/inference/mii/plot_repl_scale.py | 95 ++++++ benchmarks/inference/mii/plot_th_lat.py | 98 ++++++ benchmarks/inference/mii/plot_tp_sizes.py | 98 ++++++ .../inference/mii/postprocess_results.py | 112 +++++++ .../inference/mii/random_query_generator.py | 30 ++ benchmarks/inference/mii/run_all.sh | 25 ++ benchmarks/inference/mii/run_all_replica.sh | 25 ++ benchmarks/inference/mii/run_all_vllm.sh | 26 ++ .../inference/mii/run_benchmark_client.py | 304 ++++++++++++++++++ .../inference/mii/run_benchmark_client.sh | 23 ++ benchmarks/inference/mii/sample_input.py | 221 +++++++++++++ benchmarks/inference/mii/server.py | 83 +++++ 15 files changed, 1438 insertions(+) create mode 100644 benchmarks/inference/mii/README.md create mode 100644 benchmarks/inference/mii/plot_effective_throughput.py create mode 100644 benchmarks/inference/mii/plot_latency_percentile.py create mode 100644 benchmarks/inference/mii/plot_repl_scale.py create mode 100644 
benchmarks/inference/mii/plot_th_lat.py create mode 100644 benchmarks/inference/mii/plot_tp_sizes.py create mode 100644 benchmarks/inference/mii/postprocess_results.py create mode 100644 benchmarks/inference/mii/random_query_generator.py create mode 100644 benchmarks/inference/mii/run_all.sh create mode 100644 benchmarks/inference/mii/run_all_replica.sh create mode 100644 benchmarks/inference/mii/run_all_vllm.sh create mode 100644 benchmarks/inference/mii/run_benchmark_client.py create mode 100644 benchmarks/inference/mii/run_benchmark_client.sh create mode 100644 benchmarks/inference/mii/sample_input.py create mode 100644 benchmarks/inference/mii/server.py diff --git a/benchmarks/inference/mii/README.md b/benchmarks/inference/mii/README.md new file mode 100644 index 000000000..f9a825daa --- /dev/null +++ b/benchmarks/inference/mii/README.md @@ -0,0 +1,32 @@ +# Benchmarking Scripts for DeepSpeed-FastGen + +## Run the Benchmark + +The benchmarking scripts use DeepSpeed-FastGen in the persistent mode. +You can start the server with the command below: + +```bash +python server.py [options] start +``` + +Use the -h option to view all available options. To stop the server, use this command: + +```bash +python server.py stop +``` + +Once the server is up and running, initiate the client using the command below. The -h option will display all the possible options. + +```bash +python run_benchmark_client.py [options] +``` + +The run_all.sh script performs benchmarks across various model sizes and client numbers. For VLLM benchmarks, use the run_all_vllm.sh script. Results are logged in a directory named logs.[BENCHMARK_PARAMETERS]. + +## Analyze the Benchmark Results + +The scripts mentioned below were used for generating the plots featured in our blog. Specify the root directory for log files using --log_dir. + +- `plot_th_lat.py`: This script generates charts for throughput and latency across different model sizes and client counts. +- `plot_effective_throughput.py`: Use this to chart effective throughput. +- `plot_latency_percentile.py`: This script will plot the 50th, 90th, and 95th percentile latencies. 
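
Beyond the bundled plotting scripts, individual result files can be summarized programmatically with the helpers in `postprocess_results.py` (added later in this patch). The snippet below is a minimal sketch; the log path is illustrative and should point at a JSON file produced by your own benchmark run.

```python
# Hypothetical result path; substitute a file from your own logs.<...> directory.
from postprocess_results import read_json, get_summary

result_file = "logs.llama2-7b-tp1-b768/llama2-7b-tp1-b768_c16_p1200_g60.json"
prof_args, response_details = read_json(result_file)
summary = get_summary(prof_args, response_details)

print(f"clients={prof_args['client_num']}, "
      f"throughput={summary.throughput:.3f} queries/s, "
      f"avg latency={summary.latency:.3f} s, "
      f"first token={summary.first_token_latency:.3f} s")
```

The same summary is printed by invoking the script directly, e.g. `python postprocess_results.py -i <result_file>`.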
diff --git a/benchmarks/inference/mii/plot_effective_throughput.py b/benchmarks/inference/mii/plot_effective_throughput.py new file mode 100644 index 000000000..357fc7f9e --- /dev/null +++ b/benchmarks/inference/mii/plot_effective_throughput.py @@ -0,0 +1,156 @@ +import argparse +from pathlib import Path +import glob +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + +from postprocess_results import read_json, get_tokenizer + +RAGGED_BATCH_SIZE = 768 +SLA_PROMPT_TOKENS_PER_SEC = 512 +SLA_GEN_TOKENS_PER_SEC = [1, 2, 3, 4, 6, 8] +EMA_SPAN = 16 + +tp_sizes = { + "7b": [1], + "70b": [4, 8], +} + +prompt_gen_pairs = [ + (1200, 60), + (1200, 128), + (2600, 60), + (2600, 128), +] + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--log_dir", type=Path, default=".") + parser.add_argument("--out_dir", type=Path, default="charts/goodtput") + args = parser.parse_args() + return args + + +def check_token_latency_step(response_details, token_index): + P50_token_latency = np.percentile([r.token_gen_time[token_index] for r in response_details if len(r.token_gen_time) > token_index], 50) + P90_token_latency = np.percentile([r.token_gen_time[token_index] for r in response_details if len(r.token_gen_time) > token_index], 90) + P99_token_latency = np.percentile([r.token_gen_time[token_index] for r in response_details if len(r.token_gen_time) > token_index], 99) + + return P50_token_latency, P90_token_latency, P99_token_latency + + +def validate_token_cum_latency_SLA(response_detail, sla_token_gen): + cumsum_latencies = np.cumsum(np.array(response_detail.token_gen_time[1:])) + return all([cumsum_latencies[i] <= (1 / sla_token_gen) * (i + 1) for i in range(len(cumsum_latencies))]) + + +def validate_token_ema_latency_SLA(response_detail, sla_token_gen, ema_span): + ema_latency = pd.Series(response_detail.token_gen_time[1:]).ewm(span=ema_span).mean().values.tolist() + return all([t < 1. 
/ sla_token_gen for t in ema_latency]) + + +def validate_prompt_latency_SLA(response_detail, sla_token_gen, f): + tokenizer = get_tokenizer() + prompt_length = len(tokenizer.tokenize(response_detail.prompt)) + prompt_latency_SLA = prompt_length / SLA_PROMPT_TOKENS_PER_SEC + if prompt_latency_SLA < response_detail.token_gen_time[0]: + return False + + if len(response_detail.token_gen_time) == 1: + return True + + return f[0](response_detail, sla_token_gen, *f[1]) + + +def calc_throughput(response_details): + start_time = min([r.start_time for r in response_details]) + end_time = max([r.end_time for r in response_details]) + return len(response_details) / (end_time - start_time) + + +def extract_values(file_pattern, sla_token_gen, validate_func): + files = glob.glob(file_pattern) + print(f"Found {len(files)} files") + goodputs = {} + good_ratios = {} + for f in files: + prof_args, response_details = read_json(f) + client_num = prof_args["client_num"] + num_req_ok = len([r for r in response_details if validate_prompt_latency_SLA(r, sla_token_gen, validate_func)]) + goodputs[client_num] = calc_throughput(response_details) * (num_req_ok / len(response_details)) + good_ratios[client_num] = num_req_ok / len(response_details) + + return goodputs, good_ratios + + +def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out_dir): + if not log_dir.exists(): + print(f"Log directory {log_dir} does not exist") + return + + if not out_dir.exists(): + out_dir.mkdir(parents=True, exist_ok=True) + + print(f"model: {model_size} Prompt: {prompt}, Generation: {gen}, TP: {tp} sla_token_gen: {sla_token_gen}") + + mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}/llama2-{model_size}-tp{tp}-b{bs}_c*_p{prompt}_g{gen}.json" + vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json" + + validate_funcs = [ + (validate_token_cum_latency_SLA, (), "cum"), + (validate_token_ema_latency_SLA, (EMA_SPAN, ), f"ema{EMA_SPAN}"), + ] + + for f in validate_funcs: + + mii_goodputs, mii_good_ratios = extract_values(mii_file_pattern, sla_token_gen, f) + client_num_list = sorted(list(mii_goodputs.keys())) + mii_goodputs_list = [mii_goodputs[client_num] for client_num in client_num_list] + + vllm_goodputs, vllm_good_ratios = extract_values(vllm_file_pattern, sla_token_gen, f) + vllm_goodputs_list = [vllm_goodputs[client_num] for client_num in client_num_list] + + # print(f"MII {mii_goodputs_list} ratio={mii_good_ratios}") + # print(f"vLLM {vllm_goodputs_list} ratio={vllm_good_ratios}") + + # Plotting the scatter plot + plt.figure(figsize=(7, 4)) + plt.scatter(client_num_list, mii_goodputs_list, label=f"DeepSpeed-FastGen", marker="o", color="blue") + plt.scatter(client_num_list, vllm_goodputs_list, label=f"vLLM", marker="x", color="orange") + + fit_x_list = np.arange(min(client_num_list), max(client_num_list), 0.1) + mii_fit_model = np.polyfit(client_num_list, mii_goodputs_list, 4) + mii_model_fn = np.poly1d(mii_fit_model) + plt.plot(fit_x_list, mii_model_fn(fit_x_list), color="blue", alpha=0.5, linestyle="--") + + vllm_fit_model = np.polyfit(client_num_list, vllm_goodputs_list, 4) + vllm_model_fn = np.poly1d(vllm_fit_model) + plt.plot(fit_x_list, vllm_model_fn(fit_x_list), color="orange", alpha=0.5, linestyle="--") + + title = f"Effective throughput (SLA prompt: {SLA_PROMPT_TOKENS_PER_SEC} tokens/s, generation: {sla_token_gen} tokens/s)\n" \ + + f'Llama 2 {model_size.upper()} Prompt: {prompt}, Generation: {gen}, TP: {tp}' + 
plt.title(title, fontsize=10) + plt.xlabel('Number of clients', fontsize=10) + plt.ylabel('Effective throughput (queries/s)', fontsize=10) + # plt.rcParams['figure.subplot.bottom'] = 0.30 + plt.ylim(bottom=-0.05) + plt.legend() + plt.grid(True) + # plt.show() + out_file = out_dir / f"goodput_llama{model_size}_SLAp{SLA_PROMPT_TOKENS_PER_SEC}g{sla_token_gen}_tp{tp}_b{bs}_p{prompt}g{gen}_{f[2]}.png" + plt.savefig(out_file) + plt.clf() + print(f"Saved {out_file}") + + +if __name__ == "__main__": + args = get_args() + + for model_size, tps in tp_sizes.items(): + for tp in tps: + for prompt, gen in prompt_gen_pairs: + for sla_token_gen in SLA_GEN_TOKENS_PER_SEC: + display_results(model_size, tp, RAGGED_BATCH_SIZE, sla_token_gen, prompt, gen, args.log_dir, args.out_dir) + diff --git a/benchmarks/inference/mii/plot_latency_percentile.py b/benchmarks/inference/mii/plot_latency_percentile.py new file mode 100644 index 000000000..c91c78bf1 --- /dev/null +++ b/benchmarks/inference/mii/plot_latency_percentile.py @@ -0,0 +1,110 @@ +import argparse +import glob +from pathlib import Path +import matplotlib.pyplot as plt +import numpy as np +import itertools + +from postprocess_results import read_json, get_token_latency + +bs = 768 +SKIP_HEAD_TOKEN_NUM = 2 +SKIP_REQUEST_NUM = 100 + +tp_sizes = { + "70b": [4], +} + +prompt_gen_pairs = [ + (2600, 128), +] + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--log_dir", type=Path, default=".") + parser.add_argument("--out_dir", type=Path, default="charts/percentile_token_latency") + args = parser.parse_args() + return args + + +def extract_values(file_pattern): + files = glob.glob(file_pattern) + + latencies = {} + for f in files: + prof_args, response_details = read_json(f) + client_num = prof_args["client_num"] + + response_details.sort(key=lambda r: r.start_time) + response_details = response_details[SKIP_REQUEST_NUM:-SKIP_REQUEST_NUM] + token_latencies = [r.token_gen_time[SKIP_HEAD_TOKEN_NUM:-1] for r in response_details] + + flat_latency_list = list(itertools.chain(*token_latencies)) + latencies[client_num] = flat_latency_list + return latencies + + +def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): + if not log_dir.exists(): + print(f"Log directory {log_dir} does not exist") + return + + if not out_dir.exists(): + out_dir.mkdir(parents=True, exist_ok=True) + + mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}/llama2-{model_size}-tp{tp}-b{bs}_c*_p{prompt}_g{gen}.json" + vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json" + + mii_latencies = extract_values(mii_file_pattern) + vllm_latencies = extract_values(vllm_file_pattern) + client_num_list = sorted(list(mii_latencies.keys())) + + for client_num in client_num_list: + plt.figure(figsize=(6, 4)) + + percentile = 95 + + P50_vllm_val = np.percentile(vllm_latencies[client_num], 50) + P50_mii_val = np.percentile(mii_latencies[client_num], 50) + P90_vllm_val = np.percentile(vllm_latencies[client_num], 90) + P90_mii_val = np.percentile(mii_latencies[client_num], 90) + P95_vllm_val = np.percentile(vllm_latencies[client_num], 95) + P95_mii_val = np.percentile(mii_latencies[client_num], 95) + + # print(f"P50_vllm_val={P50_vllm_val}") + # print(f"P50_mii_val={P50_mii_val}") + # print(f"P90_vllm_val={P90_vllm_val}") + # print(f"P90_mii_val={P90_mii_val}") + # print(f"P95_vllm_val={P95_vllm_val}") + # print(f"P95_mii_val={P95_mii_val}") + + out_file = out_dir / 
f"p{percentile}_token_latency_llama{model_size}_c{client_num}_tp{tp}_p{prompt}g{gen}.png" + + x1 = [1, 2, 3] + y1 = [P50_vllm_val, P90_vllm_val, P95_vllm_val] + + x2 = [1.3, 2.3, 3.3] + y2 = [P50_mii_val, P90_mii_val, P95_mii_val] + + label_x = ['P50', 'P90', 'P95'] + + plt.bar(x1, y1, width=0.3, label='vLLM', align="center", color="orange") + plt.bar(x2, y2, width=0.3, label="DeepSpeed-FastGen", align="center", color="blue") + plt.ylabel('Latency', fontsize=14) + plt.legend(loc=2) + + plt.xticks([1.15, 2.15, 3.15], label_x) + + plt.savefig(out_file) + print(f"Saved {out_file}") + + +if __name__ == "__main__": + args = get_args() + + for model_size, tps in tp_sizes.items(): + for tp in tps: + for prompt, gen in prompt_gen_pairs: + output_charts(model_size, tp, bs, prompt, gen, args.log_dir, args.out_dir) + diff --git a/benchmarks/inference/mii/plot_repl_scale.py b/benchmarks/inference/mii/plot_repl_scale.py new file mode 100644 index 000000000..394c54588 --- /dev/null +++ b/benchmarks/inference/mii/plot_repl_scale.py @@ -0,0 +1,95 @@ +import glob +import matplotlib.pyplot as plt +import argparse +from pathlib import Path +import numpy as np + +from postprocess_results import read_json, get_summary + +bs = 768 + +REPLICA_NUMS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + +tp_sizes = { + "70b": [4], +} + +prompt_gen_pairs = [ + (2600, 60), +] + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--log_dir", type=Path, default=".") + parser.add_argument("--out_dir", type=Path, default="charts/repl_scale") + args = parser.parse_args() + return args + + +def extract_values(file_pattern): + files = glob.glob(file_pattern) + + clients = [] + throughputs = [] + latencies = [] + for f in files: + prof_args, response_details = read_json(f) + summary = get_summary(prof_args, response_details) + clients.append(prof_args["client_num"]) + throughputs.append(summary.throughput) + latencies.append(summary.latency) + + return clients, throughputs, latencies + + +def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): + if not log_dir.exists(): + print(f"Log directory {log_dir} does not exist") + return + + if not out_dir.exists(): + out_dir.mkdir(parents=True, exist_ok=True) + + throughputs = {} + for repl in REPLICA_NUMS: + mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}_repl{repl}/llama2-{model_size}-tp{tp}-b{bs}_repl{repl}_c*_p{prompt}_g{gen}.json" + print(f"Looking for {mii_file_pattern}") + clients, mii_throughputs, mii_latencies = extract_values(mii_file_pattern) + + for c, th in zip(clients, mii_throughputs): + client_per_repl = c // repl + if client_per_repl not in throughputs: + throughputs[client_per_repl] = [] + print(f"Throughput for {client_per_repl} clients: {th}") + throughputs[client_per_repl].append(th) + + for c in throughputs: + + # Plotting the scatter plot + plt.figure(figsize=(6, 4)) + + plt.bar(REPLICA_NUMS, throughputs[c], color="blue", alpha=0.9) + + fit_x_list = np.arange(min(REPLICA_NUMS), max(REPLICA_NUMS), 0.1) + mii_fit_model = np.polyfit(REPLICA_NUMS, throughputs[c], 1) + mii_model_fn = np.poly1d(mii_fit_model) + plt.plot(fit_x_list, mii_model_fn(fit_x_list), color="blue", linestyle="--") + + plt.title(f'Model Llama 2 {model_size.upper()}, Prompt: {prompt}, Generation: {gen}, TP: {tp}') + plt.xlabel('Number of replicas', fontsize=14) + plt.ylabel('Throughput (queries/s)', fontsize=14) + plt.grid(True) + plt.tight_layout() + # plt.show() + out_file = out_dir / 
f"repl_scale_llama{model_size}_tp{tp}_p{prompt}g{gen}.png" + plt.savefig(out_file) + + +if __name__ == "__main__": + args = get_args() + + for model_size, tps in tp_sizes.items(): + for tp in tps: + for prompt, gen in prompt_gen_pairs: + output_charts(model_size, tp, bs, prompt, gen, args.log_dir, args.out_dir) + diff --git a/benchmarks/inference/mii/plot_th_lat.py b/benchmarks/inference/mii/plot_th_lat.py new file mode 100644 index 000000000..8ede6e818 --- /dev/null +++ b/benchmarks/inference/mii/plot_th_lat.py @@ -0,0 +1,98 @@ +import glob +import matplotlib.pyplot as plt +import argparse +from pathlib import Path +import numpy as np + +from postprocess_results import read_json, get_summary + +bs = 768 + +tp_sizes = { + "7b": [1], + "70b": [4, 8], +} + +prompt_gen_pairs = [ + (1200, 60), + (1200, 128), + (2600, 60), + (2600, 128), +] + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--log_dir", type=Path, default="logs.release") + parser.add_argument("--out_dir", type=Path, default="charts/throughput_latency") + args = parser.parse_args() + return args + + +def extract_values(file_pattern): + files = glob.glob(file_pattern) + + print(f"Found {len(files)}") + print('\n'.join(files)) + + clients = [] + throughputs = [] + latencies = [] + for f in files: + prof_args, response_details = read_json(f) + summary = get_summary(prof_args, response_details) + clients.append(prof_args["client_num"]) + throughputs.append(summary.throughput) + latencies.append(summary.latency) + + return clients, throughputs, latencies + + +def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): + if not log_dir.exists(): + print(f"Log directory {log_dir} does not exist") + return + + if not out_dir.exists(): + out_dir.mkdir(parents=True, exist_ok=True) + + mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}/llama2-{model_size}-tp{tp}-b{bs}_c*_p{prompt}_g{gen}.json" + vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json" + + _, mii_throughputs, mii_latencies = extract_values(mii_file_pattern) + _, vllm_throughputs, vllm_latencies = extract_values(vllm_file_pattern) + + # Plotting the scatter plot + plt.figure(figsize=(6, 4)) + + plt.scatter(vllm_throughputs, vllm_latencies, label=f"vLLM", marker="x", color="orange") + fit_vllm_x_list = np.arange(min(vllm_throughputs), max(vllm_throughputs), 0.01) + vllm_vllm_model = np.polyfit(vllm_throughputs, vllm_latencies, 3) + vllm_model_fn = np.poly1d(vllm_vllm_model) + plt.plot(fit_vllm_x_list, vllm_model_fn(fit_vllm_x_list), color="orange", alpha=0.5, linestyle="--") + + plt.scatter(mii_throughputs, mii_latencies, label=f"DeepSpeed FastGen", marker="o", color="blue") + fit_mii_x_list = np.arange(min(mii_throughputs), max(mii_throughputs), 0.01) + mii_fit_model = np.polyfit(mii_throughputs, mii_latencies, 3) + mii_model_fn = np.poly1d(mii_fit_model) + plt.plot(fit_mii_x_list, mii_model_fn(fit_mii_x_list), color="blue", alpha=0.5, linestyle="--") + + plt.title(f'Model Llama 2 {model_size.upper()}, Prompt: {prompt}, Generation: {gen}, TP: {tp}') + plt.xlabel('Throughput (queries/s)', fontsize=14) + plt.ylabel('Latency', fontsize=14) + plt.legend() + plt.grid(True) + plt.tight_layout() + # plt.show() + out_file = out_dir / f"th_lat_curve_llama{model_size}_tp{tp}_p{prompt}g{gen}.png" + print(f"Saving {out_file}") + plt.savefig(out_file) + + +if __name__ == "__main__": + args = get_args() + + for model_size, tps in tp_sizes.items(): + for tp in tps: + 
for prompt, gen in prompt_gen_pairs: + output_charts(model_size, tp, bs, prompt, gen, args.log_dir, args.out_dir) + diff --git a/benchmarks/inference/mii/plot_tp_sizes.py b/benchmarks/inference/mii/plot_tp_sizes.py new file mode 100644 index 000000000..546310258 --- /dev/null +++ b/benchmarks/inference/mii/plot_tp_sizes.py @@ -0,0 +1,98 @@ +import glob +import matplotlib.pyplot as plt +import argparse +from pathlib import Path +import numpy as np + +from postprocess_results import read_json, get_summary + +bs = 768 + +tp_sizes = { + # "7b": [1], + "13b": [1, 2, 4], + # "70b": [4, 8], +} + +prompt_gen_pairs = [ + (1200, 60), + (1200, 128), + (2600, 60), + (2600, 128), + (2600, 256), +] + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--log_dir", type=Path, default="logs.release") + parser.add_argument("--out_dir", type=Path, default="charts/tp_sizes") + args = parser.parse_args() + return args + + +def extract_values(file_pattern): + files = glob.glob(file_pattern) + + print(f"Found {len(files)}") + print('\n'.join(files)) + + clients = [] + throughputs = [] + latencies = [] + for f in files: + prof_args, response_details = read_json(f) + summary = get_summary(prof_args, response_details) + clients.append(prof_args["client_num"]) + throughputs.append(summary.throughput) + latencies.append(summary.latency) + + return clients, throughputs, latencies + + +def output_charts(model_size, tps, bs, prompt, gen, log_dir, out_dir): + if not log_dir.exists(): + print(f"Log directory {log_dir} does not exist") + return + + if not out_dir.exists(): + out_dir.mkdir(parents=True, exist_ok=True) + + # Plotting the scatter plot + plt.figure(figsize=(6, 4)) + + colors = ["orange", "green", "brown"] + + for tp, color in zip(tps, colors): + mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}/llama2-{model_size}-tp{tp}-b{bs}_c*_p{prompt}_g{gen}.json" + _, mii_throughputs, mii_latencies = extract_values(mii_file_pattern) + + if len(mii_throughputs) == 0: + continue + + n_params = int(model_size[:-1]) + tflops_per_query = n_params * (prompt + gen) * 2 * 1e-3 + mii_tflops = [th * tflops_per_query / tp for th in mii_throughputs] + + plt.scatter(mii_tflops, mii_latencies, label=f"TP={tp}", marker="o", color=color) + fit_mii_x_list = np.arange(min(mii_tflops), max(mii_tflops), 0.01) + mii_fit_model = np.polyfit(mii_tflops, mii_latencies, 3) + mii_model_fn = np.poly1d(mii_fit_model) + plt.plot(fit_mii_x_list, mii_model_fn(fit_mii_x_list), color=color, alpha=0.5, linestyle="--") + + plt.title(f'Model Llama 2 {model_size.upper()}, Prompt: {prompt}, Generation: {gen}, TP: {tps}') + plt.xlabel('TFLOPs (per GPU)', fontsize=14) + plt.ylabel('Latency', fontsize=14) + plt.legend() + plt.grid(True) + # plt.show() + out_file = out_dir / f"tp_sizes_llama{model_size}_tp{'_'.join([str(tp) for tp in tps])}_p{prompt}g{gen}.png" + plt.savefig(out_file) + + +if __name__ == "__main__": + args = get_args() + + for model_size, tps in tp_sizes.items(): + for prompt, gen in prompt_gen_pairs: + output_charts(model_size, tps, bs, prompt, gen, args.log_dir, args.out_dir) + diff --git a/benchmarks/inference/mii/postprocess_results.py b/benchmarks/inference/mii/postprocess_results.py new file mode 100644 index 000000000..cb2000d5f --- /dev/null +++ b/benchmarks/inference/mii/postprocess_results.py @@ -0,0 +1,112 @@ +import argparse +from pathlib import Path +import json +import numpy as np +from statistics import mean +from functools import reduce +from dataclasses import dataclass +from typing 
import List + +from transformers import AutoTokenizer + + +tokenizer = None + + +@dataclass +class ResponseDetails: + generated_tokens: List[str] + prompt: str + start_time: float + end_time: float + model_time: float + token_gen_time: List[float] + + +@dataclass +class ProfilingSummary: + throughput: float + latency: float + token_gen_latency: float + first_token_latency: float + tokens_per_sec: float + + +def parse_args(): + parser = argparse.ArgumentParser(description="Postprocess results") + parser.add_argument('-i', '--input_path', type=Path, default="results.json") + + args = parser.parse_args() + return args + + +def get_tokenizer(): + global tokenizer + if tokenizer is None: + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") + return tokenizer + + +def read_json(file_path): + with open(file_path, 'r') as f: + data = json.load(f) + + args = data["args"] + + response_details = [] + for response in data["response_details"]: + response_details.append(ResponseDetails(**response)) + + return args, response_details + + +def get_summary(args, response_details): + client_num = args["client_num"] + + # Calculate latency and throughput using P95 latency + latency = mean([r.end_time - r.start_time for r in response_details]) + throughput = client_num / latency + + tokens_per_sec = mean([(len(get_tokenizer().tokenize(r.prompt)) + len(r.generated_tokens)) / (r.end_time - r.start_time) for r in response_details]) + first_token_latency = mean([r.token_gen_time[0] for r in response_details]) + + token_gen_latency_flat = reduce(list.__add__, [r.token_gen_time[1:-1] for r in response_details if len(r.token_gen_time) > 2]) + token_gen_latency = mean([t for t in token_gen_latency_flat]) + + return ProfilingSummary(throughput, latency, token_gen_latency, first_token_latency, tokens_per_sec) + + +def get_token_latency(response_details, percentile=None, variance=False, cumulative=False): + req_latencies = [r.token_gen_time for r in response_details] + if cumulative: + req_latencies = [np.cumsum(np.array(r.token_gen_time)).tolist() for r in response_details] + max_gen_length = max([len(r.generated_tokens) for r in response_details]) + latency = [] + for i in range(max_gen_length): + if variance: + token_latency_step = np.var([latency[i] for latency in req_latencies if len(latency) > i]) + if percentile is None: + token_latency_step = [latency[i] for latency in req_latencies if len(latency) > i] + else: + token_latency_step = np.percentile([latency[i] for latency in req_latencies if len(latency) > i], percentile) + + latency.append(token_latency_step) + + return latency + + +def get_token_acc_latency(response_details, percentile=99): + return get_token_latency(response_details, percentile, cumulative=True) + + +if __name__ == "__main__": + args = parse_args() + prof_args, response_details = read_json(args.input_path) + + ps = get_summary(prof_args, response_details) + print(f"Deployment: {prof_args['deployment_name']} Clients: {prof_args['client_num']}, " + + f"Query throughput: {ps.throughput:.3f} queries/s, " + + f"Token throughput (total): {ps.tokens_per_sec:.3f} tokens/s, " + + f"Query latency: {ps.latency:.3f} s, " + + f"Token generation latency: {ps.token_gen_latency:.3f} s/token, " + + f"First token received: {ps.first_token_latency:.3f} s") diff --git a/benchmarks/inference/mii/random_query_generator.py b/benchmarks/inference/mii/random_query_generator.py new file mode 100644 index 000000000..b8442af4f --- /dev/null +++ b/benchmarks/inference/mii/random_query_generator.py @@ 
-0,0 +1,30 @@ +import torch +import random +import numpy as np +import time + +class RandomQueryGenerator: + def __init__(self, input_text, tokenizer, seed): + self.input_text = input_text + self.tokenizer = tokenizer + + torch.manual_seed(seed) + random.seed(seed) + np.random.seed(seed) + + def get_random_request_text(self, length, variance, max_length, batch): + request_text = [] + tokenized_input = self.tokenizer.batch_encode_plus([self.input_text], + return_tensors="pt", + padding=False) + offset = list(range(512)) + random.shuffle(offset) + + text_ids = tokenized_input["input_ids"][0] + for i in range(batch): + # Set max_new_tokens following normal distribution with mean=max_new_tokens and std=0.3*max_new_tokens + req_prompt_length = min(int(np.random.normal(length, variance)), max_length) + + text = self.tokenizer.decode(text_ids[i:req_prompt_length+i]) + request_text.append(text) + return request_text diff --git a/benchmarks/inference/mii/run_all.sh b/benchmarks/inference/mii/run_all.sh new file mode 100644 index 000000000..ca504a6c9 --- /dev/null +++ b/benchmarks/inference/mii/run_all.sh @@ -0,0 +1,25 @@ +RAGGED_BATCH_SIZE=768 +PARAM_SIZES=(7b 13b 70b) + +declare -A TP_SIZES +TP_SIZES["7b"]="1" +TP_SIZES["13b"]="1:2:4" +TP_SIZES["70b"]="4:8" + +for PARAM_SIZE in ${PARAM_SIZES[@]}; do + + IFS=':' read -ra TP_VALUES <<< ${TP_SIZES[${PARAM_SIZE}]} + for TP in ${TP_VALUES[@]}; do + DEPLOYMENT_NAME=llama2-${PARAM_SIZE}-tp${TP}-b${RAGGED_BATCH_SIZE} + python server.py --model_name meta-llama/Llama-2-${PARAM_SIZE}-hf -d ${DEPLOYMENT_NAME} -m ${TP} -b ${RAGGED_BATCH_SIZE} start + + DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=60 bash ./run_benchmark_client.sh + DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=128 bash ./run_benchmark_client.sh + DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=1200 MAX_NEW_TOKENS=60 bash ./run_benchmark_client.sh + DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=1200 MAX_NEW_TOKENS=128 bash ./run_benchmark_client.sh + + echo "Stopping server" + python server.py -d ${DEPLOYMENT_NAME} stop + sleep 120 + done +done diff --git a/benchmarks/inference/mii/run_all_replica.sh b/benchmarks/inference/mii/run_all_replica.sh new file mode 100644 index 000000000..b3fba0408 --- /dev/null +++ b/benchmarks/inference/mii/run_all_replica.sh @@ -0,0 +1,25 @@ +RAGGED_BATCH_SIZE=768 +PARAM_SIZES=(7b) +REPLICA_NUMS=(1) + +declare -A TP_SIZES +TP_SIZES["7b"]="4" +TP_SIZES["13b"]="1" +TP_SIZES["70b"]="4" + +for PARAM_SIZE in ${PARAM_SIZES[@]}; do + IFS=':' read -ra TP_VALUES <<< ${TP_SIZES[${PARAM_SIZE}]} + for TP in ${TP_VALUES[@]}; do + for REPL in ${REPLICA_NUMS[@]}; do + DEPLOYMENT_NAME=llama2-${PARAM_SIZE}-tp${TP}-b${RAGGED_BATCH_SIZE}_repl${REPL} + python server.py --model_name meta-llama/Llama-2-${PARAM_SIZE}-hf -d ${DEPLOYMENT_NAME} -m ${TP} -r ${REPL} -b ${RAGGED_BATCH_SIZE} start + + REQUEST_NUM=$((256 * ${REPL})) + DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=60 CLIENT_NUMS=$((16 * ${REPL})) REQUEST_NUM=$((256 * ${REPL})) bash ./run_bench_client_num.sh + + echo "Stopping server" + python server.py -d ${DEPLOYMENT_NAME} stop + sleep 120 + done + done +done diff --git a/benchmarks/inference/mii/run_all_vllm.sh b/benchmarks/inference/mii/run_all_vllm.sh new file mode 100644 index 000000000..572377f13 --- /dev/null +++ b/benchmarks/inference/mii/run_all_vllm.sh @@ -0,0 +1,26 @@ +RAGGED_BATCH_SIZE=768 +PARAM_SIZES=(7b 13b 70b) + +declare -A TP_SIZES +TP_SIZES["7b"]="1" +TP_SIZES["13b"]="1:2:4" 
+TP_SIZES["70b"]="4:8" + +for PARAM_SIZE in ${PARAM_SIZES[@]}; do + + IFS=':' read -ra TP_VALUES <<< ${TP_SIZES[${PARAM_SIZE}]} + for TP in ${TP_VALUES[@]}; do + DEPLOYMENT_NAME=vllm-llama2-${PARAM_SIZE}-tp${TP} + python -m vllm.entrypoints.api_server --host 127.0.0.1 --port 26500 --tensor-parallel-size ${TP} --model meta-llama/Llama-2-${PARAM_SIZE}-hf & + sleep 60 + + DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=60 VLLM="--vllm" bash ./run_benchmark_client.sh + DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=128 VLLM="--vllm" bash ./run_benchmark_client.sh + DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=1200 MAX_NEW_TOKENS=60 VLLM="--vllm" bash ./run_benchmark_client.sh + DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=1200 MAX_NEW_TOKENS=128 VLLM="--vllm" bash ./run_benchmark_client.sh + + echo "Stopping server" + pkill -u ${USER} -f vllm.entrypoints.api_server + sleep 30 + done +done diff --git a/benchmarks/inference/mii/run_benchmark_client.py b/benchmarks/inference/mii/run_benchmark_client.py new file mode 100644 index 000000000..77377a93a --- /dev/null +++ b/benchmarks/inference/mii/run_benchmark_client.py @@ -0,0 +1,304 @@ +import os +import time +import random +import argparse +import queue +import multiprocessing +import threading +from statistics import mean +from dataclasses import dataclass, asdict +from typing import List, Iterable +from pathlib import Path +from datetime import datetime +import numpy as np + +from transformers import AutoTokenizer +from random_query_generator import RandomQueryGenerator +from sample_input import all_text +import time +import json +import asyncio +import requests + +from postprocess_results import get_summary, ResponseDetails + +MAX_PROMPT_LENGTH = 4000 +PROMPT_LENGTH_VAR = 0.3 +MAX_NEW_TOKENS_VAR = 0.3 + +def parse_args(): + parser = argparse.ArgumentParser(description="Benchmark MII services") + parser.add_argument("-k", + "--max_new_tokens", + type=int, + default=60, + help="min and max num tokens argument for huggingface") + parser.add_argument("-d", + "--deployment_name", + type=str, + default="benchmark_deployment") + parser.add_argument("-n", + "--num_queries", + type=int, + help="number of queries to run", + default=10) + parser.add_argument("-w", + "--warmup", + type=int, + help="number of queries for warming up", + default=1) + parser.add_argument("-c", + "--client_num", + type=int, + help="number of parallel client processes", + default=2) + parser.add_argument("-l", + "--prompt_length", + type=int, + default=2600) + parser.add_argument('--use_thread', action='store_true', + help='use thread to run parallel clients, otherwise use multiprocessing', + default=False) + parser.add_argument('--stream', action='store_true', default=True) + parser.add_argument('--vllm', action='store_true', default=False) + parser.add_argument('-o', '--out_json_path', type=Path, default=None) + + args = parser.parse_args() + return args + + +def call_mii(client, input_tokens, max_new_tokens, stream): + output_tokens = [] + token_gen_time = [] + time_last_token = 0 + + def callback(response): + nonlocal time_last_token + # print(f"Received: {response.response} time_last_token={time_last_token}") + output_tokens.append(response.response[0]) + time_now = time.time() + token_gen_time.append(time_now - time_last_token) + time_last_token = time_now + + postprocess_config = { + "logit_processor": { + # "name": "TopP", + # "args": { + # "top_p": 0.9 + # } + "name": "Temperature", + "args": { + "temperature": 0.9 + } 
+ }, + "sampler": { + "name": "Logits" + }, + "stop_criterion": { + "name": "EosGeneration" + } + } + + time_last_token = start_time = time.time() + token_gen_time = [] + if stream: + output_tokens = [] + client.generate( + input_tokens, max_new_tokens=max_new_tokens, postprocess_config=postprocess_config, + streaming_fn=callback) + else: + result = client.generate( + input_tokens, max_new_tokens=max_new_tokens, postprocess_config=postprocess_config) + output_tokens = result.response[0] + + return ResponseDetails( + generated_tokens=output_tokens, + prompt=input_tokens, + start_time=start_time, + end_time=time.time(), + model_time=0, + token_gen_time=token_gen_time) + + +def call_vllm(input_tokens, max_new_tokens, stream=True): + api_url = "http://localhost:26500/generate" + headers = {"User-Agent": "Benchmark Client"} + pload = { + "prompt": input_tokens, + "n": 1, + "use_beam_search": False, + "temperature": 1.0, + "top_p": 0.9, + "max_tokens": max_new_tokens, + "ignore_eos": False, + "stream": stream, + } + def clear_line(n: int = 1) -> None: + LINE_UP = '\033[1A' + LINE_CLEAR = '\x1b[2K' + for _ in range(n): + print(LINE_UP, end=LINE_CLEAR, flush=True) + + def get_streaming_response(response: requests.Response, time_last_token) -> Iterable[List[str]]: + for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, + delimiter=b"\0"): + if chunk: + data = json.loads(chunk.decode("utf-8")) + output = data["text"][0] + time_now = time.time() + yield output, time_now - time_last_token + time_last_token = time_now + + def get_response(response: requests.Response) -> List[str]: + data = json.loads(response.content) + output = data["text"] + return output + + start_time = time.time() + response = requests.post(api_url, headers=headers, json=pload, stream=stream) + if stream: + token_gen_time = [] + for h, t in get_streaming_response(response, start_time): + output = h + token_gen_time.append(t) + + return ResponseDetails( + generated_tokens=output, + prompt=input_tokens, + start_time=start_time, + end_time=time.time(), + model_time=0, + token_gen_time=token_gen_time) + else: + output = get_response(response) + raise NotImplementedError("Not implemented for non-streaming") + + +def _run_parallel(deployment_name, warmup, barrier, query_queue, result_queue, client_num, stream, vllm): + pid = os.getpid() + session_id = f"test_session_p{pid}_t{threading.get_ident()}" + + event_loop = asyncio.new_event_loop() + asyncio.set_event_loop(event_loop) + if not vllm: + import mii + client = mii.client(deployment_name) + + barrier.wait() + + for _ in range(warmup): + print(f"warmup queue size: {query_queue.qsize()} ({pid})", flush=True) + input_tokens, req_max_new_tokens = query_queue.get(timeout=1.0) + + if vllm: + call_vllm(input_tokens, req_max_new_tokens, stream) + else: + call_mii(client, input_tokens, req_max_new_tokens, stream) + + barrier.wait() + + time.sleep(random.uniform(0, client_num) * 0.01) + try: + while not query_queue.empty(): + print(f"queue size: {query_queue.qsize()} ({pid})", flush=True) + input_tokens, req_max_new_tokens = query_queue.get(timeout=1.0) + + # Set max_new_tokens following normal distribution + if vllm: + r = call_vllm(input_tokens, req_max_new_tokens) + else: + r = call_mii(client, input_tokens, req_max_new_tokens, stream) + + result_queue.put(r) + except queue.Empty: + print(f"queue is empty ({pid})") + + print(f"Worker ({pid}) finished. 
session_id: {session_id}") + + +def run_client(client_num, deployment_name, prompt_length, max_new_tokens, num_queries, warmup, stream, vllm, use_thread=False): + """ + Run MII client for benchmarking. The scenario is a bit complicated: + 1. The main process puts `num_queries` queries into the input queue + 2. Each client runs `warmup` iterations () taking the queries from the input queue + 3. --- barrier --- + 4. The main process marks the start time + 5a. All clients send `num_queries' query in total and put the results into the result queue + 5b. The main process takes the results from the result queue (in parallel with 5a) + 6. The main process marks the end time after receiving `num_queries' results + """ + + if use_thread: + runnable_cls = threading.Thread + barrier_cls = threading.Barrier + queue_cls = queue.Queue + else: + runnable_cls = multiprocessing.Process + barrier_cls = multiprocessing.Barrier + queue_cls = multiprocessing.Queue + + barrier = barrier_cls(client_num + 1) + query_queue = queue_cls() + result_queue = queue_cls() + + processes = [runnable_cls(target=_run_parallel, + args=(deployment_name, warmup, barrier, query_queue, result_queue, client_num, stream, vllm)) + for i in range(client_num)] + for p in processes: + p.start() + + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") + query_generator = RandomQueryGenerator(all_text, tokenizer, seed=42) + MAX_PROMPT_LENGTH = 4000 + request_text = query_generator.get_random_request_text(prompt_length, prompt_length*PROMPT_LENGTH_VAR, MAX_PROMPT_LENGTH, num_queries + warmup*client_num) + + for t in request_text: + req_max_new_tokens = int(np.random.normal(max_new_tokens, MAX_NEW_TOKENS_VAR*max_new_tokens)) + query_queue.put((t, req_max_new_tokens)) + + # Tokenizers must be initialized after fork. + # So we need to fork before putting inputs to the queue. 
+    # We need this barrier to stop child processes from taking inputs before the main process puts them
+    barrier.wait()
+    # This barrier is to make sure that all clients have finished warmup
+    barrier.wait()
+
+    response_details = []
+    while len(response_details) < num_queries:
+        res = result_queue.get()
+        # vLLM returns concatenated tokens
+        if vllm:
+            all_tokens = tokenizer.tokenize(res.generated_tokens)
+            res.generated_tokens = all_tokens[len(tokenizer.tokenize(res.prompt)):]
+        response_details.append(res)
+
+    return response_details
+
+if __name__ == "__main__":
+    args = parse_args()
+    print(args)
+
+    if args.out_json_path is not None and not args.out_json_path.parent.exists():
+        raise ValueError(f"Parent directory of {args.out_json_path} does not exist")
+
+    response_details = run_client(args.client_num, args.deployment_name,
+                                  args.prompt_length,
+                                  args.max_new_tokens, args.num_queries, args.warmup,
+                                  args.stream, args.vllm, args.use_thread)
+
+    args_dict = vars(args)
+    ps = get_summary(args_dict, response_details)
+    print(f"Deployment: {args.deployment_name} Clients: {args.client_num}, "
+          + f"Prompt (mean): {args.prompt_length} tokens, "
+          + f"Generation (mean): {args.max_new_tokens} tokens, "
+          + f"Query throughput: {ps.throughput:.3f} queries/s, "
+          + f"Token throughput (total): {ps.tokens_per_sec:.3f} tokens/s, "
+          + f"Query latency: {ps.latency:.3f} s, "
+          + f"Token generation latency: {ps.token_gen_latency:.3f} s/token, "
+          + f"First token received: {ps.first_token_latency:.3f} s")
+
+    if args.out_json_path is not None:
+        with open(args.out_json_path, "w") as f:
+            args_dict["out_json_path"] = str(args.out_json_path)  # Path is not JSON serializable
+            data = {"args": args_dict, "time": str(datetime.now()), "response_details": [asdict(r) for r in response_details]}
+            json.dump(data, f, indent=2)
diff --git a/benchmarks/inference/mii/run_benchmark_client.sh b/benchmarks/inference/mii/run_benchmark_client.sh
new file mode 100644
index 000000000..318e9092e
--- /dev/null
+++ b/benchmarks/inference/mii/run_benchmark_client.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+DEPLOYMENT_NAME=${DEPLOYMENT_NAME:-llama2-7b}
+VLLM=${VLLM:-""}
+
+CLIENT_NUMS=${CLIENT_NUMS:-1 2 4 6 8 12 16 20 24 28 32}
+MAX_NEW_TOKENS=${MAX_NEW_TOKENS:-60}
+PROMPT_LENGTH=${PROMPT_LENGTH:-3072}
+REQUEST_NUM=${REQUEST_NUM:-512}
+
+LOG_DIR=logs.${DEPLOYMENT_NAME}
+mkdir -p ${LOG_DIR}
+
+for client_num in ${CLIENT_NUMS[@]}; do
+    RESULT_FILE=${DEPLOYMENT_NAME}_c${client_num}_p${PROMPT_LENGTH}_g${MAX_NEW_TOKENS}.json
+
+    python run_benchmark_client.py -w 1 \
+        -d ${DEPLOYMENT_NAME} -n ${REQUEST_NUM} -c ${client_num} \
+        -k ${MAX_NEW_TOKENS} -l ${PROMPT_LENGTH} \
+        -o ${LOG_DIR}/${RESULT_FILE} \
+        ${VLLM} --stream \
+        2>&1 | tee ${LOG_DIR}/bench_client_num_c${client_num}_p${PROMPT_LENGTH}_g${MAX_NEW_TOKENS}.log
+done
diff --git a/benchmarks/inference/mii/sample_input.py b/benchmarks/inference/mii/sample_input.py
new file mode 100644
index 000000000..77d02af5f
--- /dev/null
+++ b/benchmarks/inference/mii/sample_input.py
@@ -0,0 +1,221 @@
+
+# This is a sample input consisting of:
+# Code & Text
+
+all_text = '''Deep learning involves the use of neural networks, which are computational models inspired by the structure and functioning of the human brain. These networks consist of interconnected nodes called neurons. Each neuron takes input, performs a computation, and produces an output.
+        During training, the neural network learns to make accurate predictions by adjusting its internal parameters.
This adjustment is done using an optimization algorithm called gradient descent. Gradient descent calculates the gradients of a loss function, which measures the discrepancy between the predicted output of the network and the desired output. These gradients indicate the direction and magnitude of parameter updates that will minimize the loss. + The learning rate is an important hyperparameter in gradient descent. It determines the step size taken during parameter updates. A higher learning rate can lead to faster convergence, but it risks overshooting the optimal solution. On the other hand, a lower learning rate may converge more slowly, but it can result in more precise updates. + Activation functions are applied to the output of each neuron in a neural network. They introduce non-linearities, enabling the network to learn complex patterns and relationships in the data. Popular activation functions include the rectified linear unit (ReLU), sigmoid, and hyperbolic tangent (tanh). + By adjusting the parameters of the neural network during training, deep learning models learn to represent and generalize from complex data patterns. They have achieved remarkable success in various tasks, including image recognition, speech recognition, and natural language processing. + Here are the key fundamentals of deep learning for training large language models: + Neural Networks: At the heart of deep learning are artificial neural networks, which are inspired by the structure and functioning of biological neurons in the human brain. These networks consist of interconnected layers of artificial neurons called nodes or units. The nodes receive input, perform computations, and pass the results to the next layer. + Representation Learning: Deep learning models excel at learning meaningful representations of data. In the context of language, the models can automatically learn hierarchical representations of text, capturing complex relationships and semantic structures. + Feedforward and Backpropagation: Deep learning models typically use feedforward neural networks, where information flows from the input layer through intermediate hidden layers to the output layer. The network makes predictions based on the input data, and the prediction error is then backpropagated through the network. Backpropagation calculates gradients that indicate how each parameter in the network should be adjusted to minimize the error. + Activation Functions: Activation functions introduce non-linearities to neural networks, enabling them to learn complex patterns. Common activation functions include the rectified linear unit (ReLU), sigmoid, and hyperbolic tangent (tanh). These functions determine the output of each neuron based on its weighted inputs. + Loss Functions: During training, a loss function is used to measure the discrepancy between the predicted output of the neural network and the desired output. In language modeling tasks, common loss functions include cross-entropy loss, which quantifies the difference in probability distributions. + Optimization Algorithms: Optimization algorithms determine how the network's parameters are updated based on the calculated gradients during backpropagation. Stochastic Gradient Descent (SGD) is a widely used algorithm that iteratively updates the parameters in the direction that minimizes the loss. Variants of SGD, such as Adam or RMSprop, adaptively adjust the learning rate to accelerate convergence. 
+ Regularization Techniques: Deep learning models are prone to overfitting, where they memorize the training data but fail to generalize well to unseen examples. Regularization techniques such as dropout and weight decay are commonly used to prevent overfitting and improve generalization by adding constraints to the model's parameters. + Training on Large-Scale Datasets: Deep learning models, including large language models, require substantial amounts of labeled training data to learn effectively. Large-scale datasets are crucial to expose the model to diverse language patterns and ensure it captures a broad understanding of language. + Parallel Computing: Training large language models is computationally demanding. To accelerate the training process, parallel computing techniques, such as using multiple GPUs or distributed computing systems, are employed. These techniques allow for efficient processing of large datasets and speeding up the training iterations. + Transfer Learning and Fine-tuning: Transfer learning is a technique where a pre-trained model, trained on a large-scale dataset, is used as a starting point for a new task or dataset. Fine-tuning involves adjusting the pre-trained model's parameters on the new dataset to adapt it to the specific task at hand. This approach significantly reduces the training time and data requirements for new models. + The training process of a large language model typically involves the following steps: + Data Collection: A diverse and comprehensive dataset is collected, which typically consists of a vast range of text from sources like books, websites, articles, and other textual resources. The quality and variety of the dataset are crucial to ensure the model learns a broad understanding of language. + Preprocessing: The collected text data is preprocessed to clean and normalize it. This step involves removing irrelevant characters or symbols, converting the text to a consistent format, and organizing it into smaller units such as sentences or paragraphs. + Tokenization: The preprocessed text is divided into individual tokens, which can be as small as words or even subword units. Tokenization helps in representing and processing the text efficiently during training. + Architecture Design: The model architecture, often based on the transformer architecture, is defined. Transformers are neural network models that excel in capturing long-range dependencies in sequential data, making them well-suited for language modeling tasks. + Model Initialization: The model parameters are randomly initialized to start the training process. These parameters will be adjusted iteratively during training to optimize the model's performance. + Training Loop: The model is trained using a large-scale computational infrastructure. The training loop typically involves several iterations over the dataset, known as epochs. During each epoch, the model processes the input data, generates predictions, and compares them with the expected output. The discrepancy between the predicted and expected output is used to compute a loss, which quantifies the model's performance. + Backpropagation and Optimization: Backpropagation is employed to calculate the gradients of the model's parameters with respect to the loss. These gradients indicate the direction and magnitude of the parameter updates needed to minimize the loss. 
Optimization algorithms, such as stochastic gradient descent (SGD) or its variants, are then used to update the model's parameters based on the computed gradients. + Iterative Refinement: Steps 6 and 7 are repeated for multiple epochs, gradually refining the model's performance. The model's ability to generate coherent and contextually relevant responses improves as it learns from the dataset. + Evaluation: The trained model is evaluated on a separate dataset to assess its performance and identify areas for improvement. Various metrics, such as perplexity or accuracy, can be used to evaluate the model's language generation capabilities. + Fine-tuning and Iteration: Based on the evaluation results, the model may undergo fine-tuning or further iterations of training to enhance its performance. This process helps in addressing specific limitations or biases and aligning the model's output more closely with desired expectations. + It's important to note that training a large language model from scratch is a computationally intensive process that requires substantial computational resources, including powerful hardware like GPUs or specialized hardware accelerators, and large-scale distributed systems to handle the massive amount of data and model parameters involved. + Here are ten highly recommended books that can help you learn deep learning: + "Deep Learning" by Ian Goodfellow, Yoshua Bengio, and Aaron Courville: + This comprehensive book covers the fundamental concepts of deep learning, including neural networks, optimization algorithms, and regularization techniques. It also explores advanced topics like generative models and deep reinforcement learning. + "Deep Learning with Python" by François Chollet: + Written by the creator of the Keras deep learning library, this book provides a practical introduction to deep learning with Python. It covers essential concepts, tools, and techniques, and includes hands-on examples and case studies. + "Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow" by Aurélien Géron: + This book offers a hands-on approach to learning machine learning and deep learning using popular Python libraries such as Scikit-Learn, Keras, and TensorFlow. It covers various algorithms and provides practical examples and exercises. + "Deep Learning for Computer Vision" by Rajalingappaa Shanmugamani: + Focusing on deep learning techniques for computer vision tasks, this book explores topics such as convolutional neural networks (CNNs), image classification, object detection, and image generation. It includes code examples using Python and popular deep learning frameworks. + "Deep Learning: A Practitioner's Approach" by Josh Patterson and Adam Gibson: + This book offers a practical guide to implementing deep learning solutions using the Deeplearning4j library. It covers key concepts, architectures, and techniques, and includes code examples and case studies. + "Grokking Deep Learning" by Andrew Trask: + Geared towards beginners, this book provides an intuitive and accessible introduction to deep learning concepts. It covers neural networks, backpropagation, gradient descent, and other fundamental topics with clear explanations and visualizations. + "Deep Learning for Natural Language Processing" by Palash Goyal, Sumit Pandey, and Karan Jain: + Focusing on deep learning techniques for natural language processing (NLP), this book explores topics like word embeddings, recurrent neural networks (RNNs), and sequence-to-sequence models. 
It includes code examples using Python and popular NLP libraries. + "Deep Reinforcement Learning" by Pieter Abbeel and John Schulman: + This book provides an in-depth exploration of deep reinforcement learning, a subfield that combines deep learning with reinforcement learning. It covers topics like Q-learning, policy gradients, and deep Q-networks (DQNs) and provides practical examples. + "Deep Learning for Time Series Forecasting" by N.D. Lewis: + Focusing on deep learning techniques for time series data, this book covers topics such as recurrent neural networks (RNNs), long short-term memory (LSTM) networks, and attention models. It includes code examples using Python and popular deep learning frameworks. + "Interpretable Deep Learning" by Christoph Molnar: + This book delves into the challenges and techniques for interpreting and understanding deep learning models. It covers model visualization, feature importance, and other methods for explaining and interpreting deep learning predictions. + These books cover a range of deep learning topics and provide valuable insights and practical guidance for learning and applying deep learning techniques. Choose the ones that align with your interests and learning style to enhance your understanding of deep learning. + Here are 10 popular GitHub projects that can be useful for building large language models (LLMs) or working with natural language processing (NLP) tasks: + TensorFlow: An open-source deep learning framework that provides tools and resources for building and training LLMs. It offers extensive support for various neural network architectures and has a large community. + PyTorch: Another popular deep learning framework that provides a dynamic computational graph and a wide range of tools for building LLMs. It is known for its user-friendly interface and flexibility. + Hugging Face Transformers: A library that provides pre-trained models and a high-level API for natural language understanding (NLU) tasks, including LLMs. It supports popular models like GPT, BERT, and RoBERTa. + Fairseq: A library developed by Facebook AI Research that focuses on sequence modeling tasks, including LLMs. It offers pre-trained models and tools for training and evaluating models using sequence-to-sequence architectures. + AllenNLP: A powerful NLP research library that simplifies the process of building and evaluating deep learning models. It offers pre-built components for common NLP tasks and supports LLMs with various architectures. + OpenAI GPT-3: Although not available on GitHub, OpenAI's GPT-3 language model is widely recognized and can be accessed via the OpenAI API. It offers state-of-the-art language generation capabilities and can be used for various NLP tasks. + BERT: A pre-trained language model developed by Google Research that has achieved exceptional results on various NLP benchmarks. The official implementation is available on GitHub and can be fine-tuned for specific tasks. + spaCy: A popular Python library for NLP tasks that provides efficient and scalable tools for tokenization, named entity recognition, part-of-speech tagging, and more. It integrates well with deep learning frameworks. + FastText: A library developed by Facebook Research that provides efficient tools for text classification and word representation learning. It offers pre-trained word embeddings and supports training LLMs for classification tasks. + NLTK (Natural Language Toolkit): A comprehensive library for NLP tasks in Python. 
It provides various modules for tokenization, stemming, tagging, parsing, and more. Although it doesn't focus explicitly on LLMs, it is widely used for preprocessing text data in NLP pipelines. + These projects offer a range of resources, pre-trained models, and tools that can assist you in building and working with large language models. Make sure to review the documentation and examples provided by each project to understand their capabilities and how they can be integrated into your workflow. + Here are some popular backend libraries that are commonly used for deep learning: + TensorFlow: Developed by Google's Brain Team, TensorFlow is one of the most widely used deep learning frameworks. It provides a flexible and comprehensive ecosystem for building and deploying machine learning models. TensorFlow offers high-level APIs for easy model construction, as well as lower-level APIs for fine-grained control. It supports distributed computing and has extensive community support. + PyTorch: Developed by Facebook's AI Research lab, PyTorch is known for its simplicity and dynamic computational graph. It allows for intuitive model construction and debugging. PyTorch is widely used in both research and industry due to its flexibility, support for dynamic networks, and strong GPU acceleration capabilities. + Keras: Initially developed as a user-friendly deep learning library, Keras is now integrated as the official high-level API in TensorFlow. It provides a user-friendly and modular interface for building neural networks. Keras abstracts away many complexities and allows users to build models with just a few lines of code. It supports multiple backends, including TensorFlow and Theano. + Theano: Although its development has been discontinued, Theano was one of the first widely-used deep learning libraries. It allows for efficient mathematical operations on multi-dimensional arrays and supports GPU acceleration. Theano was influential in shaping the deep learning landscape and served as a precursor to subsequent frameworks. + Caffe: Developed by the Berkeley Vision and Learning Center (BVLC), Caffe is a popular deep learning framework known for its efficiency and simplicity. It is particularly suitable for convolutional neural networks (CNNs) and image-related tasks. Caffe has a clean and expressive architecture description language that makes it easy to define and train deep models. + MXNet: MXNet is an open-source deep learning framework developed by Apache. It offers a flexible and efficient interface for building and deploying neural networks. MXNet provides a hybrid frontend that allows users to seamlessly switch between symbolic and imperative programming. It is known for its scalability and supports multiple programming languages. + Chainer: Chainer is a flexible deep learning framework that focuses on dynamic neural networks. It allows for intuitive model construction using imperative programming, making it easy to define complex architectures and manipulate data within the network. Chainer is known for its "define-by-run" approach, which facilitates dynamic computations. + Microsoft Cognitive Toolkit (CNTK): CNTK is a deep learning framework developed by Microsoft. It provides a highly efficient and scalable implementation of deep neural networks. CNTK supports both declarative and imperative programming models, making it suitable for both research and production-level deployments. 
+ Deeplearning4j: Deeplearning4j is an open-source deep learning library that focuses on scalability and performance. It is designed to integrate with the Java ecosystem and supports distributed computing. Deeplearning4j provides tools for building various types of neural networks and offers integration with other popular libraries like Hadoop and Spark. + PaddlePaddle: PaddlePaddle (PArallel Distributed Deep LEarning) is a deep learning framework developed by Baidu. It emphasizes scalability and supports large-scale distributed training. PaddlePaddle provides a rich set of built-in models and algorithms, making it accessible to both beginners and advanced users. + Each of these backend libraries offers unique features, performance characteristics, and levels of abstraction. The choice of a backend library depends on factors such as your programming language preferences, the complexity of your models, the availability of community support, and the specific requirements of your deep learning project. + Here's an example code snippet that demonstrates how to create a GPT-Neox20B model using the Hugging Face Transformers library and start fine-tuning it with sample data from the '/tmp/wikitext' directory: + + import torch + from transformers import GPTNeoForCausalLM, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments + + # Load the GPT-Neo model and tokenizer + model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-2.7B") + tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B") + + # Set the path to the training data + data_path = "/tmp/wikitext" + + # Define the dataset and data collator + dataset = TextDataset(tokenizer=tokenizer, file_path=data_path, block_size=128) + data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) + + # Define the training arguments + training_args = TrainingArguments( + output_dir="./output_dir", # Directory to save the fine-tuned model and logs + overwrite_output_dir=True, + num_train_epochs=3, # Number of training epochs + per_device_train_batch_size=8, + save_steps=500, # Number of updates steps before saving the model + save_total_limit=2, # Limit the total number of saved models + prediction_loss_only=True, + ) + + # Initialize the Trainer + trainer = Trainer( + model=model, + args=training_args, + data_collator=data_collator, + train_dataset=dataset, + ) + + # Start fine-tuning + trainer.train() + + # Save the fine-tuned model + trainer.save_model("./fine-tuned-model") + + To deploy the GPT-Neo model on an HTTP server, you can use a framework like Flask or FastAPI. 
Here's a basic example using Flask: + + Install Flask: + + pip install flask + + Create a Python file, let's call it app.py, and import the necessary libraries: + + from flask import Flask, request, jsonify + from transformers import GPTNeoForCausalLM, GPT2Tokenizer + + # Initialize Flask app + app = Flask(__name__) + + # Load the pretrained GPT-Neo model and tokenizer + model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-2.7B") + tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B") + Define an endpoint for model inference: + + python + Copy code + @app.route("/generate", methods=["POST"]) + def generate_text(): + data = request.get_json() + input_text = data["input_text"] + + # Tokenize the input text + input_ids = tokenizer.encode(input_text, return_tensors="pt") + + # Generate text with the model + output = model.generate(input_ids, max_length=100, num_return_sequences=1) + generated_text = tokenizer.decode(output[0], skip_special_tokens=True) + + return jsonify({"generated_text": generated_text}) + Run the Flask app: + + python + Copy code + if __name__ == "__main__": + app.run(host="0.0.0.0", port=5000) + The code that includes the text generation sampling functions and provides a commented example: + + import requests + import numpy as np + + class TextGeneratorAPI: + def __init__(self, server_url): + self.server_url = server_url + + def generate_text(self, input_text, sampling_algorithm="greedy", temperature=0.7): + url = f"{self.server_url}/generate" + payload = { + "input_text": input_text, + "sampling_algorithm": sampling_algorithm, + "temperature": temperature + } + response = requests.post(url, json=payload) + generated_text = response.json()["generated_text"] + return generated_text + + def greedy_sampling(self, logits): + return np.argmax(logits) + + def random_sampling(self, logits): + probabilities = np.exp(logits / temperature) + probabilities = probabilities / np.sum(probabilities) + return np.random.choice(len(logits), p=probabilities) + + def top_k_sampling(self, logits, k=10): + indices = np.argsort(logits)[-k:] + probabilities = np.exp(logits[indices] / temperature) + probabilities = probabilities / np.sum(probabilities) + return np.random.choice(indices, p=probabilities) + + def top_p_sampling(self, logits, p=0.9): + sorted_logits = np.sort(logits)[::-1] + cumulative_probs = np.cumsum(np.exp(sorted_logits) / temperature) + indices = np.arange(len(sorted_logits)) + selected_indices = indices[cumulative_probs <= p] + probabilities = np.exp(logits[selected_indices] / temperature) + probabilities = probabilities / np.sum(probabilities) + return np.random.choice(selected_indices, p=probabilities) + In this updated code, the TextGeneratorAPI class includes the additional sampling functions: greedy_sampling, random_sampling, top_k_sampling, and top_p_sampling. These functions take logits (output of the model) as input and return the index of the selected token based on the respective sampling algorithm. + The greedy_sampling function selects the token with the highest probability (argmax) as the next token. The random_sampling function applies a temperature scaling to the logits and then samples from the resulting probability distribution. The top_k_sampling function selects from the top-k tokens with the highest probabilities. The top_p_sampling function selects from the tokens with cumulative probabilities below a certain threshold (top-p). + You can now use the updated TextGeneratorAPI class with the sampling functions. 
Here's an example: + + api = TextGeneratorAPI(server_url="http://localhost:5000") + + input_text = "Once upon a time" + + # Generate text using different sampling algorithms and temperatures + greedy_text = api.generate_text(input_text, sampling_algorithm="greedy") + random_text = api.generate_text(input_text, sampling_algorithm="random") + top_k_text = api.generate_text(input_text, sampling_algorithm="top_k", temperature=0.8) + top_p_text = api.generate_text(input_text, sampling_algorithm="top_p", temperature=0.9) + + print("Greedy Sampling:", greedy_text) + print("Random Sampling:", random_text) + print("Top-k Sampling:", top_k_text) + print("Top-p Sampling:", top_p_text) + Make sure to adjust the server_url with the appropriate URL of your HTTP server, and ensure that the server is running and accessible before making requests through the API. + ''' \ No newline at end of file diff --git a/benchmarks/inference/mii/server.py b/benchmarks/inference/mii/server.py new file mode 100644 index 000000000..2e6164187 --- /dev/null +++ b/benchmarks/inference/mii/server.py @@ -0,0 +1,83 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +import mii +import argparse + +from mii.constants import DeploymentType + +from deepspeed.inference import RaggedInferenceEngineConfig, DeepSpeedTPConfig +from deepspeed.inference.v2.ragged import DSStateManagerConfig + +def start_server(model_name, + deployment_name, + task, + tensor_parallel, + replica_num, + max_ragged_batch_size): + tp_config = DeepSpeedTPConfig(tp_size=tensor_parallel) + mgr_config = DSStateManagerConfig(max_ragged_batch_size=max_ragged_batch_size, max_ragged_sequence_count=max_ragged_batch_size) + inference_config = RaggedInferenceEngineConfig(tensor_parallel=tp_config, + state_manager=mgr_config) + + mii.serve( + model_name, + deployment_name=deployment_name, + tensor_parallel=tensor_parallel, + task=task, + inference_engine_config=inference_config, + replica_num=replica_num + ) + +def stop_server(deployment_name): + mii.client(deployment_name).terminate_server() + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--model_name", + type=str, + default="meta-llama/Llama-2-7b-hf", + help="Name of the model in the model_files to benchmark") + parser.add_argument("-d", + "--deployment_name", + type=str, + default="benchmark_deployment") + parser.add_argument("-t", "--task", type=str, + help="Task type. 
Currently only text-generation is supported", + default="text-generation") + parser.add_argument("-m", + "--tensor_parallel", + type=int, + help="Degree of tensor (model) parallelism", + default=1) + parser.add_argument("-b", + "--ragged_batch_size", + type=int, + help="Max batch size for ragged batching", + default=768) + parser.add_argument("-r", + "--replica_num", + type=int, + help="Number of replicas for load balancing", + default=1) + parser.add_argument("cmd", help="start, stop, or restart") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + + if args.cmd == "start": + start_server(args.model_name, + args.deployment_name, + args.task, + args.tensor_parallel, + args.replica_num, + args.ragged_batch_size) + elif args.cmd == "stop": + print("running stop") + stop_server(args.deployment_name) + else: + raise ValueError(f"Unknown command: {args.cmd}") From fe7a76d6979067cd597eb10551d14e234e5e2d66 Mon Sep 17 00:00:00 2001 From: Lev Kurilenko <113481193+lekurile@users.noreply.github.com> Date: Wed, 8 Nov 2023 10:40:03 -0800 Subject: [PATCH 05/58] Fix SD example imports for latest diffusers (#806) --- .../stable-diffusion/local_pipeline_stable_diffusion.py | 5 +++-- inference/huggingface/stable-diffusion/requirements.txt | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py b/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py index ab0b708e6..4774fac4f 100644 --- a/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py +++ b/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py @@ -29,11 +29,12 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) -from diffusers.pipeline_utils import DiffusionPipeline +from diffusers.utils.torch_utils import randn_tensor + +from diffusers.pipelines.pipeline_utils import DiffusionPipeline from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker diff --git a/inference/huggingface/stable-diffusion/requirements.txt b/inference/huggingface/stable-diffusion/requirements.txt index 22524d2df..37f9f9ea5 100644 --- a/inference/huggingface/stable-diffusion/requirements.txt +++ b/inference/huggingface/stable-diffusion/requirements.txt @@ -1,4 +1,4 @@ deepspeed torch -diffusers +diffusers>=0.22.3 triton==2.0.0.dev20221202 From ccb2a3400a05ea075b643bb3aeabb02f9883c5da Mon Sep 17 00:00:00 2001 From: Pareesa Ameneh Golnari <120066333+PareesaMS@users.noreply.github.com> Date: Wed, 8 Nov 2023 13:27:17 -0800 Subject: [PATCH 06/58] Adding Imagenet Example (#680) Co-authored-by: Michael Wyatt --- training/imagenet/README.md | 95 ++++ training/imagenet/assets/resnetplot.png | Bin 0 -> 138796 bytes training/imagenet/config/ds_config.json | 30 ++ training/imagenet/config/ds_fp16_config.json | 31 ++ .../imagenet/config/ds_fp16_z1_config.json | 31 ++ training/imagenet/extract_ILSVRC.sh | 80 +++ training/imagenet/main.py | 508 ++++++++++++++++++ training/imagenet/requirements.txt | 2 + training/imagenet/run_ds.sh | 3 + training/imagenet/run_ds_fp16.sh | 3 + training/imagenet/run_ds_fp16_z1.sh | 3 + 11 files changed, 786 insertions(+) create mode 100644 training/imagenet/README.md create mode 100755 training/imagenet/assets/resnetplot.png create mode 100644 training/imagenet/config/ds_config.json create mode 100644 
training/imagenet/config/ds_fp16_config.json
 create mode 100644 training/imagenet/config/ds_fp16_z1_config.json
 create mode 100644 training/imagenet/extract_ILSVRC.sh
 create mode 100644 training/imagenet/main.py
 create mode 100644 training/imagenet/requirements.txt
 create mode 100644 training/imagenet/run_ds.sh
 create mode 100644 training/imagenet/run_ds_fp16.sh
 create mode 100644 training/imagenet/run_ds_fp16_z1.sh

diff --git a/training/imagenet/README.md b/training/imagenet/README.md
new file mode 100644
index 000000000..47f57a5e0
--- /dev/null
+++ b/training/imagenet/README.md
@@ -0,0 +1,95 @@
+# ImageNet training with DeepSpeed
+
+This example enables DeepSpeed for training a set of popular model architectures on the ImageNet dataset. The models include ResNet, AlexNet, and VGG, and the
+baseline implementation can be found in the pytorch examples [Github repository](https://github.com/pytorch/examples/tree/main/imagenet). Parts of this README are also borrowed from the original repo. Enabling DeepSpeed makes it easy to
+run the code in a distributed manner and to apply fp16 quantization together with the ZeRO stage 1 memory optimization.
+
+## DeepSpeed Optimizations
+
+By applying fp16 quantization and the ZeRO stage 1 memory optimization, we were able to reduce the required memory. The table below summarizes the results of running ResNet-50 on a DGX-1
+node (with 16 V100 GPUs):
+
+| Optimization level | Allocated Memory (GB) | Mem. Consumption Improvement (%) |
+|-------------------|-------------------|---------|
+|Baseline | 1.66 | -|
+|DS + fp16 | 1.04 | 37.3|
+|DS + fp16 + ZeRO 1 | 0.81 | 51.2|
+
+Furthermore, the memory optimization had no adverse impact on accuracy, a point illustrated by the graph below.
+![resnet-plot](assets/resnetplot.png)
+
+## Requirements
+
+- Install PyTorch ([pytorch.org](http://pytorch.org))
+- `pip install -r requirements.txt`
+- Download the ImageNet dataset from http://www.image-net.org/
+  - Then, move and extract the training and validation images to labeled subfolders, using [the following shell script](extract_ILSVRC.sh)
+
+## Training
+
+To train a model, run one of the bash files after setting the model name as well as the path to the ImageNet dataset:
+run_ds.sh : baseline code with DeepSpeed activated
+run_ds_fp16.sh : fp16 activated
+run_ds_fp16_z1.sh : fp16 and ZeRO stage 1 activated
+
+The default learning rate schedule starts at 0.1 and decays by a factor of 10 every 30 epochs. This is appropriate for ResNet and models with batch normalization, but too high for AlexNet and VGG. Use 0.01 as the initial learning rate for AlexNet or VGG.
+
+
+## Use Dummy Data
+
+The ImageNet dataset is large and time-consuming to download. To get started quickly, run `main.py` with dummy data by passing "--dummy". This is also useful for benchmarking training speed. Note that the loss and accuracy are meaningless in this case.
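+
+For example, a quick dummy-data run of the fp16 + ZeRO stage 1 configuration could look like the following (an illustrative command based on the Usage section below; the bundled `run_ds_fp16_z1.sh` may use different flags, so adjust `--num_gpus` and the config path to your setup):
+
+```bash
+deepspeed --num_nodes=1 --num_gpus=1 main.py -a resnet50 --dummy \
+    --deepspeed --deepspeed_config config/ds_fp16_z1_config.json
+```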
+
+
+## Usage
+
+```bash
+usage: deepspeed main.py [-h] [-a ARCH] [-j N] [--epochs N] [--start-epoch N] [-b N] [--lr LR] [--momentum M] [--wd W] [-p N] [--resume PATH] [-e] [--pretrained] [--world-size WORLD_SIZE]
+[--num_gpus NUM GPU] [--num_nodes NUM NODES] [--seed SEED] [--gpu GPU] [--multiprocessing-distributed] [--deepspeed] [--deepspeed_config] [CONFIG FILE] [--dummy] [DIR]
+
+
+positional arguments:
+  DIR                   path to dataset (default: imagenet)
+  CONFIG FILE           path to deepspeed config file
+
+optional arguments:
+  --deepspeed           activate the DeepSpeed library
+  --deepspeed_config    path to the deepspeed config file
+  --help, -h            show this help message and exit
+  --arch ARCH, -a ARCH  model architecture: alexnet | convnext_base | convnext_large | convnext_small | convnext_tiny | densenet121 | densenet161 | densenet169 | densenet201 | efficientnet_b0 |
+                        efficientnet_b1 | efficientnet_b2 | efficientnet_b3 | efficientnet_b4 | efficientnet_b5 | efficientnet_b6 | efficientnet_b7 | googlenet | inception_v3 | mnasnet0_5 | mnasnet0_75 |
+                        mnasnet1_0 | mnasnet1_3 | mobilenet_v2 | mobilenet_v3_large | mobilenet_v3_small | regnet_x_16gf | regnet_x_1_6gf | regnet_x_32gf | regnet_x_3_2gf | regnet_x_400mf | regnet_x_800mf |
+                        regnet_x_8gf | regnet_y_128gf | regnet_y_16gf | regnet_y_1_6gf | regnet_y_32gf | regnet_y_3_2gf | regnet_y_400mf | regnet_y_800mf | regnet_y_8gf | resnet101 | resnet152 | resnet18 |
+                        resnet34 | resnet50 | resnext101_32x8d | resnext50_32x4d | shufflenet_v2_x0_5 | shufflenet_v2_x1_0 | shufflenet_v2_x1_5 | shufflenet_v2_x2_0 | squeezenet1_0 | squeezenet1_1 | vgg11 |
+                        vgg11_bn | vgg13 | vgg13_bn | vgg16 | vgg16_bn | vgg19 | vgg19_bn | vit_b_16 | vit_b_32 | vit_l_16 | vit_l_32 | wide_resnet101_2 | wide_resnet50_2 (default: resnet18)
+  --workers N, -j N     number of data loading workers (default: 4)
+  --epochs N            number of total epochs to run
+  --start-epoch N       manual epoch number (useful on restarts)
+  --batch-size N, -b N  mini-batch size (default: 256), this is the total batch size of all GPUs on the current node when using Data Parallel or Distributed Data Parallel
+  --lr LR, --learning-rate LR
+                        initial learning rate
+  --momentum M          momentum
+  --wd W, --weight-decay W
+                        weight decay (default: 1e-4)
+  --print-freq N, -p N  print frequency (default: 10)
+  --resume PATH         path to latest checkpoint (default: none)
+  --evaluate, -e        evaluate model on validation set
+  --pretrained          use pre-trained model
+  --world-size WORLD_SIZE
+                        number of nodes for distributed training
+  --local_rank LOCAL_RANK
+                        node rank for distributed training
+  --seed SEED           seed for initializing training.
+  --gpu GPU             GPU id to use.
+  --num_gpus NUM GPU    Num GPUs to use.
+  --num_nodes NUM NODES Num nodes to use
+  --multiprocessing-distributed
+                        use multi-processing distributed training to launch N processes per node, which has N GPUs.
+                        This is the fastest way to use PyTorch for either single node or multi node data parallel training
+  --dummy               use fake data to benchmark
+
+
+Example of running resnet50 on single GPU:
+deepspeed --num_nodes=1 --num_gpus=1 main.py -a resnet50 --deepspeed --deepspeed_config config/ds_config.json --dummy
+
+```
diff --git a/training/imagenet/assets/resnetplot.png b/training/imagenet/assets/resnetplot.png
new file mode 100755
index 0000000000000000000000000000000000000000..76eccfc5c45a090c37933dea02996b64eb0438d5
GIT binary patch
literal 138796
zO?6EAZc9jIgv5R$ssAX*?iQoATXvl*_S1Z7%@;Z+>AY5`RZ(@?tyhX`%{MnRI#MRD zs3n2D)vx=`kn2%_&hi?&PJxb9=OAk-&m|>)hRLI9RHm62j%nv`Uk}fN;SjTV3f`(qT8FG$?d3I{Vt0~bRik!WWe3_jkgB&0bOja_Yx}jr>9rTi zrevJn2feAnQUA8CI6!f=ZgPA6bILL!S%k43R^E(!8?ey2gJLp(n^qA8BXH0#Ws*WGvA3xmxRWNT$ zq%`6~6UQp$oliI2;E*F%EV?=Pffd8bYc2TMFd7g^Hs3h+zSIgH9QJ zkd;sg34~&=gF%0oyxiotgwi2((rSF=*wppvxRsI*^B_RPWscGvZ>mOFV)O!Q7zu(j zu(EdZ%ZDh2c?kJ4ZgkY4e{>CEwTVA3)4?Fh-b#24>^M=~o&GLubq8)wJs`1G%d6V) z5i6I4is%e^-sjWxY?{U=?00qAm?Fj6mOrM^^z>vet>_L7_s_ydUlebC^rSyrEk6{V zvD_3ka2@fB?bxzQb6fL8k}Nzc9vr&13pZ?rtf)P7FI;yq5s$sMcd*#j*2Z*;r0YFN+r`d@QAa!b(z(YC)r{jM0)h*6cF#epa1+!sj^aI2+RUHoFZS`+^T`u;$k zkXl>HXqW2Ku>M^%pWDY{IeFa)(!vuSxdoh{YcWpH^Al$3cAicb3!(a1fZb=!9dP)R z?7{^_m)*MDfE(-AH}V3eI8fZor<1>1IVvASESe^i9$&_4Z9fEy|H6bTkovXZ75nnB1R(f{uQ#(TF zbVq9UOVLr{dZA8c_qef~^GTa(%!D4#L_EbN=Qfo4_DTaDT^&~W(&(As4wL>UcpHS@ z12@`(Pha@Ds`@pYS)WH;r;82SONt+{XFWIOAK}Bi?UdTTnr7zj0I=SrD}i#7nCAoG+C8z0{Ep@k3o)DZ+xmDjjxGtXkkpO3CaDicF2}E?D^HjY zj=gM;(6+~Mf0Zp4RldyZXZ1}?zalY#E`+q~1c`irvV{c+Q%XQqY<3>4!1rdc#~8Op z^j%F=`E`7>**p54e!0 zhhE9o=Wy+BC$^N}7W5FB7Op*geZd8;&<%sPEU@tgK=F(n--A4A=f-ZF<7p+njh>}wk=4BfYlJV39wd&qp-~CR4Km7WJe3DAMC&hwgb~V zsV=KraGCoxn#Vs5yb|xNt0T4LajxWiZ6Cz4s#)-)2~;(_TzC$nK23Rgv9c$r%ci9@ zT&R`*xwyuoCT`YQVl&*#lC)IQ?XmNBc^!QJFMh_1NM7Z$ISWHXCQak`Y`I1nEWl|| z4cU|D`Nghp$2o5}xEg*_z-nnJN%bMwy+O}{&kv!%Mt?KSHod`4RW&uUbY2n-%ReDj z|2t?Oz>dt_9Y2;9q>Q0B5fmI7l<+;2L z56FJI$Ie8f&ue9t5yb}_q0vJ9bVfSvPe>dV?w$6DS5~Q!_;^FAWtC9zWpkm++}r1l z>7Cj7m+wtGk7~u(Im~_iq}|?ZN7kqQrPbEsl2Tudrn}M#kyUBubicY|%hF07PHoJ% zP&{t`0Gj&Jzr9N?Z(H`LY<&!vV9x}uZHH-2HJqy@@i(*FlX~mwX}!u3VxVIZZr(dr z+`nnuUz=yRGY@(9lwTYn4D}up;czXaCt-zC@Y^(tI>)a`T_gDBYA&`f(j$ zZ!SSB!;xlPV!+MqniYQYC1s;s9Yn%`qnLnox(j^Hg7zl65>xCRZb>LMdW=?J97D(PgN-l%GN+d zr^0epaA1T*mjif)*>!e?o#L&mUpKufc?S6q3gwt!Yj=0vJF|BRpOzK#apJLp`L zzZv9ZpmnEp#u&vMUjT(0^X|^MFvFTeNy*{U`C ztZ4TO^a-Eq6GkLkppme$fJaNyxJ9sEh@!4~p$xy27abC^OD6RE z4joU)Hw0l5-#RP09?OZm%6wqj)1NcQEr-I-{=U8TJ%RGmLZh&&&_KtP?9s7LZ#ik} z4>fp%hYh>RbD!6WvmwX!DW8$o#zqmaHtU>zr@un@?czJ!03om&5w&l4{-CT%2wPzB z=QVf4)HXbzlWVyoiw*=L%xWF%-i)ZL*EK(Hy1i%N(^8M}Kp>58)i^hAm6;2Qinw0y&oh(>)8QQ(C|~keASLO0aYX(4k^?@XXq3y2G*WPC9-L;hVQ*;D6@%eIm3>B~eJDT|%KC`ro3 z<#)-a(`F)7Xw(ZmU*QOfUF^q}B+KEmbo3~b_Yg9lelT14>U~gX^3&~Bz+>QNeqePg z21)>dKG|9Ax%Hx+PF41T>9$)!=iPHxI(fNtTKr*iExNm#iv(%y`xsnFIUmKH|FLxQA8H2(8$6jkL;N?!A7Fs00aqH~GHH&R3yxyu;Q4&VIX1uhle8J07t7FmuJ0y)p^?@anAKqiUD_Xg)jpUIDpR zLpgCnSL~)%3J3|ESAQ;y>E<1xott>e2O}TBSjNG4iTy{E=$lM#(7zB{0%T z)H>|9b(wpA1vVcV&w^r`NSy0Gb6@gqc(cj~6fA;m=TF_-aCpr+wrQPVyvu{jcA6Tx^|#q@x5k;NgVHbIb?1Oo)Zskqu?0 zT3ls340XHf5v)9$w-!Or9#i?2&(G%EjKjpos#(Pvs5=+Sb^pn<@LO{LQE}BUDB5pKXMu_b}$WauDD6zdE>5?<-$}^duc5wUu&;q<}yXB+vx>7QfmQ0YLlcosXihRwt z&mJ7E&6q1e-e|dty9Us#jm|e;kbU}XO}!J6Qz{8NopxH$QJeD-N^AX!+9_tM-6>p4 z=l(QrSsoVVF2UjNxodoQj=*=H^K{scj)QYHmr}7(aXa06IQ>Z8DTa*t?Yqq{KB?gt zhc5Z^krOH^2kxgKS%Sn4Y`z$zx}F6`^Hx5s-XD{S7i zX=ZLrnojF*u6eh2h2Skc-OOG@xB%bz=2&^73uDNo056ZJCny@_WBYQgqin-JobT$Z{Dx$ z*|6EzVzQ!yG*ju+?xJi6<7dO#oLJ$WefZjPLv}lSf})Tj=`EjdQGrtjLnJ;(q8A)E zgm{W-I-Q{Z(W>dUEX71oQz{KX0$4`Y9WBY4yAa zPSbi(BE#L@t51`I=g-@-?5R7H)|1*?aqHBFewTa$VbQ zlv7eT|KV01!N8BFlgyEH62)1H+Rm~dQ^AJnA;c4(-BBj5e=Vg1Byg(vcX3g1cSl$T z37*1wj0X}GptDsi^L+iCOLkbV=YJC0QIV06iIyZMKL$BE{kZY7fZIjq)3wnK$hFmc zaA2Y{oh}&)Oa*xMew4^zPn=R89lu?{Mbo>nG_ZrM1bfzX2D519+xK({hwz;fC)U0X z&3&76^KqN0)wDVQUdY9}#7gKFrx3|1_A>o~RydXOcqA3WaZy_R@MT2wUEZP7}I z3f8Z7dz(FLwZy1{fcnuCVf-LTBg$ zz?@bUHM+y8pu?a8@h>`}QxCba;>L74%?;1`%*)``4JR;T7Gr{0F)CZp}pXw7d{ zGl-Ib2#P-jSS9p;1mq%RtRT@f3)FFvscO~P=$}K8kk6B>pq?Rd^ni@6} z?=St}hL~GU=#7ZFgoAb%&Dcjz#><8gA2?YQhUR2}0&_Qtao=;*WIN=tFzw?DT9gdy 
z>EQ?m{U-f(19D(F7_j}3dGSV~ayqqHVX1%`Ov{Z1zqayT6<7%2gx@s7<%sukA}-c# z*a|p07d&EnN9^JSCqEA=TS@cLe>iy?gV%XBdc;e5_Fd^g!~9l0e2POuvRV#!$+RIQ z6#?7^F%55==%OoQ>=s7Fj=OCvp-)~Nn%7wCvgG!YwD#5J{A$m9d#SuUpKbPHI{ssP z6T|1pg{;aj&$VxI*z@4LA=CBS)67Pw>jkC&pW61E*q9v6SNzUp+OQ@Z)^smN`)Ni* zA4`Ji-Z8`FJRW9&`tn(q8Djdv>L+ctI-|sP*VpDf#t0&CN`ASbx)A6B+rWY+>7d~~ zmUMA>SiSFM-A=rmr!ZrTqzW+R%v6u~J!SC+Q$f0KVyl?SmQvN?QCp zEpc|PTNT3_F+}TS7y<;=J9Iu1;D$&Tttd&uy8CqzyPnE8B({h0Ox#UN6J4xIrA}&3 zfX6L<@%+o+D1}PvsxOimhJ$*kFi-|8dzABM`yg04D%ln?toK#`B5bVa}J+u>Sq}=po1T#A|xJn z+uuSKK`25Ea8{h&Hh}{OabVNvnRZP&Z4X^d_&wpQugu?OO7AvC7)`f29;pPoJJ4B9ae;Pyh zZ~E(LSk`Q*rHO|}|CSUs*YU$EkB44{*(}FFzsA>xq|Ez@k@gkSfo#|or7KqpB$}Zi zZL@*nVQshkJmjp(SLnRzQoaMSvPs!(n{=A)a`&l};JEP5Fyq<%zCPE|+Fdwi1n?_& z-ka;9EO{83>aLn#rThpp-o1v8)CIPY;_$fzf8P)_sho%-;yjtJ%q(}&m9_I7m(~Cg zZcX)h6Hx0KCr4GlS<{!LQ&?K&$512OM25$^0BUsWxpc|5x10DAnz`r^(t`v zc`>C2l4_Bw(vn}y>B}AhN_#lrlx}#Y1fBoFfh5Ql2mH_c*k0I+Lgt$hkB6v&6T&hz zvAjM?Af`N>7fAI@Hi2LzKe)7<%pji~8D}J*$~>nN9Fw?oAwGk|CSBn&@zLY3ypOzr zMU?N>%XBgaq1Gbmt&9d41l?EizM5u)8q5>h*Q=2Pp8%54&>jj5*@Rfe$I|=2q$Zzg*o5ibhR@p&5LWR%zI~n&! z7)$@mRpBFtkjHN)CW_%)$$ak6z1rB436T~p-=zh1=AnhM07u0auQP>qOPK`sibBcw z8H^M!PDJD{g`((KZC%qfByV8hmvyua%7M`ol$jUM|&FONFyG!q{w#lT}QQrLIP;fNbJ@o;u?LP@FAAVz9CG6 zOut1A7S<(pwMUzm(5jpTdFLn0Agmxieu3cr>*|h%kVO-|OFgVBk1n&q$41AG-EkBU zZJ)ohrHc4z@O6=N0OTP*(zLE9nO^3R{GVQ!Kf*YPbbJG7v#aT(lemvJcMm$#nOoMK zZYps#yy5k61Km1s2J6`ym}%XEFC-k8{(Ey4Bk>-+CGU29N3Q>P$Hk5HV=sA3rsnI? zaoxq@=QkRmr?E;&vPf3H6@Nb@`CES>O#Q=v+eYzRG%=hfIfv%~EOS<$*8h!;^r;(9 zm?17>ZF=Cb6%9sQ<0~B4Y=1}UCT9N6rq4dZkib553_8p#u&KnY{{-#`7fREyKH};_ z*|D&Q{Bum}JR(xpSK0n{2m;d~l1Ed6K10%F<3`4>WJ}tX{ik*#KNK| zRe3}%JCgxqHEvioR!Xh=pIM2ui&5eW!4AlvbzEp59}ujFJfWs~z=A>Cm@Wi%vM3sj zQ&`Kdi%Ifs5r(d-#s-_#VcPFyB}Q&{lC*Trg{y#EZv9`=#nHQs)t_lM>bNdm+yGuR zehC>Qt4_jl>3=5bmm}Jfh_ac*+)w6=$g^oocN@_TINoW{+ZnjEe`W0>E(6TNzP-e} zvZ;l|0Nk(>1YQEg0Vwlc?%HO9I8wEy= zX3Swco6z@MJKlw6AW~FdrGV14muWC?yfU1uhX#L%eWNL*r5BpKgSou!9y#%ebO2Sl zo0*kB&;D;4R6IVyREc!VE-bV&Gm5#DTK@SaR%9Kf`!ojjec^_L5kaXK&p>q9v>cfB z3(kiqxSVrSi<6f43zP(yvon7_&sEsjxn_rCpy_ceJO-D660U(rKy*n8^*tTb2t?VB z;QP+o$Bz2o@6YZWl}E_{?6&*v)^H=0(1eWts)8cA3+=bF#nmw$i|Asp-s55>e9BIZ zE#RAp+n`mTV~iKL;m-?7g1rnHfHoJ>K|R#y@wb5@d5Yi@3doGm;l+7z1K>6(t7?d1!Sse@m=(#OE)EQ@p+$d{BrpC=z*_{^A^^MS;@ zGczbefupPr@ZviD94m7A_y)wnLReHBiTlu_YR&Heh=bIOF^JqGiDG_3TembQIp32@j)!98>Jjcl;SevW{l? zmg@IMcTO~t$UTfsIBQquWvO0GJ&jRaz!$yJkH`HC@$!C#yql!oiK@THIbI6@?smra zW4B<&x3eBH#|g!NS?-77=m%%h21G@QGa{&b~ zR`L(?4Gcb7%o7NW^pn=BsAu9;Uc7+*R}}~g3moM<69KM{aAFRFeFTs2GRC}s@{Fl* z=)o&q(8IDHj(K+II<$IWhCo|SKZBrPJ=}DAFA9Oq=`{kZ{+DFbdki|W-hv*-?7KFoKdTej{%?H>vUtD-EV z@>n#(S+}d6U5bp6C!v(-KkM_3g1MfR#rM{SN3+4F>2He|pD)MvxR88rTrRfgUMMMT zIUh5=-g%tbb6XWe6nW;297P6mWr1O;J9GTI+9nQ0qJ!F52I{*mLVN>AM+fvJn)jnG zf%T|m5^*qs>Bm3^b({pm#egt@Gz}A3hrXp15F5ZhUg=*_Hx~n~@uX5y+JiCBIPmxP z+?a;B%vA{EZ@JGD6)7(ESMX8xV%A`;ewpSrx(hA_Tr{joYUqNGu0((lK>wzYR9N&t zfPDo#f62xJom}qdsu;(tYaP>;79oQkQxHUc>Q8hgGR*b20*sP+5*D)|NtNhm! 
zezdnSArSR024Ugx+5h#1twWvkXZBk>FE`ZJw*%D#b9osbt8niZj0xo&6Uo(SXlZ9~ zk3Yxy)Bg(u1Lf4lYGVQe5vuO7!x8QQpNR^*Ia$Ij?@-<;7gyH_0F8Vk zrL7(3L3=MQF3w|DLCV%P-vceqTBT(c2EwA##Z+^x&%pn7FusKz2{( zmjxJ&Jzt-{zUrWSs!jxi`P)zGi~(5+B$cI^Yr@{{lGHWv-oj#2*JSbv43$} zbcE+o_f><%2?)wi%_7kVXVql^ z*pMA{BU51p(wU3$(3UWs{+F+ZJeoQ-p%-aEt0o4PWXZtCLcr9%Kag^A<^r}zO>>~< zo_;xA2*dmqn!x$`iV*L059LO<{oHtuQ9_s)x$|QHugrjLs;Q~bv6e1-llzWOPBvWe z+fup#mRxFr;#akIyj|K<$iJ!HV^OoqQ>Fk0< zU(rku&tph+br%-%?CvDCv^; zKO>3gW)5OAI{Pt8`QB>&ySvC5(s!lXlQCrcY7C~cFAKNQ<>}CG-Y8;}2Bz!b1|2=k zgOfrHWZv{EK64IlOC}9rGa;${R%}5|W?{j0rTXSvL(d07Mmw6eo}T)3bTFO5i_ayG zT=Wz-K>KwTYyRyNuX@0Dt?A!f=(DE*rfof@q+GLZ7Hza|vEMKMw%Yz6#rqz6-_TXZ zzpTgKm+bkb{J_A_@Z{VrxGnC>4x%)_Fb0`r7mKQmWPogMVO*3m$zQ+9tLdOqc!9R4 zFz5!6-;rfHGaJ-%T&gPrMJK*>O+fYdKUwv)s9mX_Js-@E(!GZ}+Al&5Bh&qVzQ zMIaDI1x#749&B4btJuyFR!V&_cS1P+t^zP)Pu$c{elE&2Sn8ZFu-3%6nq}oKWm&%O zu>JjJ86_GAn5kQ?qvO8OdG&Pwn7D8PbY9}l9z#RJ8Oh|7lqcx+jhx@%>qP{*`_?4I zy*WEI^}B+}B3tAfe+l zg6%y0#b=LCJsLptW*FQiv%}Fv_wTGHNbI_f`HS&CdBSOTLi&PLvMLgHo(bX|JZXRN zCd<9>(9r9M8Y%lcx8Yak{KWFbu_?u7oq&DCt3IWqsLN@7$(}gy706V6QihXMhekwP znbw$Wjsc>9qvQG@bT_$YB0|@asS!Axo0TeyD_WEQyf;GB&!Q_!GfiyJG8ENs8%XY( zsZrSQ)cZ>+o|-l*A>p9ga^*RHmqh2eESMb)hfxcuHi$h93<@SbNrTSgQR; z!l$#R2f~QXrnOc(^25g-?6&op*u){wD5F>EXL)m0RuKUmc7k~yJys$y3a~BO+;jI6%IG#^(#D@Hb%GY7&xcRs zENx2(o~FIxp?lm+!-1fALCc|KbVdlB-0TPR>+Dr_3=;7`W5!`+8_a?P#Y<(o1^@-% zGCrSOvKTx$`63DLk`!QRAr!T>EjmnfO%p2Fd^nfBjo}po@v>p82bwRZWRVYasLXzK znW>duOk(04ZD)vQ9N{HcUlwDKxA$rEe>*3XK!|Tf5Vob!AbK}M z3wyPBVLj_>bA2CLN%$fMAmZnepuqm{4E*-uFC&oK{`4=aM>bKS4cUS(HJ zYY>-+U%g~*d**6ftbC|L_9T`^W;y*iQF42C(>(bj^Y$kyTk9e7&h-UiT{T?|tA6b?3F<(*Vk#(Cg?FtD`aSLtG>nl7332P?v+d%~U@ zooc^jRY)9KHJSI9;P^8@!4iQF#Q1%}cTEiqZB#)wb(|{O>=hN1>^yh-hPYGbRF%8y z`YcPD)6ZmS6)UO@E>9@Q339FTaT4~T7iV(cra1oYr=Z68WIKF(rGaN!$m(s^Y)(p+ z-&Iz6RmWx4H67G#ixi~0BCzbWlI|Ub7Kr`e4<2z;Do;7S-bx9awLx>A1Xtg&Ey3ql z^b<@y%l0$yQ>++PfHW%JT`ch@0nci!92j2^pP7+k``;J2*crybCJlKQ0_9AJj@|`o zl1moP@;88_R4+6Q)6rkDI+(Qb+mD90L-(;|r?j6wy*sow$wR}#qm~2uQRLn-Six4l z`HAswV{f0-o`7nm*RSQeP`LVo;^3!KK~ide*~q(FvP;QkFi;n07wbFn(I$V z5`I>0e=WQ-ofOaqYWr-}6l%pyzPlTjwkazu-EmT5y{@FABlOuUipMHG+bb8WJ4`@a z2S-Pf1bs@1{$7%p_%N{ac0*<+lOUgG=|-0(oS=9SK4*vh7zU=Jq>)jY!p)mEZ#^)j zj7AGLSyc5d@-U`g4~f$wU=eo7Y5tI`2X;`-Z-*_-vUfC((Yh7|kO4sb`aOSqu26Il zf`E&P?H_x)9|J;pjg;%GN4~$kI(MCzg2HMZ35yN9jou$UR-KCCtVvRO&q155dr}X; z7|cS#OAy*{#gL7npXnQFE$w?Y;ns-Pc3UnX{89TYG{VAY0CdR+k!#ci@=N%;u4`T%$A4F| zu!uMi5pOWoD(eQ8jN^(PsVV6H(bDLONe0$B?=0?lyPe$SsUB^Gr6u?MHwJ4DoInYB zdjBYX=p1*yOOR9ZLFLwh-Es)0%zt&Kl3Lmd+i>oonVk84oOkOH22B!L4G^)*RkYKbgIO@aRTSP;`MVevID#&WyEo`HUm4gANH zaAfJNNFKE`VJCOhUO{WZ)Keyy>)UesgrIAEiZMn*8nk~_>WEK zxUK{s=YY09X|lAm!wJLOi84%0JHy1~{MI@KXVm}cKg=nlG;W;zWjMjiEu!b9OQ8lQ zq!kusZatRUUk2}Ta8_GAB^Bt~e+KNIw|zTUL#yq(zCknFz)((katSqvFz5Q@sE=A>^@+GP^u`WS`iNS`gbSk>BwShC)WX zWQ-mSF(`djv`@=)TGVAydPS9|0X4IyX`2HPgU^=f`6(4l>GF#nh&(6qBSoAM%L#9(2mS$%X1Z$np@326)80c{k zuNuLUtaj^jw-VPcoL7Pdgwk=F|D^JJx8pI0*ku7u@^t=7^k4goV2n@HROx4Y|L-Ea zd}r;_;iP!P&C)+ljWh;&>lGXrJO$Lxb|cp1^Uy9u6p|^&xaA`T6bXA3uJip?QmcDY4IZ=>nmsS%1D% z^yYlfW`kt;GhIcU_~fvbU`T`xtx`C1$eT=akRRiIcWoLU=?*qMRpyowqv7T(|HfT- zK~YhiYu`{<(m7elHIqO)u1k(SesYddqJv&d#h>NFbzO4D$EDkmPLR3b*J2d8ICuR& z&kY~mB9hH26SjJLk4Ltf%CnJWDNHU)CkyPdYZ5>POrTi)&&I6t_ud+?Fkc?99Ug#@ zBFzTIFXO?v3l&4gix;?P@^CM}V`59Tx2tHW#p|amLx^E+syW22`qpk=a5zz(D1OCq z{gS~us~xU7oBeAp*9D8L2JXb@@G-?_J0WV+rM1eLzR^ChMC_~TXO{g~@Z2R_DY$n_ z|23{Wj@{mR5BsiMa)#jt@@>seTVmT)Lx-)2J20ZFoQZ2!+eKzsam~e`Gp<{5%5xvqVRi6Yhu`=OeoCJG zB={!sCza<-XPt=t4)fx92ptXgpfm2vXB@*w`TsJMqP6K^H-m$HLYTusua7;mQkrZh 
zp_lE%*2);1AnLwtYPQ#OS8p!^Fk6Yn1qA_nuF=v99O)Nd2Cp&f6-^y@ehPnmZk%^;lvNJ`hx45)ypbT^2=&>-Ct|Uz#8-@WC>X{2=@idVGlG@bgrG-e><;jv!{9JR{Udwo>IC=W&!Y-%oh+9?fGFc*~e zsaB$Z6TbRgzyb3xO98CaLz9YP*X zdv4M86Z?-|>k7gBUBx7{k3c5lBECr*_e1{ptUP0msOsI@fJ%W_q zZgL%AfD@?L#*Q0$(l2(ouLns)K<9{VT|22#oyH^Y3(?BFlTj5H-&^cz#A8?j+auyP z^X6w!@?_e&zuTS$_1Y=cSi5U(pcYMJ8mDH@QEfgQKPegoE zd7)EUx|(Hsv^($#Lhj>kno4zK9B!Byia8R_4-2l((+$6wxAp;H+0xXhrZ&q9R;G&F zBsv-EoSVS2lWj$$Jqfa8`$=b+Cb-tT*xrdn!V7jAss9M7cmMNGN;M%9gF+Y{zl9G{ zCzL9QB5r%}3QL1;E*abt5f6?`E6=PFpzVDd*4sDI|0_0$Mc&iXi`VI8zDTYJvt*bD z`##ar)_v-cs)0SwuGj|vV*~5f8njU}lWOZn*CZ>a|07ttqM(0gG|t+wJK;n*-@7y_;j*yU!xVW3t{vcfAs+= zsBv_l;g&};x#22=2VGjKgYXp=qhkK-qWdy5rGu{lPj*(Eco<W`3g%*!GF|m z-+p)G@8ol_j`iY zl}hyeNSt57qx-vu&!3L{###{Y-X45sV{iX|K;8d&KF7FAPmM+E3G=t(HF+%Hr+Zr6 zV~GNAeW7Y1Hhl{pHMfCAJr-bWItcmn;lwabTlAYCf*|+%$q|o%82T&oGu;qE@f@c* zlh9lkDW^OdtC=pSvbBY(cFPJ$jw<-Gr2UN_v^*OI>lYMslb5jj^Sufzqg!Lh+iZ9V zyqDm5F=>Z|>?#`vczk}9ImUj9G0SJ*DVoIe-bDHI8k6|x=BOXr)fcfm`U}B`_VJKO zoAr2ovCW9?$)Sb1ygkMY>#yt@ZAS~tl|;has(oKAzbf;#^8V=3!WAXz#17W=G)c)h z0G8gt^2WRu@6-E@LH;}z0Ivib5+&~9dv(}A8__yKUmcInF^2}^`wtea6?T$V1IwUe z=?aQ&Fhd(stsq54RCxha8gq!{o4t`3^WLnT&SY%2&0*xt+SAD`%z6AF<(nS(u-yrH z$<(U-q*Ly|E!|>IuL9c%we0lIg>r5j%8vRX;$hV^;d|KhAq3YEHIY#ZW4Wcgp;pu% zfY_(M!2kww(yS1(1m}i2LN{LJ;aGWNre6$M6Nn^wG~~iM5ewWA$52h8!3Uk? zoDzITbs@L}W<3J5W2_BeCQU)iFS5g%2&*B)I0r|Ar^dE^vm$8B9aWi0hTP7+jj;b5 ziXP4^jhK^xM|k-r_o zD9jPlnZn$u1C_xZKOI|e-hsN5)Tj}E$Qn4E-#DrFZdj7c-^fBq8G`gNM19R@qYmN~ZGriL1O}nOE|Jk4k zg#X7$e}~lon4wo<@y)~^K?{(#w(v9)N>uQtTie>WPl&U@PqD+!A%nT%>96j-c~wso z&*E6KvvJrnI4^0iB&@4VO2P4f^$YsEQyPeaZz@`Pfnlt5d%?igoVQ zm|yq=IaH$;snXLHC!&4!00z#=PX84f%Q!%P-pDS7AtM=gfb?mcn1G$n;QA|N$F-f0 zfdAj$*9*w+;+FHKhGVf?@3h)QAF=s(T}jXCU?G*BTVbvG0Z<`1pY#i~BWALU?&%7} z7yXPKe=E#Ov)O3j$SETazh$=Br@;yF>*E_p=j#4$VjR@V+}oi3!jjGKDZv`fmBe8=_ z?i4$i2KGk3WQ1F`u+g;B=eAVI>b@GSCdl;4pHKPkp7zsspsHSJtaR^bMN3!(&}+&{ zm2Xe!<;|5MZs~68?9mJDVoI<)k^{l8FQ#!1^_1;qjL`*|G~RC_{?k zzSW04{lz~sRq<=~qj)H@k1|PS-|LkO$oB2X15sx@R@0}!T@SrPPVYQvv!iZ{8jc)TP72n@$@V4&bb6Mrctx^vj(8 zuSV*9>X}T=vP<>n<(2M>tO6NC#xIukYsOzyn`VBv&q&hxIcfy;nkurFLXIk;Yu}a@ z7lP1}1+3|u#Juz7EOJ6q%uQ_RY&diMXQs3YKGs1rp&yVsBO^1xs47LjLSzXi%7>6J zJ!acu&C$%qpxTzXK`r0DLET=@Fzbp7{^z>tcR7i!>Sz^-8p*0NRK>`qyS%o)Uv5b? 
zn0<50j_Pq}=Tk_1@mwpHaVnmb16?Fg4^}CTX?#um|h@W!4Z5g|W{4#DM3xlbP z?*!Rn%FOZG?aBu-jLoFCV~JTq+5u{lFgsaA!_#UElM)AGx?Q_F#L zWMxf@z<#TXk1J}F4OjXhHru+i5uy?~6ixKzZ4P?o4b;}Q8E_*Fxt9ILQG_(#$7P#w z#j+P27Y`S=yuqr!Sl&1|-QC~&2mF8S=y#B#$^!*g>d(euEPdZ9i8THXWQ2>fvVQWx zQ>rW}pJp$&AeZ&$oX=P8rm4NDa2FW6qg)_*k8gX@E;VeL`Pu4AI|qEAp8wsrXYQQs z5<7g(5s?gkYP`-fNx!il4ayLiJ9#~*So7n~r7j_@qn|@cu@cT)*DADLzkK;TfHDNx z6g&;Ovn?mVrFR{zIX#nIumc2}=phmD64@m5R7G;yY}aZY5HiY~#&VW{s;|-F=dJqh z0%tg`G}Qx*26`k7Z@Tx*;?J;MYw+ed@5(b^>*`y#8b&NZ+-c#rp>)>G(@+<49J zs`GruMtaxBVGHcMbE-li+1i4H)!W!)`*yj}nbn&1$1k_{HjfQ{YN)ta`6_al8j7!e zG2Uz+ot9?bXtDTOQXP976hQTW=k_gf-~Hq}nMsSMJEvY0$WIg(@0{ct@5j#+$R?@3 z<~D{Z8DtFw5T%6C4FPw(03GG?`L-wZSsqO`{vzFwQnwKrDU`!we_ZJP#bK|qP&^*7 zzb_8UF9p9Uh0kF$PceFaQ{-@ z^I8r&EsmH7D82naMx%XTdOmTl9lj6~oSj>)SCW{Z@uF^Tx#xTFN=O;E?mhzD>pjfQ z)mfY%j<|EnrHCe*0!0w7th#wg2c3xP_9mi&`^g~_wIdxT#S4~$BSEPp8PB6hZkz@c|Y zM+k#XXwG5SFyoft!pcg`rn5>8IOBMXykX)r7Zp$gfl?84@lcEWnNwhKtQFEgegLy> z$@u+Nx`4a)#|O{GkiVF4sq_uC%8F%Vl~)|NepJks-o72>GKtTu?;UOc+<4E3E`3ze z@*Op@_EavMEVmIpu8rk;iCQjG4$RiYBl~F&q5b zoL>8_x)qdlqqT~tPe-uN`FS?qiz^gGz|wc~?Y85ngHR#B*0-E$_(wL)lf=>tVb$%^ z1G)?9htOkY(y6lKsqtzB2lssAI*+B{_@Mf7kheUm!a{z%>SlX6Dw$rq5gcAvCQ?Tw;UL|(=c%kD zLHwcq-%nzI{0c}6;Ta873#6lkIywAIWk5jBF+Y-JSDLW(%Vy9V;E>6VOE}z7y_syM zK7Gf2Zk}a7jLupxPz&neBkXdMbs}$2RyvSbdbHNmR>VBatOvIjm~HedEkDB$xUQHg zP<}M8D4~_U{(km8J2ZLYzBgq(PG|# zEsELimEv(cev;Vjlh@JvJ{oQi+A@?P>! zMnHqL&m>xXG0GN|FqtWvY3$t}wkX9P{L{WqARvcFb!hR&neT`_aHr>tNv?dtRl{C3 zLFH@YZxkf>*JJ`4=h#Z3t+-}e%2N}L#PiU-02h03!5e9?wx{m>U0eG?STCg{JA`>4 z1N7f}ymm!-wg?_0H%R%_J&6NIN7UZHa|CF(X=D{~iOylUL_F6f)40T2f54R;x&cmr z$V<;|^j9e+&@W!>&<`xEk~(4|tAfkiMW>V)BcQBpc_pjUqt5CJE8?dK3Ow#eN_2}< z7e2A!02Z`gZ0_~$cR%k=eroch>)y-xib?rkz@m1JKV8=~7XdYc37g;3+#9=L zj%T`!1BfT|^@?Nzs3;zY`3mw&{FZ*ToQ+Lz!|n-sg;C-t@htr`Jsw58h<)gfGcX*J z^861j=iP7Mqw_t!(6LI_mf~?h!VMm~VqvwWvo01f{5d)B`i%T-H9k{V0{~*%rg~;J z#JX(!EHm$9G+&!3$S#9aL9!VXKTJ0%8f0yW32Rz^!Iu{1e-+q1@?buWxv$a@hOffR zGBo=2ZnJ_cSDT2MWPetJsF-m56tE~F3177N2`8^G{4YX}<8LeF5I$iZ#Wh9AvFl=( zU$@0_qz{USZ(%w>cF_|RNkgZJp?KJC#>Xoph413?9%p-gLAMauh5H}_Ggd@4d|4z;w)ZeotR-^L z^PP(NqM)t%pLyRG!%~v7%J%MC*{85?yIm4F0Zil2Yzzk=$XZ5JsK}Y`8eBE~gJG$@ z?hOFY!8~0iPQG6nJE&_`sQskv%i2W;H>pRBdf=g30*sbukG1mAquF0E=42Zzf|eIM zu;Q28$$HNO>ZQVE(Y>aN7D|GAV{ zoNUI6p*rK-t|&m7C#Vf9x-DjnOyse=lG7Cx6#?@(cSg+2%;bE0b`}yp8}|lTn(Z|$ z%zwK_!;Sk@rfpc_@$FrwGTQ?U)BNd6AFp7It*xkoIvwN&hFBb1zm!$bL!U$B+A~ni zl2uhH^^i^XS=R;D(u;s&6zMcqsJN_gYC^tFz%9ymmu?GDD@;Y2LA*v%BQjSb(d|e4 zEqn{&8=0l(|I7`dFul+g<#&!28ZX|y z?X=g!ad^J*>iVc`2Fs88hiQ#}&3{q~%9aY&Hq-9u+b9MUoo^O^G{4+j<@b!kPMlrh zA|2D(rC{pm;t=YiGb-xQa2^#K$JBGXEfrrgXYBA|<4)qF7Cq9=PHnz8rs_-sxeXuD zPXXI%?|^ONr)SQ`@eZCO=)TW;;Y4m1GKAi zMqGeJhjgRyZ;9)*HMqV*V>WB#MsYnz`vpYA6O#pTbE9wt)bLqZg6_h}pJ*;u#OciV zwZ4?Sj62XKAD;fmLSL;92Q}M4I5a3+U{_c``!F@9a`Cqh@~cG7?@jgZDTT6t(B#Xd zd3WbSVh*z_Fu4zAJfgXZYU`~$^6mYdYWyz;N63eat5r;=C9k8w>N-roD%t7uu4||S z%rLS1WF+xucfE52E1z=lSvVQCTLkN|lm;HEungy=7Jn|rC^0nniR;{5$uwfy-=GvzYolO3s!1<% zgxR*N?j5DIjhJMU#XdB&#K){?oIJO3NM<;19NW-Ks(G~;kbIFD#p#1k zyV{ukL$`cfhVs4h!PhN1^M8xq{D9$v|I8lfP>cKMXkRUaT_fH@b6hF4j6ifgS>_is z-KqIuawV)K5kSt4ku{p3)Bqf#t~Zm-in1IQ&C!=^iJCXCSNBqIr)jz9#Zk|)SPm^_ z!FWLfF`XxD^#nP++wI2}PZT;J4YXL&o(ro$8Apx>`EA)boI8Bi8J|-91}GyNwp$0c z^w;*q2-V(q^{%eLwYAwJ7`2R^U!JP0$vK7fUosUO-ctfhS^UV9MN)IH;%P!WS$DJV z&+*T=^8Rdih3kx6w2DeHYaVPF^!`Qa(x4LJ7$%jyU3+;lZZ|YcVnC+%PqYPuO|;a^ z?{9w$i+MMCe8C|ialW2hN0H^bA;WYx=D{>w@PJ0pxD&@a)X%siUB<1@;r*oBEk4B~ znarQ~^?)5Z&XckZaw%N`*$4CY;TLvQ;V%VLT)LF#`JYIZaWuT9m~|zp6BAO6MBJxD=GN!PROc8qDdR4FO-I`~lXi29y%=B!d9 z^1GCF=(0OsFmG1(i@Vh9IqiH3J-h7Zwl{91_zU%jJS%O|zc9Vn@(S(7l0EDW?ZwX) 
zczmnY!=kGABh;fop62XO%**V&LHffH`+YI7vemXrmBkZm;GcgNvVJH z*TMi9!3nZX?kZlI#Bn!&2`mRf{p8O8{t%*kc=v*W4d@T4(kfqs-wdqtt87FbJss^I zp5amlR`CFM5ejPB$0$o+z;{GL6R#-l%r?Txn;akZj5$-69pZoimbN4+=RmTL#nmIy zE1UgZU_?N`C>hw{gBYiQ>_utRz-TKws-aD>k zUg*|aW%tzkdGh0#w>xjKOg^jnpuB@ytE;1QT5Olf7+O2qVxpr&yW0|CE~&Bt8SqRk6m9ix)Tt#6 z1Phk{04`-^L9q0>^DnJ#?WVy7@2{K}l~a?om&P)0 z-J-iD@RTmstXe(Q*x5^B&MWz~e*2ApT#$JzO7#e+d9!Ps>S$hl{KZv1GQy>vnru86 zJZ10bt&*Y$Iuuk%y+=96=q~0KJyYfu>Eo=NIEVQe-u&2o&M#-?Y|XE@x(dFPANw*K zEettPqh9ZK5!q-63{8pw+BtNZgpsjQA~glN75iBcv;rfqYWE5{Hf(-S?Vq~`!aCdUPP}w6MFV{2%L}>hf~y+suzmgzdBex_s`m8|__*A6GZDJ^oyK#;bzJ@f zt5z{Z5==yp8Ue-hb^3cPlk9wyWU`kqhjdnu3{=drEN@vU|Dr3PR|M*6kN#0ubFpCR zUc{aBg4wDYc_r>?VO0Np8ZRKf+rCHj5X1Qg9T|9OecZxbz>dO4H)72m%W%U2#x&Xg zKH~RP(wRAhZ+4B1Cq4(~^#D+tI_@wXe|B}N&y+@2AM=81pQye7yk^<(xE-#t{;PXP z!4e72|C%YIouNLgClYw%wRk_9eNw0e$uvI(@5T652U}Y7&drs{`Cbr@cas!w0EO!6 ziuR>PssX>m_yL6oC#Q-q+$7XxwF}Abrjt%;dS_$BJtZ!9RVj9pci5FC04OlC@(}wj zEl8b#Z-sdV+FFdmUqaAN9t_)xXR7$r=>rjSU3Etk3`0(~ouyc+t8Z-){;Q6C>~%%< z8HbQxXOqZOywO>a^h5a`0hzVw1C69#2J8fjbCDs;IU&s1Te(CrR+Bl|&!E+Xf(PC` zw${7E%wfMuU9*yeKYq+@)yyrBqqtDhuWQ>#@bnn_aPP0Q;NpR(_V#CD6uPTV*VsSc zQV&qm55j8tg$(rLsXPX6^z1Y0k3zXQdQZrEzwlh0e^q;f)iW^7_VWlEEh9$U;LQ@S zW!s#&db8`LdL8A;5<%HMlZOb=_=W<(-dc;?uovweQ{dpwg21}w%YnXsr3<@CX`A+L z5n;Xm9pB=dyyB{iLrB-My7LPZ_T>l!t_eN981@QRt>GNf2x_ zq`ktEY7hEu>Kg8b#b|l8fw#yP@TjCS!q^PJ;Mu!rPfRk#7C87)!V;~$`PgcrU)QO! zSY;1@xm@Z|5YoDadobKfrXn5fC)^0)EAUCvapWNT80lq(J;7tctdu}i#cN?o82HTn zj?Z>s0W*jqeER7aT|rK$yEEg*C+$U1OcSAF|sS_K8qXQvMdi@E;( zN5R97;(SL~vgHjps%=SG&9hFK-OxABXJxiI&f#K@Y$dKT`07Ecs_TdxzfE#Y+8=XX zLsq69DQ_T3Xz_rQ6!pq7pS~PUuNKXorZqg6B6&{znfrAzJF<&}zJSXRzV>`*@TfqZ zEh6mb=Z8L)7V;xs3xaq-BUunPH@81wVHn8w{odif0M_4Q2ufa1YHGFm6+33X)u1vP z#JJg>r4|vJ$k+_?Y4a(y(D^FmqP_i4$)|=J+J%MS+85!xbVsCSl+ulf-j9#2&Ag17 z`UNYPr-#&2=UflZatu=@9}G+U-=YhNjcj4u`m@C%voFLHvUZLcN&< zx46$IG`q9_sr(OlHUmKUA&L> zC4aDSDBR}q8qT`0a@AHH8qGQYM0xxx9@3HNKNTQ+Cfe{113&#^=v*^xm6D!{IX&^! z8kW_R>@K2Vvh4j*J4b*Zs0R4klH zkd1m_bdefznXzuD&paZ39o^2Yf~%h@Mq-~dWZ9@lUC+`c)KMRT^t$o z^x`+WY=riv_{X4pY>yq6ZDd^YsI0uuD5}ftAe3b+J_Jt@ml^s-Vf1s^GNj&x4Bj%` zv2dV@U{-+`GKC7Mhdj11U3_pJ^35n~pc1ixb&vRoJWD0*=SOq!0^6)#Yl}-7Qi5;{ zo8H#C*17&S$zJ;v1ZXpjhgAOo%fs=1woe2&;%HUI#Z#)k(>Ih*@DAfW!kf4`r>*Q{ zFs8vSkmdotK>ZBHxO@9@R-9okuw|GD2lF>=*X6f1IED4Zu`7z^2T`_-f6)d(G$w2Z zRGDxL)5b}CnfRKY>{ODhO~jOLCm@-+-=$18jDb5%Z9ZI9%*P#;dtMUPrRH#P+%mOB ze>k83>yBmtx@X6Q#+J3X{zMjm(4VT@k2-{y=|BV*m@yW**FsURa}MpreB`uSnFn@# zX$fXx|E`69s{@7jU8|mTbjiEvwfK(KR@L+4jn|u;n0@Uy<{4zV!+I>Bo^lahp6f(IJ4jS0Id6yC;w&L#VMMh;}R7D z<&_FejyrQJBwa>)KcZ^E7hoj!Oc8heO76X~Gfb;rY;%A9{>O&m204cX(|CLQRwkou z#>craDU)v@l7!k8>cPtv+UviVgsAl=(*2A#g)L+-BIH@ZmKOY#6zUelzy}aS;nB@c zo53!ma$<6x6LX;Y#@My!<5RzYuoylWS%rS2!40ZR)QDFXwtOXHIv&NB>2i(>Grjfo z$Ir`yQ@YNPHNV5?!{y=dCZP(SV`LHLh~rRR%J=>36x`kmT?Nb)lcp%Ht#m`inNN`o zzvYh9>O9L)WmLz%z^uzB1gR21x`qa+_u4B+0E9C*-r%e3LM^t$KC-=!MvMY?xH2nj z!^P;L=1)=A?;)LN_}}b4e=Dc{^$a#r*mJ*CD- ztE_I*ao2)2N)LyrD9Xj15y)?Mw9PpQjA)$OW)VFvV3d;OfT0xrwSyzc0p4uNh}R zqxWcNJkJ#Xh`jpO0UAF0c;HFLr6)S8;n+7+BkdqP-w3fLLNwYFTz~T!|ESOrnP8*) z#)2~#*HYf;+w$m3$1i?+m5vOG@G`wnWLxups^?`?$M8oO32~NGA^mLx;Ge=@qH zE!`TdT5?%F$T3V;{+PogxjzmO!}jXc#oWKf9q-n!MsGN+fQbn}As{mHB57*&L7gf! 
zvx@Bm@VLEQO59mYNuR0%N;fl00vBIxy-YlEBRZtUbhfL0xTsb70a5tv5+}qWB zba8ji-P_z;?){c2o|~+#06FahEP5c_elExD9V*s#hrZ>nNa{ zASExom+mHLqLdECU0yUeXl*a2m{GBd0{Hb0ZVs;nAK`)rGJ5-71&4MS|L`V0>xlZA zlSb4P+8g{#gze4uAx5rwVeT8gmM|yjhBPUJjJr)-g=H2h@-S^%lzaj)0)U@5IWge@ z3;reIwzMb>gv6-3siBuwUG}+B!|V_1*>Ada6pi{0&6@USCVB1jcmWz)O_cP;ji{Fe zKN1do_7uH-uc`ezuK%8qQNjlXYfe>=TMr6v`{N6x{?;G}^JP~jVGirmlZ&3fjl~cpKU^+eMqWPq8XiJVC4R~WR+NAWOR~GJNe=kZRv*P(Y*Kc z{j8>o5Sq}up2M~UpS9FCZ1}t zaQssZg}3=-g;eAR{zeADQr%Q%AFq6>M+uC4%(!LQI|}C^t0EV7MZ`h6RZ3qRE&^2j zMzkM{l_t9TU|Jjn>Y2&dq7~CTwJ;@@B*5>Jo0~iDs!MSGNPri5lF1o>%%qB>WdxNo z$bJ1U!d}V;-w_>{AE959S26fTbFAJvySXic=9-4moa=IX>Cv|#P}CFMfK*zWMA|zZ ztmClr*Ue*u2TahN#V!W$7^vJN76*pVxn18u>U?GLU{jv>Zsl#Ksr6Wkhqd%vT#mxq zzk0j7V>tY_f-yaO=26I%-Pv4Se}2M-F)~}5ux^0 z%%HfS^cOJGwQZr^)%&>i^kk*6k#TuRvt)zMUxp-t;FdDj*&+v0-9fqVVSfkru?0yPh}X-Z)ZjZHYy`uy3Qh^1q>QhvIRW-?Y>n?g+eBd{c5m_2f? z?>MAA%hF4m33VjBc~rBUOx(++9rK79C)PN;e{LsC^OoJ|7@OC#MFr*9vdY8me-yA> zRheOoYh|RJRbA3VAdnIxC665K;Y9KXBu?XF*4vbTQzol!dL%jeMR4SK?5&iX73Sp=Yie@QxH?QHtXXZ z%XE9@eK*uQuws^7x~$}iz5PYOs}R+VHrTy-@E+- zStjDtM77(8X?g+WA6`?`cg`@_<&dCp?dsaBxKZm=yFE_^FgrwvAaRdsDWG!^Xvf{y zj%B@El34l1*aWEL`g=#4k1C(nT~fvke#3DG)?C^_ys7T{j17DZ!z%S0os~X!dqDOT z#fviYs&|naVk`l;CoS?lIK*|^6>`|aL)myZ`u*pieyHWtC&@W=+_quwg7i17XCd+e zc+rEUV`C4k3aZd*`Ogiwedmx@$)cewklW_Kz_ivl4*@y-_hU4^`WWXM$UrSzROj38 zkmdegVc4``XN~aA&XiR;=U2U2VideFhnR=?>MI|P+(TBTcqq+0J4Dp<^1_)P?;IM= zIwOOfba5i#$R;1UhPQ^#l}^>Ep8_pAIY!4yic$w$wN1LnYLdscXyvGdWn3G$`0+%Uz9(DyGs;TvIHiIppU5GtVn(uB zZMr_zJ!DT#XC>xEO+I)jR+VJ%PB3p-s3-99|Vqb>K zwTrctNRCF%3Li^^%8Kk9)X>m-g*2~3g6?-6w#(3i7A-Oo;~(;#ekQ;9L338aRKeO` zsX)l?c^v37%VXGho_)-zv+{dMQ|LSa)r4&ofc(;Pu*!%3Kp*Aiwk#4~%w69ooNp@a zKOs@GO=Z`o+-Zl$^`qBSk2L0si!#+stf`AzTbI@oXUR1LRQSw29fxDWL8mXi6iaDG zFH#jNzIBL7zd#>j+EKwPrGrMx)n^#W@t*NZ+Z~s*3poe7l?Pg4WNuVMw|s7TQchUO z%Z&kB@Abpx_Gj5nft=7CRW4fh%g@s;SMyi_n+J@49H~wKZ4L_a4Jyf;g~#xtgQNC8G=*aF$H$w}s9e!D0<8jlxOR!Ge&Y2MqB@rGXVNy*9Bv1<%; zq*+wqoGU?mqnLevO0(V)kYc}YJcRE^#cFyw?(rR267wzxiseZ>qiCaxv-nOg;M~$F zo5$XKIb{O#=;BxF*ys64Cdg}ho^3WK`kG$aol-KK;*4TA}rmWPhp%j_G$hm zYSL&gFD;l$)`z5`w4I!f+n<$V>Xt>t%G9Vy0YJ}ux6ih@<1i&UVAEC}?lOz^5b+rr zc>@WsdH#s;<`)^l{iK{jA{bq^S{c)hNJYJnwAs`vP~Q3O&3uW42E=CI1%m$%*#=<2 z3OHGT!E7Z>C;3!@j*r*P0iPK)z$xf(Udhe{-FOH;!ER*d=PQ&@YXM$jlmhm=Rg+bA z$%X{R+zQP6fo?M7J`^T>bZjzRHHW?I@dCcOtiY%wFqN14PNVIVtE^$`dxcpLt$_$D zwJ*vE*bh&Md_N3gMY*o-@d~C*x=9%fF$OD?Jrl93QA~T{=uNABq@b!UIOlY3>$!fA z4HWfT;zN%env61Kzo1_XT)Yxs6mkE{RxH*SLPULsw5E64F|AG)S7`clcz=GgjvEK` zcK$X|kSCeCi0eIAtAtRRk~>j%*odlA)|$q5x2L-VTQ}pGAz84)xEVJ$L#dZ^~U6@=f^O?{@t^y$^F^`lQF}aGwvCA42Wl5e=IceBXd6>jQc(aH<{m z&t;!&S$yXyA?oBz*7WY1%M?*EleMlpBjLP_hjP@U`o!@Ya2A(eCg49xx3Fe1&JQb`_r*s8C#hFreoSVdB)4(zWbRO$mXS`C38fbYv+c~smI0f+Dg^5 zJx|)+@ou;u)qc#vwE>Rb<0bs?vqY3k-Vf!|=;He-?k_yL8984r-(^vI-w4bzQe^af z;(|8CF_Qx%T__sR?cakvBoJNH~M$z9@pkp_((Nk9Q1{Mry;0VWPU zkCG`mg}E3%rvkdeV|P4=U>8V3pyfT)T3BS)p-_O5|5xCAE@$mk%=)hx;tEOs`b{Yg zAPA(t4a6dSIzY$>mO(->eE+6ZYqiwBszB7Fr{0JmLnPH%ub!yWhZxZK$=d*K;OVA{ zKvBT%$+9*y@$B5aBOc}XK{)Ub+e)g*shRu^%i&+^cZh-`3maY={o6-gTYCsZxcaE5 zsEp!BQ14`&sy(7^w1i{@HCkYPj9KyGwOxV7$|MZQ;&LJtZYzP9Uv+Ssj#>ipfBc9* zY8aKzZp%BWgwj5tm*h`L@)?eww`;x4zq?TtkHCC3v1GD<}|-?Q9%XYpKXO?*?PDK)kH7i`4zV#`79)Td+2P(ycl}d z0Tl6wVj4==-9!BHD)))~Yp?O{Gf+JYD54wo!hX99I@bbDW!P@iYO=!tfyHm@D;&TfE+;@0=Nk{2e?o63!#P?Njq=R%FiMwgMLP z7kXrLSV=kBH_I`DaDVz-)sXi4@XDe-?Z6toPiUL?h*M*IDF{xyHE#3op))oC!ucM^ z)wNPXLt{O%n{V78lhu;N7U?qIay^rW{#_3vFg^nxo6VGEkX#+~8Ht&dZJ%1!0t|a&LXh_A*$5wa&tl76Lm$|2dQBlZ4z7s`f6qp0l9 zmXgs(7C{z|JSh_ph*20%Kz7ydjn9ZS(`n9~3nJFH!!{pCl1G`la&jo%D4E8el&KCo 
zpK^(AH;$M{(z(-zOlFOOz_))|F8XwZ1COgclDxP_&l1MK_Dy7-I$!IgrNZ$W#=-H6~G zo9L=Uu`*U@F|QX)r?{l~e8i{r;s=fxJ_s0;5d}d0^ACZ5d_3#4&q|D26>T|w%;y+; zQs}>N-{r%pm@m7}c9I%^(Z8g2`6QuE2pcq{)PL^O!#|!eW#Ks2NPRK9Uyx49!mB%4 zr8KL28W1z6`wXux2W7d{Z!<~POXLu1W9>^9Y~>ObLl!EX&P&Nb`D!(bpOV%goG>OyT-i%kuu+NFVD@=uH`^7!-zYS{EyV_XayZ# zIB=C&kcJI$^Q7jK-!UUn8yg?r!2cv3>g{%Aju7QP=QngqG5RkgbIpGs7m z5hgK>2%sa}g6kIAXhk@od)elIZLn_dJ+@;HMuaqO4^wl!T*MbQu4PB@ikpJJ`x z;&;*}47HzIBk-`FOv&SR8N8H}$syti+t*8PDQ(>jG%Qd0kt|i8GrjN9ejwJy<&%Q) z1-kUiKlD!n9p8G^m++y^9~H z93J65^Vh`GPSanWNc_s2H;akw=}IICW^adSw&I*+z2bg==$3|;& z>YkoZg5d>hy~p9=pQ6G9dJna3YdKPW9*`(o#e{K zaz9;h?aKByB>u`;yw)X$O31lAw;wRh$%YlPKYt!Wz+UqrjvZ`0LK1jl3j#kTrV(Xw zXg;qR0UV0?466}t$%TN&DBxep|34gorE;5oB7X!DthD0M@}fSk{%t;;c^NYJ?@*$DvDDK)gszfU1M>*4{A@80!&mk(2$}I& z&^SI)bGWMxwfKLGy>(QSYacd>tten2N+TTt(v6CAGlawtQW63J1JWoUBAr8{#K4eB zcNlbccg+Apcb$9Od%qjs_dDl1^Vh89TJp?u$FHu~9(vTfusRTIg1=yF@FsI~d;Os> zZql5obm@dwpxt%>?OiAaIQj%>4JjKOXgc3Zb86avYsiOJJ`a9_ahPM{5}1+D!b&8e zPQB!i*M+5q5t=zP|m+(tD%g{KtEL>mvV`D0}m0>&M>;#&7o4 z44~JZHm@Vf88eYtjw{2)LP2zVPCQIfSi#mZAV`}Av}f}YN`aOwU5C(S0J&58iu=)e z!=rbdBi1!gjj1b)YMoovhZs(@Up;ARyp+L?1nR#YxT9wrr+oF8a!Gl{G zTEVFbE8*{lVmM64Eu|HASCs}gvh<`D+IP2JDP>Hhd#=n}_cuaMj6ZsuyN2keut+~$ zN#Af-8dY}q5}^OAwV|+Oa8G2v_0g{|_8+IF1&7ap;$9ND6KyMQy!v3)N|8n?X6*G} zTG4%d@-?A+!KWwLy-$%|n<}fkur`gYS1%X0Z!{8kI7*mn%P0Lh{VJB9z5Dq_{7%yJ zSVJT_s`X*;jN80SK;(xXKb9`2MHtDrcxW}km82DtN1e@zXN~u+aJ5*J&%p(W({i+F z>BUms$MCW=hXg;ePdpL6i(dG6;KZS>l2@+9A+o&ylxoPbRzg=K1p`(ke3&mFH%0O& zrSB2aHEn1j>jhvK{MT*`iy?yd5GEAiw1gE~3|yHmtNn%FL|5Ts9O5Qz_-sc1Q1hM?9C_di zFc-)>&i?EZT2Bm!>&p*%+~RgCFA)nzDBe6Z)PNB_{CvSsW`gEfzGOO?}|?9k{;bAq4z6Tx!}Ig{X{R%(y2zKeApGj zEZNMU;=}mPElb7E*tq!i$7;TX|giJ`+n&dt&N##2d{BSZ}dTW()J3f3c zo(E~We`V6}O0aMjA+;M1zZ7t*REl7sh)ZHks^*hbxswCsuuQ`{Zq&3@xBB{Nd25Seguf}Gd82!3 zt^jV*;=`$u%I@-(Y3&0qZ&M;h+)FfNSuo}U%VaDo>Qt$Fc(}@o()xN*sYq~qBTM)@ z_XV?6YJ+vcOJVWLcv1i8IneR`7BLsc5>x<1F7kiIO9rBHb6@CebzLYHyBeVBQ(4L5A>xzRt0;awS3Rcg8?`^>iU zPZkp>ARFa#cS=~pqbN?4oZS+v)k^8S@tvssKvr;xK$MA*mUBDyTN(z+E&m}bF3#fO zj}^=PJh<^l+^vI<(7QI61)9|mlIpb@TDQ7IMA`9*$H_DN*ChldX&|egBLtmn8yWS} zqS@o7By`u$f;W;ym0RuCKo-2f>y6A+D}AH+I_+)wRu$n+)zs;CkVf17{&*^`|+N9@)|YE|rk_Xxh@3bML!ZG(`lpb2a0$ z1|-@KJq~$)h$_}M`gEMUnC7G(Pg0av8VDA77>Vb<&9K2j#u(^guyz`QSl+#3(`Jp; zK^P;R+O%>G9`qL3KU{Td7rZZ4cajTSi~h2)z)FstlmhSLznDW`1po=?Z@Q4&im(C? 
zu&h`b$QwxUqot)4Pjo*pn|YI51~Tfjf}RnW94zj))cx>+_QXLHXcXKpLNk>C!Rizd zlFf)orsv4-5qCoQrJTlK11Q`s1Q6EP?<)}e;H7JF?%e&B7yf=r@CS^mvSxG2Rdo5E zM2l2R@~R=h53^D#6tPn^;RfQg4g15%ZbZ_prZ%|<%}Wn#hbl?SUbU+>DQr5pS-gH7 zntr-IFpVO|oabf2U*I+}r!d#kvNcK3aoEBuTP|CmWn@##!|PD=*s`{@&2u9v8(pT( z&v+S3e(CGJC01#Wc7oJgMc4qjG!58>X~?mh0dFY}ih)!zc(E6?cf^H`F3ntDz);=) z{8{SY@&~URrop{YjlBEi64a=KF~aUH)QaWGAqDDQM=MXgq4B0^{mJe;P^D4in2$tt z&8{DN>?IcR4mk$nO{|HdWKdyQA_uFt@|5YB?t8fYDzaVn4^S zLu1L@=-mi%6HiW+>E`17WYzOt9QbuH=M=N!7Xcl`?xAp7k;ZWlzyY!lMpXQ*`P34( zI|JWK*{g}BG)R9C)C1VWI;J`MT|*PL-J(4_i(3&Ea|< zRm#lQ3m;!oW&(Kw2QypO3suvWZ}`d%13_aE<0i)#_6zxUq6HkGP{bQjg4 z>5DRkNc<>OfEtqe z@$s`*;Qx|zD_fOHmaw$&I;{fNyLXayueujvEpode2Y}rMY|ipcgrww7Jo$aYd088z;mQb~}N#2r)hr`n>_1_X7gO-|%nY+=%jE>5e%Jr)dx;got zzS%d3vZny>P87|yKYAKwJaJg9vrlnqCKp+~HF7#zqI2x8*OGi#L$yC}>i4axBgX^g zPh+wJpnuUA-aen*{{qm}{uB?fy@Aa21&%+qKk#_(*GU&;XyuS7GG!wXI2s9TeJN;F zNEFT-%vCz0{1_1hy-IgOMA+SRfy`m;m<+6rvWz{Lb&`r5T>8 zS-fF!Z^l-Y5dmvC#NN)uu3I~ns`N4=tP$0eFP~6$ar7Uw0ami*9c6`8g8~umr2rjU zqXOk5Ip(O8n1ZyDyovk1S?E2RS8O4;V$e-%Q3A&2uxiM=-`)S_3D=j2>xym3zOl2>y;=C{f}#Al{IrZ$AI_i+|9)8@ffE4| zPzv6i$+&cx?Gc%fG+ebH$C0cZP^a`= z^}M3EOdVLME2eB=OX|b>oZ9DwuojBngp#|95QuA1GizT;BhqzU6x36X{;$CGV#vGG zt!>;mv)n5LBHo|X9W0H~%^bpFg^xIy_{Irt-I+P&0AgxDNU71@G)uri1@w$!#&@sN zKIhg%=7wYnrYgQ^WjyU%d39;jR2=g`dLXdd6rBb~nTMe4f}3mCf)FefFEqksf&KeW zx;@21k$dwK_ThKO-xiSjm;xmZqp60ltwu=?Nd?hxx1|fYe%TMkHkuUkde1h^2B-UT zp@&~{jnTcx^79X8#vCUiOD5b$UW~t;!Q;1^4`6Lkl<5T~g^b&!e8&hUq}?!LQ2UAGDP1os*Z3G-JTH_w?Y9T4H5RP&fT& zP}Na8XV~b2bV)5nWhgADw>oD7gVCz6Fg}M`XpdAxdDQ*I){r%GyFSCCUR4Z;XpQcE9J*$w- z`l^ImCqZ)QikG-gUyiK+g;4(}max=#9)4Y2`U`GfPIc6JoY*Q^HPt8Fe04e9+d;J} zXa9fB3^KobdHOFqSU}~aoUsYMqUjKD=)i&*K>xFmvqQ+x{*&tVeA_t6%1w2k71Uek zP~mNOC~T>JA9j$PnaQz}w{zGn%l2T5-K}@WT+0q*bhXcWbY35X3JrU9hfLiwz)CAb z#%4OW0%nkM_dOjpG2g^TZCq6=|+bH*!amHShHZtqbO~oQpIo%#??` zS9{-kV4k&Rzwl>$uv#%nq{Pg!l|ex>ERX4o2NnFB518bqhB1(tggbO z_XC_Zp0;1Ae{qRnGyi-=wGSdKS6f4!e}9HIa&7-f!V(Fb=mAVFi*b_g@~)%7P{L59<%^%Ai>fS#K{o}jcq2l$Q zm~+GOM2cl_@A>T5adYQMVoFzW%eHs4!4UpGA3jm6^OQ=@t1O(EQ9#5eHf5UK+nWg{ z8A(q~T`Xz`eo1p(k`&zIhN&KNX^0mZBS=p=_RxL#C9-z`feAstJjti&uB$CQZ6nYi zm2&C;l5hCAhl#NVSQ)d*aY3<@y}gpu&uF*f*3ZhW^Zbt*XsF7d!4Dfp*B@0Yc54Y% zUVvqLEj&yw)?00K>3!N@&%x@(XRR8;FOYK*L`4mZ64>b5Bnf_iLZ`+Ysj>z=On5X~(*1d|6os|_v&d6edEV}dhR@s8V9?NO`K zCvz|KmcH=8cQ3Ik|7{%w!~p&QW;pf&`_CbuK5@f>opH8%=Aupy`j6;?wdgdv50eQ zd5Q{=6K#8?Sc$4MSDBctpPHf`4vq=A*)NAHN7xc$gfnlZ*;a03)C#;8d}EU`E^r(4p`pQ`>fQr+}9v_zazy!MCzppcvl+;u@+8 z$UGaB>v+@6Ak>USV@_yPC?35de(Q^STdL!xDc>Xdt{K}(uP9W}+1qJVe)-;t0HT~b z1hKyIKHufebq+F;sm#R~WDCZqv*bmLvUKRK5jr84?IgZ-msG;bV&x~#K zMULl2q`cX9UX+_1R`9TM-71VY?~ny+#t=Rbpe@2IOYZ#Qc76e91=qV7mcKb4vx@Y- zNF7iH5Uoqx;d(D$h8+zpthh@A+!Wz17_uCaA5u}}+AH{%(}cs$g#EF&&#(iQ4FDOs zQS_F;#~GT@pQmSSpLx;d0FQ=v%(v0jtx^(XChMb=~48yle&p8$b zk4%-E%E~bV4hIfTD|7Yf87*!A!QmzGOWY~6gFJJ|#HjXkB<^z# zX9NXZ7rnQ~`Gj6B!JYN6|B8`nfr)(!y=axn8BQBc%F_^AQdJEXL0@1VDNp`(@?1;r zD7A8)8kEj0KR%lYD@t5QCLjlldZfr?8lh5pfl z?od3HB{e>JZ?m_30Iy(FMlSAYb_&a}Gd5VI62{g2P|^3o^>1nfc}(CMf14$b1)GCx zR>RR)^fN^sJ!Ov%Almq0I)7|A#4g4ENQNNNmT(<7OlX#x>GO9rRRE(=^g)>lB+YU~ zI3J|=G)@|Nh5=Y^E&w307gsG-OoRgo{$sfV4X(e}O1C>{?(|pSm8|IDQ2K_5J>mYFYrD>$Q-Wziz+;yiYe8se>Dbi!2^JZ5G6o zijjL9y@?TY%pKQC1yDN;oc1e!v%Wr@#$T+goFHQ(ELd3Tq9km;WV;xY1(}KF<`Liag?sJ^>g<(bN7I# zCQ5wQgL)Pd8Qym$%Jx(J6J@Cb&TQ(Br0*kU^xNb4%ZYbDHqvAmEbP#aDrtM#>~oz+ z23TGK_Im9+aO~V_JT1bZtNvY(`q*7%qhT^#fER5dsuXrx!*hFUgIuXM<$q|8#H9^9 z@3z6ssbW@CIeZou<)YA{4Bg+9dY^lqZ;s$Xe^!w7Z@s2`#JJhyIjvd`jn}`W|N<5?<_u>}4|o9+q$ zc|>vX6cVb&z5zUHQ?SxR?^;UvEO^TY^gPDis^$s0?tQ)0s9J}jz zOSUUvX8Gvm+hwnI?v`0eSL=wDj+Y#*q~B`{dL)>Z*CO&NMsqeaEG7+z7i`o`P)kmf 
zteV6N{d%M^{#7pcW(FV(yI0Imqu?(dNIA+n13N5$2|mm=ln7ScCAwc!PPdW&Z0U$N zv%kOpi-%5VXsE${12Lbde4IL^kB_+f_4f^bd*}UCI{cA9d;mr{q1XwGXlqT)q7;9S zjEv0edIJUABCiTKo9v*o@|uBfu~AHrHY`<6TDptcnyAGz(WyTA-qFs8H5}A}JsC;P zXhtlmcf7e}wC#57zTv-h=auSsTixpIcV+A(nQ>F0F)aZ>;WF$!YyHJMD09nS7h?<| zCcG3qee6K316@J-?33359bJu@xy;P0oLTlneuYi>vO+wYq|o@}Vu~M8wws}1^dCPGV(N<;lC>7!J zHr2oH@A6Z%S9tdPmuSJYf$#suuN~se8_yV_cYLJre^K1VJ2sw+K{S(#3BQpB^5(Uz z#Y|AVEJhYC1e>F+0DsHXPPJXay+?KEm4c*v!OjRXRC&I6hS`AHTjGuzj@r*0FSeW$YHgVoJjKrCn^!NS3Z@WJSCmCbBkUp;d zXfnwK!=JL08e6QbS?yW|CfMB>UZIhZw2l*QCFJM$^tO?miMUoigGTJ!dD>C*R^i<* zmI?<6jH|zD7X4Q%o!MqwD$5dfa8Fow|Wp4Ym^*K=7yKsaCXp?0}v|EOrQP z_$sU{g}e_U7A95?UlcNSu+-9!`qI=D4Vf2u3OmsL5hlz7cOibop7bSk9E>~M`rvl)&!^axkShYX>(f} z*0cI24-FE5Qir%{fT6?;)&%sbZl+S&9m3B#peydWAUN^Z9A}{fcnCRP*CVRDwc(h_ zIo>-YY&!&|T(6lC(r7<(#*;C{ARUx&?3ZPJk-vRXcEI4PxXG6o~VLUmP*}YJ}M2>R`$k9v)7X{IXS3Vu|-vqLicY-IeH`mEihh$lSWTX1E2SYKDT?}19Y!BU5qpibj-Ho)o$EyW%Izo zSuk{{?a)4L$oSfYACWu;EkK_eWd_cC68n34H}j30&{5Rb7<6#4Ze7cQ)cK=;g1ymiHThXu5YELMD5L zg@ok~{fC7Vom=F&w3+GCZlL>bLXX4#_*QB>L7@orGf-V$4_Ft>s%T4~ z!+;||c@bfE%RIeq`IbA2hyAF)BPaIEAPV3HqGOv!L&uS}HFWnMNt1%I2*uqwY#aII zmt2A#?tGxIC9iabb%-Npf02ITIpIFGDkDqIgB6mTET{8=m(ti<|JxTvDwBM%4j z@T0yjM02YG@>*Bdd@U;2bBEUsQNm97=b{hj@ zXVnBoHD8%NL)Xf`4`p=D4A*w1tVdqlulkFVl!9 zV>$TM;1xF2`lbtV>BrwA26SC=9`lTpzDW_<88ieo&^ZG*(Q*n20#w3}+Q$gls$Q1m z9;qEq(?Xrf!f($?1soR}JirYGmU!1iJ%Its*O`#z{k7EJHhH^?Kc=nByA>0EO%;jWRi}H2<97uHGcSRh;JtK3&Z$} z%5%PFD2_g9c!v$^)B$|As#xny(h+L(o3cj0g>#7o*y$?NXIMojT)9_kQeY#9mx&s< z!|Afsh<@^9w^QFs zpU16KQb}x|5@Z%88B3{m2$NL}nz7Kq`&qdWnSyhm(H8hk5%>oS0jO1dvN=xp4`d3E z_-<}CkAyB&)5VDv*uEP2Hm~4x`tC4oVx?c(iMtdRMkxIQl^^Xb;wQi1p_u+Tb;h|3 zcsOleLQsBQeciis*;02|o4XYHdV)pxYv>5;%CVW3kNdmfys>Z56CFl{376CFHlUi( zDKp&VxVnyZz=jV&DAz;y>`V0w#aPj^+#2>m)P$O3OM0~{@>c!7b}c~7ix-qI&tW01 zQ$9pn418==TVd-0V6v!0?z;9=v&=T_F%JaFsOBg$QH;~BCoDYHy%lP=$69cbQK&)Q zLygsKK?&rDIsM?5L;;oRKFXeht_KRl_@z)4%@A6ZPQ<{)mJ@x2pD0v8!uKE^w4GgQ z3AI{q_Auv_;10Q!0nEyUaW<8N%Xa=8Fk{&K^H&fT4R!T^L8pSkUQx^7$uzpM0MrP@ zzZ1f8RzO9xusj5`J0K-qqs?`8eMG_f(C1a{#BS~Epp}7kQqA5kwk;_d5fNRhiWMI0 z^+m(nB1)w-^|DvStO{3c=pic$7#Nco(WmFTU~rIe2jm$fcwUI+_KfzDRJ+#xRF~j-ipGyI*fa z{R43K|C7P`b|Xl>$k2;H8NDX`?4)J=Hw#A!?v=GWrPl#m8@SZ~bucHV&M{300tF&C zTk)OGV)z6JGt|KUapvm9p`!;I>*r|T2Bu;)3Y7Vyi|h}id>GUkq$vnLT3T84rx;{X zrwuE>GH#4^j90nL0nBYAxcI6r*X}3Jzo^K~&Y#Ju8Y8}9sl{`kjuL{tKOBWI2#vw5 zK_K>5bn%GYy32p1$_>gm_Wz+hZM?cHD;bY2&#n;*ZilAV0ZB3G#NiZ4^(!Ky=>T<_ zr#>dMGI!Tx@l5uDgY)Gfc>Ape4ZXdgg1;_ZrqG3gVf}f#aMxubPwFo|`jH_asZ6 zx%G^ZFW4#1z_c|!+AFB$c*PoKZ2jW{Uik*@4e84&P!04Sf8V8BmFL;1 zTuWEIZV9r?_X;SkEH@A#Rf58?YR)o7 zoAYfOK5VX7*6L#?Fry5ZoW)Kt0|04BVF(1U@otjj!npH7N-{E^vY{2~{n?s#dUtT> z5;xokK^`Gv5yhu?m5lDn^L)ds?b?R26obyzis<17(Rw$AL*J?R$>zsw3}bFynE0_H z?5h_I^sz3bS0+Jy7(0r5WSi}=!FTjpc6Rpt$41&(0vcyKb~rbgz}B1v6=Q#-hP8Db z;Dgp)Rm-HD10C4FH{k9b|84;oj^()RMFYCbSnNbZM8y4f%?0w*`1;d(i=K8@ngRwp zFTC5wqleAiF+q3HPtM^M zea?$OU~%?)DC48yL~e{AfcN|E%CwlyIWsxaHJx6 zm;7WSG#i57R;UDVOgzXy=1BqXHDQ6>+jqhxZU|OmsoC)fd|D~YS=mQ?XW+-c&5IPY z_d;=1GHIi2urnkqjWHz_p}lgUcHbbC9ct;CB!{%_P3deE-TCq}-2saY_#Tv5@lyPN z3y8Bb$`&O+8>A+dw(XdthUnjL(4xGLcTy&;6yZSh<~~eCck)dl3B*6aBPuCV08qHv zU?*dr+H656IMaf`@wB;_N@~g^wZGM)NRmKjT{?9+1(N234NgL!!C0RxAbb8Jegi(g z;#ho`dR|uNFQ
?No$$Mt-?)HaiqUsh%-9TDQRnqdi1p9jnVLZ)QN^Q#dMf)o-5 zRHU2=Es6IiHw*0rX<6lJnz=a?_d9i|r(M+nWT*u-z^kYUkP+TzUvQpH2@h0ok^yqVI7WXJ z`LBDK+2UA5%Po9#JPh7F##jHFv67Zznftc<&i3$CcCsZ;diBiB=B>riIKc^*_GK=Z&M?L~dUV8u0N3mVv)_S63D^=^v zo#5CJfIk!|G;tryJ05|VQDiR-+!Z!=-f4N0sHx9C{eht%fT|4ZzWyuH(u@)2;t^xx5KRc$2 zkx%WB+7QZ1JWlsc{9v{Mzlg9J{*6}7dZxR`^Pwh8FZOp!Si+J{e{f4Esayi37hBDn z1few|*`soe;&{J-KFaj=Xetr~&z)j=8MosH!U82=KhqeS=aFlY4eRAHXEhtDfxN5H zPUidButfO+Z0SyHyveP=gl#Vk$O0Ont_y~X7jX63;^vU)iD537GD!hJK_DK|o@VH) z0Bosgb96NfK0Q!i*!f8-Q^hQ~Q##)g)BVB^%-sO|?l@btPxvuip8T-MPGjy?S@&X0 za?&kLKm`ng&y50~jVE~Lx7d6(r0X+t7V?tW zS@R;CC#B#z{Q?G(#8bZjN2Lj%5-eCU)tBo1ac!y(&{-Qf)^06V0nL8YF5hlJ0hyH* zjs*~Z38yr4=B&`%cL%}oNz6=tn4;t0c!Rd^ zeRDIju1Gj8cP1u+sWanQBwOX$`&;G#_NI*>o;#t^;2*xnq1~96n6?UV=)$m@)7%E5 z%2$-8u3^9qOHpkusD#)ZKDf!h(!>j16zx~B)Y$gJpe555u@bgE3kiCxFlCqF&gf8i zKH4e>4!oI!HwK3E(W4Lar>1?dFKzF1sDh-FVU=^@lqmRN#^i<6r?||iM-!jOo z%fYZ)U*(S1Ik~(3(H{vuzZW`ODBs5$<=nk-gPpUrcH!80EMdv9gS|E2#Dx9W7W$v; z?LG_e^}0*6Ab2#5zKb+0*2vR+9XQO&#FYOm7ywmObagQfrLX9v51q*Ra!LjcyYf7? zx+INz5c0N@weQ>;{C2nZaJ=hR$DLbL6$Eqz%T6*i@p|P;tRlE)I8u4IwNEx=H)rb< z$IROgBqd?ir}e`58}H=oOcv?nq@ou0CLR5ce&!^x_pQYVY&-Zp7%@@DSUEgz@pGJb zoTW>+T4M@b2&{I8AUe;OvE$(AXC#9aat`Ae8qzI~H0ZPsC~^3FL(u_HBeX=04;ig=t&tRgoavB-W+b$`{>I~}Ln_`Mn@evruX&f$5RmwGsu!8`wa zRfvp?L_7pg)sLh*9c%(Bm%H+fTd>JXSa`T$`;8jV^U(bE(#LY7IFMrT9c8!Y^qmS_ zc#YG#@c!ItdxZ}}pU;7b)6B5|GH-qSi<11u;=pZ(q;6qK-*nc{to1vyv#TVn4?>-K z=$cPA8+G<4Jj$LI-(v|&&KsX|IGS;^HSVDpdr){YtvP;)e|*F+aZi5`^RjtM0pb&k zD?Ih#V5LUQBSuKs4XQ_(Le;KT?PLgCiW0>R_Q;;d ziH7LRfi#i##U&kAbsGuKJe)okmDGcWNF-84Nr_T)uZ|!U6qWbbbV;|A(mv1^9nMBQ z($2<@kcG2Gmh!Dqd!M_J;E>SE8ZrE?yN;83u&A0AZCGIQDaZn@-R?1a0wYFL4oMD3 zjHn5;AM&D-qLMoHlKFd9o*m;s@M=&!1xEF5?Q4^tU*yR478G!+gZ$1y{o$iWZMHLEt6dA6`6khSr2S$~LpXkNHD*xB;6X4$1(G~gqYO?? zPU;KIUp|m>XP4>Mop0*sD0v_5{A#zUwN=jUI!;}30z;L<7ki&Wq0M0EQbaLbfsG8Q zwH0jIKUoOAg7>EETCw9^A5F`84y!@4!}S1 zJ~rtyl?e$9@&fI?;&9Q3c8OK8E7ox~Iwr0!%+w`RBMK$OWi3s8`MYEVmDGbfA|Oqi zsjm5}yIx@T66SY}ojC6%l+6B0SdEGFueNeem8u%G?DCL{>hLh|(ez28s8M3+YiNBL zLf7#sl)k!ZIGdfg51h&)%aG8}TU)FHPiXY@a1@rW*ZR4g919YVfK2RFZIxY3mW|0`@orrnfc}Z+kAG2gK7twQY{efdqS9ZBuh>#fm z=r31$o1_}RuUhG$F!gbSc zIY}>I=dveNXXv;f!IneFO23Byt2}E=Gb7LX2kPfe-0ts>v`uzP(VKoHT6NNgQ{gL6 z5k@K=7GMm}66~&dGHKD16WV9xS`|UQ0E^#^@1XN``=}zHDD-XBGb3SP*cPHt)5Xem zI!$D9MjQyo^9xpfTx_lW5N12)oTUj{FZeOJcW-3i?fs;+Z^&}gAVyEFBRuNPdz3!j(2}BX~N7D{VT)_E|?mif#*q;k!{AVMkFd00eQKUTEIV^9p8-> z;fR#_m8z*C;uNHb=jp&v2;+?BeVsgd*rZy3>)~&U3AMJ`D;ENxgf94Qdt^s;xj4qn zC=7P=w9eiM?b9&pXPp=2d%8x@rr}*Ppm!J;@4?oeeSW4s?eWI+$!SOSnd=RQ{XQ_4 z)W14f@9Y@c;y&8#KBTH(7|tf^YPajA8{EoEe7gA|W0?KldR#>Y>vnTB|!=l3U z)$ily9nVi(;OQQDd3^x=aWy5)I_B&}jdL061AaqA<^Ag8wvu0o;bjy9GPbaR*X_aZ zpe(<6;vbUmCh38t?3NP9Y8W_E>SJxL?=AyuY7X-g)SI#IG2wGl@kkS&DO+@c7UYtO zwRv%IcmIPw{>Fu8;G6{1&pYgI`vuRHU7(+1bKgU_PPx=9^H;BrTIA1BOhoFPU|c)+ ztnpOng1-4MR*WkbT~BlP9B-?IU0|K5q>PN*JnL$zz&FyiTm6TvngqlVTivj3&L_y3 z%1w%RDn;S)+W0z~%)BDEQRl-=l8AB9tzwIH+x6)NlRpZ|ci2+{9T$K}Ck}3xAGP_f zY8o{4P(V2c?0X)MTUY9t`mhWL?Z!qNA%@GLvf937K8+F40$JF{v0uB<;dHHMtden~ zV$axiIh=otx~k%p(k3<|>pO{!3}qRqc@v!NO4$RR#g1fY<(4yE#AeZg-2Xg+zG@F^ z1aZ*Oc=e2M0^MQH@HpeInxoH4OXKjMAdG6fLvpK-$Cx9LOvhS$F#A|r(5o^hgHLjP zE9C1{3>Nv=O~rVChHk_-mF$ty=aMn39vfxCMshCs(-?kgV0d#Q&TgSSmZ?-rdqnGmDO9|7;l&D0z3#C-9aFk=;I85vUQdAg5AN~}xXM|v~8o7zm#$gB+5z^t&N*vL=` z4&ybUlkVT#PK$fiRn|{jOnjtxc+!5To-;bENPgOegjn;QA=ZF&>LPe-K8NLKWI0F+ zHS1*X)G(uEkRWnlrq{1u&uih-?ersM$!98|eud`{3>8+1jHbWO)siccy884ua-a=2 zl*KN0;q9u3R{TMW0}%w_82d+xaX~zUpRrRJ9G?>s0Q2)1*?7PfDca#P~nXqUx?8FRm$>>CrZl#+bJK&-2}I z`oILcQ5tmN3cJ0vsxDXLYE=5k`VSV z{!U~5_^-S`Hr2|_&D}Kh!`Jw5X9fIPoZq)LoazGu{m(%R^G7S4TwcP5L(z|~ZBW#d 
zcV;gi9r-WLnOvtxWPqfPTdAM93UTfO)E*VP$B8QhFR(<(*vYA^?4GCSU)S%`Pu6Cf zYBn&3!Z_SeO#A+I<95?Kt{~54-b^3r9Z}p9CgOkO5yRmQ&llHUEuFr5)8V&Db2)6o8mZLav=tA2_99`r=%UU4NGr=dv1o34SJk&>SQp?lxP zU{8hw?wBjtY3+~}e3yDNaZJ!y@=3ip$L`9#pYk#mp+|wc+6HoGpXTD_r_BZ-07(HT z4*7-+_*g^?Rv>_ble4Wz%#c7WSGz>vlfQpjdHJx;^#Cj4RQv0Y;qi0U;LzHM=0?t< zbC){)iVd+=34w;Q^FHewC$0CJa?2jtB{Su08qTn4!mC`12uX29paDrXPj7d3x8hPa zfpfhPD0&*}>fXj89oNUywLUHuIsDFcBj8@yu79}k551fykjb}lpMBb^-Wi`heM)Rp zaBwI_vXi3G=+x|NxgavZduu^q!^a%-^pfPxi&WBzioB+`f@EZ62gz5nBx%Y-J8+fRojhaaGZQ`g zvTo{JWc;BDb}v3?8mWr z1VdQ{mGt8mKS}vz_j+{sW7G};=Q!%iqip0%{3@9qJp@zL5q}2Jf`fDO#zjWzCptRu zgvq_^b9NoVL|<7WWyB&Y99N^zXmh*E_lGzTG|kOdIC9>Mi-<*dO+PY2r3I$(>RLrT zKJgwsZ2PY~#8V$sjOPFlWp=SE1>4cmta2`jh>VP}IKQyBK7%+fQ7t&#;X<(>Of&v0 zoK6SDtcVg=4Xu-tDWfD{b5X+XvIl4122S0x5RL$|QsoM9_szJ8!eY#w}4B#LU}Wl*7A&S(P$g_?o-N+__Hw+=O&;E{s;&f9ItT9 z!(I)=R%1-Q1QbyS1fAXMHScK`oTi8bkSjn>LUyudwKzR>-#0;G~P0Y+hSjVq~$SXQ)}JvLJGu{d{>6VjGR6mDPD&F$RO9{x45ugo&>D6R}0L#qrAO>Lvo9jxuN`P%*1# zV^JS=*VBLgd=5EBt?ibx^137G+o(J0wX9PCwTxAcpmkqK{HfGEb+Sm>1rOHEWDny@ z%qtD1d6hH-iss?IfOJ*1ly~Y&qtN!X`j0&R$s&m5RAfmxV|4zXK4oLVfBpKU&fEM% zN>LI1{{8u(A`AH4d-r72)uTXbiJMzZJx8n0E}?`q(#?nEUFgr(*(Jf>M#Z+3$1Xu& zWkbjE2%XYbmzY9ihTOlW0Fq7TaR}fXYUY*7>%6)ErUVTB8Z&l@RYF89h*U5 zfh-#vwsviuulAHH6s$&y+XO&+qTiZQr9#blNrfMTou%9No5o#t-5{4Ve1knr>|HD6 zj(>E>HdhMfoGpxSYS*Zx9-J2_J_MWCgxj9@gHDZmRi6QesOSV*f?sKeHPU%@b~YNE zzNJ1#5;<-mI-`!T!Bm<*3VWQJ_#k3D0j+7|Wj0RE9^$bo(++YAAxBED;DGJ=S2hPJ z78?gh3BOMkyr%Ezk?XO@QQD3KQ>L@)j5GC~()fOlXG+jS|F<#iV#{w5j`VbNu@>iU z{It7L-ry`rTcia)M`+Ai@w}m^r8FzhWRBIC9P4oV?}u^}tP>%JjWzxwdgFWtttxf^-!l%x zoYWDx>jz?Vzv{m$yIiZZB5GT&`jA3g@o}VoPQtAITUV#yKw?N43GXq!Uy%QwDcbMf zboCWn+FAtyaglHo-E8Fyz+8S`J07S4#YEGrJQ!#*|5$h@Er-N-I`wkX-mzwZEnEox zZT#P#1lFJLAy}gV(+(ar#|&FAK?gsMOwfV7oa8Ep3$F}3yo-}Y06CQXp|(Geg9!nd z=dLtXS76-4%3GiLOnsy@zWOR%nrtLzs!7;or!kh>;{F((x2Gwu3~yHg;_IeeYY9BA z?NlzF#@)3mdsMukp~YLX;LjxF2_Lxj&I{yf|A+v&9QQoQlgX;B5%2=yV4D|EmGiFy zs$+Uquy>s`$wH2bm3377wVBptvu8Uj|A*J(O-bu{OIlDlkTLs=VNilxp+!i zw9{FeMksWnKyl?rG9ygKs--n7Q1K%e5pML%-oLf#t%3kagMHH1o{>FVCFjtH{)fyO zY>^-d$lk6w1;TbNhAC<;T5HRC(%_)A;d&`UAk^~m^6Ckpp>-~#62dzdCx8C`?d^V$ zf8~JlTR7h*jD-R$pmP?+2=0Vm_S0(XF-s$+D-EvKLw4#wj9q;*$WYh%g)e~ z`Cq%U@EXC@d&#QmA616x-+1J&6z<+kLzjy;2Ge9ZRBAfyu4ybZ-65%ARXoYdYohkCn**JdGBU-gwP>OBmA;KN zijyzPndRoVtPaRP(rW!*rxE-2JC(<)-v z;IjAqPdm2Hsry%P5GV%&WHDsS!p5RE=|MB)F>|Qk0$XAK&JVanF;7jR%6uE=A61~I z&2x}m2a)o>WJ(7YIs{J#5bgV7x=HY?(HYkIshtWDx)n;}Tc!5jath|eP^Gc2)D;Zr zOB$)Os`b?`Xn0IRA^fx*w4hc$^YP{cb-*NZXEbQg`%wA51QM|bTdXn&(z9{*ullbV zwfr1qptA*&=Hu>uz5r~j(Ve%}d3*uCW_WjzlB zcrxo`rKLjwkFLO0GUPV(FR+!J0yZ7e=ntip9b&q4sd)umYx>6CIL5ag%<2>nmu5w- z#x?o{(>JM;mCY5|Q;O>Ov!MU-%F;3- zKcB-wU=5&xa*BA>)YMc|RHp4du}^}frd(of%^0J(P^S@+pLdh~Cgb?qns&F;5|0Nm zEh~X=xw$c)!p9p+;To;xHfY6o_DIOasM8;X_afq{=t}0`z!x&2YZvh2FS8^8xs|1& zo&n_nPtmI?mo-5ih>QoxtUNbYwS;d8xEF9g>in)A4dUh#ZjL>4e6G;~U0Ib?5Q`k$ zKC}~VNG|8J-F&}l6`^c4FuS_~$@x65U&PDPyxl_)8wml&{g0n`&<(mH^zAw*{?S<0 zKz5+nn@Ot#yc*JS%9OaO+CKzde#=$>C)u5j5r#axE$K9b3TSg!Ba zz2wSAJMpc@rRrf>hT?xWw?F~YRxQ?ww(K@q+IZ%+`PpMHQ*=ES84r@l?6YzGZThK) z8?YM)p?9ca_1HRA89Lgv-X+x=IU!kkJ4Hlj3nKD7vjm_iXy5xzAkcmQ2FYkU4%YIv zc>W&M|A(^oj>r1n|HqY5DJ7*sMp+@7gp5-5%FGJc6tcIZXxJ{#gXk$3Sdf+@gR zGvvMu`E-C zAx-Fzs?a{hAfi+G@;KV6jVJ%pr|T*FiRH-L41p0yX2&YY*V&n^qk&Wm*SeGjqi=8u z70nf&Hn10k&ZLhHLsyA!aQ1Jf88lx+Px&*|5OdD58l`r*>&Gl z?cl1dE;XPw@37Z~uG1jdysv;j_$Qnyj|cBBh1c`Tyq|~85uyl~pl5~W?1}GeS3u&- zkB^Bl!==vosW3mU&JPdIKU&yWnux@IBMl?;YX+nI7>aKmw&d5m7TKfONcJ?V3xyZ-Q0tElwcdsPEYMC z`!!cC5ju}l%w|%%xmGWAns>dMy9fz;x~_bw|8ACCRo%)J?r!3ffeTS$#ruPFe#GqT 
zQ^jjPAU)|~01u7(l=vTLeZ=AU<@FSQsIG3xh$s#feINIID4vS93b*QuI{C4yKfNLR zRpDa!p4D64k@ASpv$ZFj*TO=IB~D+ikCt=mP9;1EP>+C+G$aZX>;#?TDK~lHrXW6OsDQ>f_ zN4czhQ8M&B$-CwmP5n?zuG7oK%x;kKH(hicW77WPJew=nTr(HH*qksoG+0WqBiV-G2VpVI!XZWgc`H(jmiynfMW(Z6 z*`jZ5$mDWQJ14Y%j1?esoJq@%sE-~RpGu1TkfxEZ9&Awx5k)cGZws9IcZ(#aziAS< zZj#ccAGy2ZR+g^1xwCxBNjt{lUShYI@k2^~ymQ`?vB5K!DRNdj zf|A7j07z*lfDaLzWv@4Rso7w}c!LfEKbxUSWk^g6Mb`ZBV;Y(}CytO!J6@xZ8w~g;kF7vp~56n(WX)#9Ab+w zm3OwTm^Pgo8yiz=8rJkK<%}Q7%aGSmQw!IqmCKfU{rYv5IISk%s0|b71P8KBZ>*1^f?g z(YM~Tp9vYt@ayQRNO~`*tzFru(;D~-fP_3#0-x#ImQC7IdYv_?M--ed%|OV(tq{e8 zJ4Uzndvg7fY=q|3Y2{&-p`+xpsf{{y)iu6DW;Xnls6GQTGk1tMSWYpb8J{e$A%0}z zRpLwjo;ztM!R=7y*kA@lbM};7*X0J)#3vJHv#ofbgk%|S4Mz|kO+=&76W;kgK0YzS zrbk4X{)eWTNLF(shx@5RTv=}vxd)dR+vkVBJOV!iQDt)kjV-O|KMLLb@w_)Z-78|P zwIZ6HJQR!HZSqDA_xY$l#9Nq4@c=Xed@lXSW=^(%a)E%TgATZ5x zcWVvg__pjTqmgT>(5rgSh0TWA%ruISRx~1pFcJUW8-?f$0VBpeWb{#z^;tP^&z|8S zkQL0Fu|-rJ{&=jreJ2QC^hXN>h$K{1;7Y#4L2B_?u`pF4%QE z4JI1wigLzX_3+zE9OHby_=+1^Iq-WSKla&)WDJ`K2%?^+Y93|%gWR!o1B9L zW|awB^m%rhB%vZh#yt}|r$%p6c-zfZ5eHfPJxDL)_RTF_*J9~ST zo-Kmj{KB(ulc)xEbX<31`lX3)lWMv^tn46N7X@4`N^@K`g$M)-&V$+E^75pmUph4& zGd5ZTPy5_Wgv-6{&>ix>Kt}28R#;*FWw_-*v5=vGLAbL172g)eglpobs8S7@SFX;J zzlePSz>tAc5@uV`{i}TdhvVEG#RE4lo6Du>5EtFZqwO3Eed&H}OCF>aJ1! zdXI6PyQSAYm3rzXK`-;;lAhFWq#0@CgjlX0>(KTU*`{|a0QA6=7XRhT7bE=wS4{`^ zoUO5QZVR{!H*Q#-wq_)-G4IQ%JAb{S#w=-&Ol!D5R-{1qmv&&hwsRvK?$mt0HPN5!*KsXtVh$?G zkRwBW;PkW(O}2b+cTt5o%bCWCDlw6!_FTDbx!MCi51&=L*7N;*ru`2*16M5*UC-p> zErQT!(puU^n>u+b-hV4eXa}RDtQ^kyMBuFJ9hObrLZ=neY@(UWn~%O!q~ftgi8-H` z;iWFPf+e7WWsm=4^>7lI@AC?lexD~yEZXkUfKC~Q_S!)(FY0Q1HI|(A#J8Q(p(k`v zb_J$SpK>Ylv{02Zg<+tcWe|?ZPwkkVc;Nh|o%p9sYmK41iwLyl1 zg=jEE+ZiU#gt!JB8!2#=vn%J*N__}RF4jwQ8xAz*5}XKpMK#9uL!}65t-xcbgv5~c z7TKnBvyIC8L3bk7uSpd&$m<>_y^ESSCvUh9x5O^v{Wj#kjnFl)X&^-Pni2cyM^1x(v8+P-z-qsNaj;|j%C3+-W^ zDBtRdJ3-$7YIp1I;BZ{jSG;y*S;n|KT`S9CxGZ3MtQ?!y|8gk#vWNY6BZ(rf(4=TF z?!0|{=j0Mco~|W{B{cB#_Nf=|1X?t&&;KUNXh<9`(^Z*+0b0K=S?Ctiu__40b#@Dg zWYc{)a)M1KM_2Bk=<9DBn$1)jEnKsAhs~o=H+mK6Yx$Eh#I4P|dCeT~7`4p$9SQVy zr?SG&3$<<422u;ee<33!9mVDk=Qa19Bk0Bs7OM_9PDIDFkSBtQZ?^aQw{Nuwv^U}} zl-EV+wjO6&u2=1oS>`X>u+#0nqq@=C;zkE0q{}GuadK4DRWZS_{PLZ&lEr$auDG@S zfq|#wPTCQUZL{qeZYxaIW|`G4M#weC<0{>k*hXB?k-^H$oLg3gMpyGCttDxZt=vLz z(e3pbp}d%R02E6NV{BYDK)nu!3bcelD3rd+;C7+!;+mbElXLe;TSzQti{X6Pu)6`V zNQA9CIWXMjbKSz(;|BQjEvIUcd-?snr1D|$KLLP_ZjjA~vI=b)SR3k)4)Nhkrp;M|Hix& zo1EC5MmQ!;GEr*WdR=iv1~-(nu^k5?9w#R6nFjtLtu-Mt3dx)$uo;h6$o5{(+G2mv6d&T#F5cw}w zxj^q<4~MdfN(5wP;U$ef2`+J+4&utoiR8TgoV_*6a-1f7_wAqYGuw&r3Aycweq&v- z-h6iboEN92y?Takn{?l{C}rig9+zhN$=k!;;iT;po4(c1*q9?IwNomzqb|U8W}0bo zs^uA~Jm0#tTZyvV52w2Zn-M2ybj*Ke={@ioSMNB;g9xCBk;^7qU}H2Gw=d`9xnm5` z>izsm(pQ`>nb&rJB!Mgp2Dis0h1RBo~V;7yPn4m z>Lvey*7F~jF@SIqrh(?2y^sGG@cM?yKij*$fM6sTfQ|doZx-PxLULvp7#V|*d06j< z*51N`s;#YUiz|Y(01RW+iIC2Ke_Au$Pp*%^GkIzr`g~>l)nyE4zbDZrC4%{Az%>6} zop7j#U|u)|1R$34%IEUvgkLoP*JwPm=*)26F+=1_%Egn<1ed~F!j$vvha7A0$(`Ds z5Ak*J6B*-JA%jb&l+oUd^_+iMI=}r~&et&4*41Jd1lLjs<}Kr?6BQA>uqq?8?pcyX z6G>9b?!+0jpe?m*(x}yH(ZcOj_QJ*0eyg43sq_&MYhMfN@*VPUt9e($*E+v=_>Uau zJ^urOG6|^ooBKAvphiER-@MirZ0CbSfzk*GX-r)H`m>~;mFla=vg`Ga4t;;$WNOJ$ zzVo3(tw2R~x9OG{%0Bj5{SE8h*M6u2r3yz_9~e*lm6bB1^Fq7pBO<%l1c3XQUQJ2V zXnOOL7iv9zY~Be5_O8Togxm+jG5;%yrSGjZ)R-<~Yk|C{z1MkpdB^+a=0f=RG|Mjq zj@)z2j@q4!`1Y7oule?11Ua={HBDG&h4JP3chUT#gY&Mc>KU5sMq9d@>(zGH;Vvhk z@&m&h5PXlo!6^+CVVWOm$YDl&>b)OsH+xTd=1&$rj{aC#->L^yKC54n@a~=X4aysH zEgr3rMy!XltE+94JrLl$?StRjte^6I?S~tfMqstX=2i_{B5OBEG*!j1d>E}8xwwgB zSlEPY#y|D@d2%U6!Y^&ldZPOvWQ_9$B%;8Mi0b zpR4A}lFQ98DNQ#>v6kvH212$x;%QUkb;P^rzB8i@TBe`A_3wka+H`;y3DOt^VN+AR 
zd*9o|L8I8`^Y^CkB~5GUBGK4{=ou8Vd~a_bqjItPkF^_?2R|JgU9jk|kr)SL<>q$6 zsCW8$QMI8r85SP{tAnGt$`?a29_5zj^rKz%H^E#uIazCx)Xl7x^)2<=v_3|!?EEe~ z@$fdXhr0T&G?!RMVerO1OP!;xNwW4zWGvf(&iY++i-eKJ@YcnHzgKu2Xia?y2?-#C zb?<4HxXL_xmWC-=x1%1$#gPnil8l3 z)!M1*u;ugCscWuM0J^$|4!bq*`t=VJCcYO7Q{Y#KVP;@IB;Ugc1il;ab-IXpE$wSH zz}zUVw6XX&t^Z~+Hklu|a@LPhXz0SLj)_8qYoj>1W!)(oRrXk~*rXjDo~+?ssr1VG zLg&hk-%5<2Ktt+wJas9P-5gsAOCHnh?_$En6KP=C-P5w@y9=H6#g4$Y}=Si6}N4ffrt|rFC@jRL!30Qtm^rZA?j7o2=E@9(}M^*2etDd1Vc@z%4lO6(h#xF4%my&fu3 zfMHEs@{Z^UGo$ZtIj;KD=e2KLk(OtFwEU;I$aH*k^ihpYkI3g;DE65rl&aDc>7z&^ z`xD2W7*x~%l8+@r(u(?Md(@Kdj}v$Gyw@wnL1 z>(g!O7Bdcq_&lmqM2n$)w;sYRn#|}WxDkn;qpGs9fo=Tk`~36|RbvnpGkc14d~A$h zGjnR@!AUC{hpnmxSNNPV*?N_5M{afTi*rdI{;3>@Gh%=%SKr7e65j*FI(+$o9W0=x zQTg&C7!Pm7%!}itQ^N=5pzv@Ll`1iQLu5)!R#Z^Xukj=I3mn<(n_RL*+9Oa>b)-5u z4w=K4+4ajGFo&4u(|v!e!6T$XZoaQKLjt@MeFgd5HqA)Z+=|tB$98(9CFIJRQn%aP zHnNn2$hU>omMwM)eVTsna0Ng>4mem>YYn_I+-M`681~<_87jC&>AohB zjUQ8IR>tthzLD*~V|GPg|Fygp5U(?yCv=>Q^G^MdK##pWg}>(5M)gbQrA;3;^W(>g zRIhH(eLCMm0l)lM1cm#nx6bMdT*%-$QF1I(donZ9R6lh=&2uZQY%)_Xjk@Ovoh-*e zr~+)iV$cxtIW2!KS@?QdMfJgD0z7j_5Sk8{^tChX$n06Y)#EAw%ZF+x-3O+{c?a&) z1g~+M4h;LDBrTHiRrxMy)x>Y<0iuLVL7li8Me^D92_+LT&eGgPPg&g_@YSb>hqSBi z#uksR_Rq7Wc<^ov(vKCLT9Z`jZe2Qi``AM*dz^Ld*_k|aOD_1%`e4IyVf{>@M1 zl(7!KPtAC*SXxg0xYk>o&o7VmH>nGvRl)k=J+MU;zQ?<(71sd9`g-VEQ`JqrC}|5N zlC#)q76{qU6fk|`3kdufXLwFw9CR0^%f`>4&Tl>=1Te+Ps(2$Zh|?BR;7lMv}wo59e>Nn*f}boOQ&$#jxaF*7jy zaqJO6C4()0q6na`$m~H_`zlQVRRIfIqty$F&`x|Q~GhizNJG*sdwce#Ysj_io;gg;kJY< zm|k>DUUWS0M6JF^V1=Fc9LhY}@g3(ifeWzu#Uqvgh&XTFZp+xhW!2@Y7O85?^E)eR zW6|!85BW0uxwU`UxX=Bl?xL4%3g}mLt4l31mK(b3%}`v{hiFIR=R4j_hamnn%vsuDK$fGhj48*+P5SmhMkE#r+I}-Pc=@i zJA^A2G#`>4;M{+=KhT_f_}4%KObp@=!+@>_l#4S_cK#qsM&ATvEv@ReabkRVciId= zT$)<8D6&yZMV9Q=(@BzJ|!;UWQPSGY?{DTm3gdvE2! zjnk*>aD^r?W!ZOU=4v9>k<=GGLv|B0UXr=QcS zVyQ)l6|vo_s2ILxT=~KB&dGdlcV}GX%1MHcMUyj5uz)MuDrxUUZ>h7r+#3blR3u+c znR=b8=d;!gasUx-it9&hY&B)+kqQ^?U^>BuZbHFWIswxN=WUJ-Lhbg!Q z%E}?2{j8n7kEv7+RZda)L#BJ})7|g(v96-~-EL9Ay*H$fN~WIfILvkROOIV7A7%mX zcZO{_sh`W5^RR9Cyr4~9Uj5YX@=yRVv=gT^eTl!i%x9Fd^7Qi;S?HV-+jMv;@Kkl` z_Waa_b_jp$b~^8&hLx%f#C{yz>n7SZH@WdxXlcKbPNb33X`O|Ia>#@u`bubLxg8nC z@(EM?mYN&l&2&Cb-nI~XljZO7sp#U2bK}mdQNBWTW%iF}Lt_T1GgOqsdt=?DWL?T@ z@l{tRC!RZU>UP~!%h|CQTNNwQKctd}c?MC%l+SsuQc>Z-T0q{#(s@_8wsu$VdDJe+ z#fujafn6iXu}C`?n^)vpr<5_slOABn|UWycSu zmb`hF#V6wwKam7?NF$>rz5UM0UcfgD$@qb!bZF!P8TZ))R}{h~9D8S%(DA3O47t;g z7;u&NeyC+*(8Q}OJS-MFtA8xtPo9;DDf{s}=sjl4I5|1fsN0I%K79BPAxXo-)2$RJ zB4%OMT^myo?{#JC)WhT@#rDKcW9^`?1T`T=!Fmgk-Z|k-$8v$~64%Wt$DrKtNUPke zucc-Ne+iS~RQ{!NWB<5zLWkzbzi@$nznGQO)JzzWg;lk>d)uF=p(V|~Pq4JH2${6u z_@ptKFja87Ie*;i`O*ETJP&-hpt5-CyVJ!T-!6hccE$F-%bc?g?0-~1t|3$K696Mz z*k4mxn#P&P6MTsUMRHs_fBIB!cIwZBor6t24%;GP$brKeUN2F6!<1fBC!t`upGTUT zjexn#6BXQVCexW1%cdWDrz$jWt1WO48FJtHEeUp!9sD^JwTM&<8C-3}mJQbC2CbkF zW*@yC>GzG|^UQ^ygi@1KHD1@>FPX)8K{n>fzP9rA1m*Snhk8CA1?XGvd!nGL<77$# zO+fuBwm%Vrsf#*tJ-Gjog;2CyX_J!dpON(Xo@z2fVh0dgaHLp9s~x(#uRPoL!EY8D zAf037C%qDGuHLRL!{&njlY^0#KqX$~Rk->q<)5U}yE9FfS!&75R3bz6hN23UNN3ui zGh@Cm9NMcqAE9h{bGJBF+@=10_PyK5aG#xufPyRjjdo=(7Kr)1ljqPj7a_GKEtzg?n?R(ei z%-VN_C6iow`BDiDP=$fs;N_DhzC((0NZ!uIhqU3UZ`y2MG5&d=2_es{_xBf4QOR$8Ic=V?rn^nlX3qW}e($&M-$CKOfwT>_nCKhBQVtb}iSFUhV^Qan zNb**zyeUO1M~FSvZMge(x|EvyK}^60!@~+mQ8L7CqtnKCcLZaAQ?dB1oB`8h7LhAU0`Wg* zUfnqKWP%r#@DahpGYck>lYB)p(IbOHTq@=?zSzeFbhctEe|$?cxuKzu*z>$|cyeZo z0Lw6;JQ$0mp;y|wBHv_!K~H}g9xQ>#%Swd$9#1-1@Dnh$^d2=&|C6ThpAHLuUN43}ux?83hj|-$CR5D!3E=BmGBUng z+}vumqM}|jbad+6Z0ZNZSpNvdIKXH^ht5UZdjUsy^ozS2+zb1TOCWoj7;O-+4#+XE zQl77aEf(W;y}P6S3E6@bdyjmk`5O6^l{Aa*EQV?aK;(s8Y{lma_kcsnuYdn#*Y}Z3 
+    args.distributed = args.world_size > 1 or args.multiprocessing_distributed
+
+    if torch.cuda.is_available():
+        ngpus_per_node = torch.cuda.device_count()
+    else:
+        ngpus_per_node = 1
+
+    args.world_size = ngpus_per_node * args.world_size
+    t_losses, t_acc1s = main_worker(args.gpu, ngpus_per_node, args)
+    #dist.barrier()
+
+    # Write the losses to an Excel file
+    if dist.get_rank() == 0:
+        all_losses = [torch.empty_like(t_losses) for _ in range(ngpus_per_node)]
+        dist.gather(tensor=t_losses, gather_list=all_losses, dst=0)
+    else:
+        dist.gather(tensor=t_losses, dst=0)
+
+    if dist.get_rank() == 0:
+        all_acc1s = [torch.empty_like(t_acc1s) for _ in range(ngpus_per_node)]
+        dist.gather(tensor=t_acc1s, gather_list=all_acc1s, dst=0)
+    else:
+        dist.gather(tensor=t_acc1s, dst=0)
+
+    if dist.get_rank() == 0:
+        outputfile = "Acc_loss_log.xlsx"
+        workbook = Workbook()
+        sheet1 = workbook.active
+        sheet1.cell(row=1, column=1, value="Loss")
+        sheet1.cell(row=1, column=ngpus_per_node + 4, value="Acc")
+        for rank in range(ngpus_per_node):
+            for row_idx, (gpu_losses, gpu_acc1s) in enumerate(zip(all_losses[rank], all_acc1s[rank])):
+                sheet1.cell(row=row_idx + 2, column=rank + 1, value=float(gpu_losses))
+                sheet1.cell(row=row_idx + 2, column=rank + 1 + ngpus_per_node + 3, value=float(gpu_acc1s))
+        workbook.save(outputfile)
+
+def main_worker(gpu, ngpus_per_node, args):
+    global best_acc1
+    args.gpu = gpu
+
+    if args.gpu is not None:
+        print("Use GPU: {} for training".format(args.gpu))
+
+    if args.pretrained:
+        print("=> using pre-trained model '{}'".format(args.arch))
+        model = models.__dict__[args.arch](pretrained=True)
+    else:
+        print("=> creating model '{}'".format(args.arch))
+        model = models.__dict__[args.arch]()
+
+    # In case of a distributed run, initialize the distributed backend,
+    # which will take care of synchronizing nodes/GPUs
+    if args.local_rank == -1:
+        if args.gpu:
+            device = torch.device('cuda:{}'.format(args.gpu))
else: + device = torch.device("cuda") + else: + torch.cuda.set_device(args.local_rank) + device = torch.device("cuda", args.local_rank) + deepspeed.init_distributed() + def print_rank_0(msg): + if args.local_rank <=0: + print(msg) + + args.batch_size = int(args.batch_size / ngpus_per_node) + if not torch.cuda.is_available():# and not torch.backends.mps.is_available(): + print('using CPU, this will be slow') + device = torch.device("cpu") + model = model.to(device) + + # define loss function (criterion), optimizer, and learning rate scheduler + criterion = nn.CrossEntropyLoss().to(device) + + optimizer = torch.optim.SGD(model.parameters(), args.lr, + momentum=args.momentum, + weight_decay=args.weight_decay) + + """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" + scheduler = StepLR(optimizer, step_size=30, gamma=0.1) + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + if args.gpu is None: + checkpoint = torch.load(args.resume) + elif torch.cuda.is_available(): + # Map model to be loaded to specified single gpu. + loc = 'cuda:{}'.format(args.gpu) + checkpoint = torch.load(args.resume, map_location=loc) + args.start_epoch = checkpoint['epoch'] + best_acc1 = checkpoint['best_acc1'] + if args.gpu is not None: + # best_acc1 may be from a checkpoint from a different GPU + best_acc1 = best_acc1.to(args.gpu) + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + scheduler.load_state_dict(checkpoint['scheduler']) + print("=> loaded checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch'])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + # Initialize DeepSpeed for the model + model, optimizer, _, _ = deepspeed.initialize( + model = model, + optimizer = optimizer, + args = args, + lr_scheduler = None,#scheduler, + dist_init_required=True + ) + + # Data loading code + if args.dummy: + print("=> Dummy data is used!") + train_dataset = datasets.FakeData(1281167, (3, 224, 224), 1000, transforms.ToTensor()) + val_dataset = datasets.FakeData(50000, (3, 224, 224), 1000, transforms.ToTensor()) + else: + traindir = os.path.join(args.data, 'train') + valdir = os.path.join(args.data, 'val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + train_dataset = datasets.ImageFolder( + traindir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + + val_dataset = datasets.ImageFolder( + valdir, + transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])) + + if args.local_rank != -1: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=True) + else: + train_sampler = None + val_sampler = None + + print("Batch_size:",args.batch_size) + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), + num_workers=args.workers, pin_memory=True, sampler=train_sampler) + + val_loader = torch.utils.data.DataLoader( + val_dataset, batch_size=args.batch_size, shuffle=False, + num_workers=args.workers, pin_memory=True, sampler=val_sampler) + + + if args.evaluate: + validate(val_loader, model, criterion, args) + return + + losses = 
torch.empty(args.epochs).cuda() + acc1s = torch.empty(args.epochs).cuda() + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + train_sampler.set_epoch(epoch) + # train for one epoch + this_loss = train(train_loader, model, criterion, optimizer, epoch, device, args) + losses[epoch] = this_loss + + # evaluate on validation set + acc1 = validate(val_loader, model, criterion, args) + acc1s[epoch] = acc1 + + scheduler.step() + + # remember best acc@1 and save checkpoint + is_best = acc1 > best_acc1 + best_acc1 = max(acc1, best_acc1) + + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.gpu is None): + save_checkpoint({ + 'epoch': epoch + 1, + 'arch': args.arch, + 'state_dict': model.state_dict(), + 'best_acc1': best_acc1, + 'optimizer' : optimizer.state_dict(), + 'scheduler' : scheduler.state_dict() + }, is_best) + + return (losses, acc1s) + +def train(train_loader, model, criterion, optimizer, epoch, device, args): + batch_time = AverageMeter('Time', ':6.3f') + data_time = AverageMeter('Data', ':6.3f') + losses = AverageMeter('Loss', ':.4e') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + progress = ProgressMeter( + len(train_loader), + [batch_time, data_time, losses, top1, top5], + prefix="Epoch: [{}]".format(epoch)) + + # switch to train mode + model.train() + + end = time.time() + for i, (images, target) in enumerate(train_loader): + + # measure data loading time + data_time.update(time.time() - end) + + # move data to the same device as model + images = images.to(device, non_blocking=True) + target = target.to(device, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # compute gradient and do SGD step + model.backward(loss) + model.step() + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i + 1) + + return (float(losses.val)) + + +def validate(val_loader, model, criterion, args): + + def run_validate(loader, base_progress=0): + with torch.no_grad(): + end = time.time() + for i, (images, target) in enumerate(loader): + i = base_progress + i + + if torch.cuda.is_available(): + target = target.cuda(args.gpu, non_blocking=True) + images = images.cuda(args.gpu, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i + 1) + + batch_time = AverageMeter('Time', ':6.3f', Summary.NONE) + losses = AverageMeter('Loss', ':.4e', Summary.NONE) + top1 = AverageMeter('Acc@1', ':6.2f', Summary.AVERAGE) + top5 = AverageMeter('Acc@5', ':6.2f', Summary.AVERAGE) + progress = ProgressMeter( + len(val_loader) + (args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset))), + [batch_time, losses, top1, top5], + prefix='Test: ') + + # switch to evaluate mode + model.eval() + + run_validate(val_loader) + if args.distributed: + top1.all_reduce() + 
top5.all_reduce() + + if args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset)): + aux_val_dataset = Subset(val_loader.dataset, + range(len(val_loader.sampler) * args.world_size, len(val_loader.dataset))) + aux_val_loader = torch.utils.data.DataLoader( + aux_val_dataset, batch_size=args.batch_size, shuffle=False, + num_workers=args.workers, pin_memory=True) + run_validate(aux_val_loader, len(val_loader)) + + progress.display_summary() + + return top1.avg + + +def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, 'model_best.pth.tar') + +class Summary(Enum): + NONE = 0 + AVERAGE = 1 + SUM = 2 + COUNT = 3 + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self, name, fmt=':f', summary_type=Summary.AVERAGE): + self.name = name + self.fmt = fmt + self.summary_type = summary_type + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def all_reduce(self): + if torch.cuda.is_available(): + device = torch.device("cuda") + elif torch.backends.mps.is_available(): + device = torch.device("mps") + else: + device = torch.device("cpu") + total = torch.tensor([self.sum, self.count], dtype=torch.float32, device=device) + dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False) + self.sum, self.count = total.tolist() + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + return fmtstr.format(**self.__dict__) + + def summary(self): + fmtstr = '' + if self.summary_type is Summary.NONE: + fmtstr = '' + elif self.summary_type is Summary.AVERAGE: + fmtstr = '{name} {avg:.3f}' + elif self.summary_type is Summary.SUM: + fmtstr = '{name} {sum:.3f}' + elif self.summary_type is Summary.COUNT: + fmtstr = '{name} {count:.3f}' + else: + raise ValueError('invalid summary type %r' % self.summary_type) + + return fmtstr.format(**self.__dict__) + + +class ProgressMeter(object): + def __init__(self, num_batches, meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + self.meters = meters + self.prefix = prefix + + def display(self, batch): + entries = [self.prefix + self.batch_fmtstr.format(batch)] + entries += [str(meter) for meter in self.meters] + print('\t'.join(entries)) + + def display_summary(self): + entries = [" *"] + entries += [meter.summary() for meter in self.meters] + print(' '.join(entries)) + + def _get_batch_fmtstr(self, num_batches): + num_digits = len(str(num_batches // 1)) + fmt = '{:' + str(num_digits) + 'd}' + return '[' + fmt + '/' + fmt.format(num_batches) + ']' + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +if __name__ == '__main__': + main() diff --git a/training/imagenet/requirements.txt b/training/imagenet/requirements.txt new file mode 100644 index 000000000..ac988bdf8 --- /dev/null +++ b/training/imagenet/requirements.txt @@ 
-0,0 +1,2 @@
+torch
+torchvision
diff --git a/training/imagenet/run_ds.sh b/training/imagenet/run_ds.sh
new file mode 100644
index 000000000..08765e036
--- /dev/null
+++ b/training/imagenet/run_ds.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+deepspeed main.py -a resnet50 --deepspeed --deepspeed_config config/ds_config.json --multiprocessing_distributed /home/pagolnar/clones/clone_imagenet/imagenet/imagenet
diff --git a/training/imagenet/run_ds_fp16.sh b/training/imagenet/run_ds_fp16.sh
new file mode 100644
index 000000000..bbdf26779
--- /dev/null
+++ b/training/imagenet/run_ds_fp16.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+deepspeed main.py -a resnet50 --deepspeed --deepspeed_config config/ds_fp16_config.json --multiprocessing_distributed /home/pagolnar/clones/clone_imagenet/imagenet/imagenet
diff --git a/training/imagenet/run_ds_fp16_z1.sh b/training/imagenet/run_ds_fp16_z1.sh
new file mode 100644
index 000000000..9b5e7b165
--- /dev/null
+++ b/training/imagenet/run_ds_fp16_z1.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+deepspeed main.py -a resnet50 --deepspeed --deepspeed_config config/ds_fp16_z1_config.json --multiprocessing_distributed /home/pagolnar/clones/clone_imagenet/imagenet/imagenet

From 09af71a6b4468c30b175e0b8f114cda60625a278 Mon Sep 17 00:00:00 2001
From: Pareesa Ameneh Golnari <120066333+PareesaMS@users.noreply.github.com>
Date: Fri, 17 Nov 2023 10:06:47 -0800
Subject: [PATCH 07/58] Adds script as an example of a run of DS-FastGen (#810)

Co-authored-by: Ammar Ahmad Awan
---
 .../mii/A6000_benchmarks_example.PNG        | Bin 0 -> 84752 bytes
 benchmarks/inference/mii/README.md          |  16 +++++-
 .../mii/plot_effective_throughput.py        |  40 +++++++++++----
 benchmarks/inference/mii/plot_th_lat.py     |  46 ++++++++++++------
 .../inference/mii/run_benchmark_client.py   |  23 +--------
 benchmarks/inference/mii/run_example.sh     |  19 ++++++++
 6 files changed, 98 insertions(+), 46 deletions(-)
 create mode 100644 benchmarks/inference/mii/A6000_benchmarks_example.PNG
 create mode 100644 benchmarks/inference/mii/run_example.sh

diff --git a/benchmarks/inference/mii/A6000_benchmarks_example.PNG b/benchmarks/inference/mii/A6000_benchmarks_example.PNG
new file mode 100644
index 0000000000000000000000000000000000000000..853e9237830098c4a2dfe4c2f3a29795477ea45f
GIT binary patch
literal 84752
[binary image data for A6000_benchmarks_example.PNG omitted]
zAzk6E0k->Xnrwc3r>hau{SSMogSI{s5BM#tvk}_2FUguFczp||z*vY8u(5B;-Drzf_nt3ZCW;&WHp%t`d&G57_K7wD)mv)=k!+< z8*+c$EnEOWLSTO76VWgILW`N58+e7|Mc?Rdp4;mbeM?aHKZLOa5u4D7F)!FXKi5kY zj${|vEL}aayz+Y(cXVfTZK}HzwZ>RHsk4$yZi`H)3hb4cVu%zeV)*g>fVGrH8dI9A(=lH)~L>!4nAy$?xr)NI)K~1Ma-bqr*DLs%@@~_N+ot7MTY<=n z6}&W*cjaNd8*5d=_q}&+vm#K?|Jlr`i`_ngA5F+TeT1g(DNP&nosb18qB{q^nE|Ia zu^nOmhTs!E^An=X*t;=8C?;m6>oZat)Qv(~L{4cF_ZUuHOuxjTiX+PJ$dRa69`TU| ztbzY|q_@az#k!K-l(3Knn{S8n zrt--q2cB0a`6$WCBj>%A>KtYg$&zIyw2kj6^C}oMJu~#~yZ9BUrZ!evNPA;8iPqf` zjhB&(dJt>)f9~Ye6#Fmwn-@m7*;ADFez80fLkdUD2tmeS6s^EbmRl)8o}0K}kwZYk zGNhgEDg8~UV*{BU6FDOdn|>8+R=Y@h6%`xhPyPHqpH&yB4QnsYvLUP(<=rx?tr$86 zoSD;LRN#V*sTnzY2(mQLYn|cf)HAXcM2c zlq_lc_KjE9i>h8Md|!6?|A6D`6sWUMIxN%6fiUi5+umH}5W4TM?4kJ(|H%UWRGw6) z4lC;M%p?UiwqwX%sjF;5$&eksCg%%_Q*Y$V&v&nXFIU-99IIC&(li>r2q^6kPd1I? zQ2E~KHL?!oFdw5T?yF0Gi!MjGBM?{j2W#V7MHy_-+Zkay(6^7o_0(hB7CO;5$^%IF zw+0W7j9Bcr0_t52(j5s!-$e2)vBNz>Z|6*i9>>aobS(HCwZzb4M-WveR)hliSC=y~ zHr9`i9k3%DZEv8sVlWGhdW@hlsQk;wy?A;^=HHkN1Gwny5pUw5UXq!{Hwgiv#? zGuZ7v4`&61nmP)}A#8#?HYUt4#DsO)0CYrede``V*S1zKX?%os64AqB*QkfEniCbK z`)XEM&wDvbtWn7LLVs%guTP$}IJlvI%m$3mxI5f5;!+qtO0YAUmOu)R0_1m`(KSDX z&zfhCiR@}fJ`OIX+(G_;xf<7)DSLMBCYdi8KLAUq-_gM@+?2IYhdHnEdS4c1eKOGb*nlvht6Qbif`-r-w%Rt6gfnRNeDgqlt&kR%e%)lQ?W7SU5bcgD+&Hcbut zqI04P%1{CQ*f)d3uc(f#0((qOBKj4Q9IMi{lKYPM%h}nwEb$?I&JW0*ie3+rR_^zm z=XLL?y?o@^YL+$GVrJQjb*Bl!^8!c|Hs;wg4lj4XV@N|hS;U#)ne+i6A@q*32U=dg z&IEo?F1Eq+`4lYl+Ysad?5z`&fb2CJ0nLGL-ypNRu|;f8RcL72tF_V%MD}X6z95)I zXfR`M?**0&@S~`a>+(`@Te6_s;R_<)Lzcf>ukrZP)A2X^uWow!boEYm-BxG5Z!jaT z>5nAt2@qrt<{Ck8wJjF{H1x4?Ag$of#Vj;oG#|#uCQ6VSMW>t3G2DkV9%?$~DP$2>y1bi0izYID7f z`z~)Zod5LnpyZIsj3FrIx=KHik)eL%%+Z@S7k&u;{zNO-N!4muLE@7i5s~!-Em()Y z?McLOu4$${HWbBc3hFiy*_G9pd~`d&b)YzI5(6e6tH_(R# z^MZM*>X-CNvd^)qNwf5Of`1D3Y()tN2ZvnhhT#1RuQ#tv-K(;6&3tj4w1UE{uh8+_ zIUrw+Uy_rPIda%|EnY#c_8Fm1Z<<#cO>K+iID*i%NIoOZZ8htXVLHZWY!C(+SdweJ zhRAktupk@Vff`x^%`pB;s;W+>p`A0VxJfaNX$eoWrs8)L(kSFZ9x6X z%4VY)>-3ir@+h|GvOLmMlXPbZ?f7-0Fs|E!;rl`t(Evh($%Ej(rGE6#)3~6j#0d|i zMqs@TmeDEfk|z8EflVV56CVI5bA}3a5T9gIOnL47$uqJxKF6)`Dx zbn1?{=Tk8A%%@+~JMTT^>gfl1hjB$NS;r~PnCr*FBf&4aW;2xgvp-|1IEn=5mvrX2aX!pUOm!s@g6B-xRa@W0 zSADP3si{#-QrQ1Z;K`niva&oWwNzbE(*{G0XOy+11a>z)H|AA5JU!@ZJoun8u-sv5 z+vgEI1ITzt61Pm5R%!v)iH5kv;>huV1;nY1Vyq2B#z4hEdN)IAZmkF>1UQBQ$FFou z3YSB^Gz+y-of7_qTtTT-!OcTW`pn8baI{c=8-lmRyI-+hw3R;4SD|0@wCwrM~oKKMXuYjWehu;hRCu<_-;Ujq`n?{<>c1~ ze-tthkymOGsWU!wV`kx9>WSx7wc5>9J*DSU?OSvMQZxnpqE=s-niz5O1yXaA?)fKk1(hP<}=xI?Y*BgdOUC+3N2u{$BQ4J}4N*ppGJvi|t zsp%9<3TNM~@hw|+wfKY;cWIQ8T}QTZdwV;eS|o;Tz2#>u*ZAE1J~XY`4h|CPXW$PZ zW_oWfoDu-a2+rZ3B6kyvj|Guc@nOI0p+3%YyUoumr_J+N?hyJ(4++i8`-}D za)$=9Y%knuh(#+q7cC$;joDAM6q{4`U`Bnyqe7$`LK_TXQkFQIKq9gKG+{JCFval? 
z+GQRy@Qo53 ztk0IhxiSj5DBrg8?eov~yQkizW`Fwexa@bB;$?|kiCiw3SZG7AcV!?Qz{c*JF$@E< zkZvL<;P9UL)_jGlyu7?2szG)_9men2o@K3JjnMmxG<)4n;*sw97Fk;d`B)Hk)S`PL z=_)W6g6L$Ay*3uTpOKWb9cg|+jCkN5?b9s9zH51UczB%o0;_vy@}{a~930=^LkThe ziU|lrz66dRmG!wdhbP=NyV18_T)n@5S|@da=sh3s26T9d>pH9HOUV=Gv?7a#5} zj7=IY?@5uF99($f{WIW2z{0idlX)?Q^kZRWH$Jvlg+@e7OisS6R8~=m%a0IQzn<=V z*_$^PW{7Y}JT9z;8~3nX20Na2e&Vykqb8v!))hMc+{7y%EP|!9d9#^Kmb(fy1eztc zA~uD3g;D6sGnlTl*PV7ssQi7{qia2V+xihl_D6#Q9&z?t$UWywDTm}U=%X(hV+EyX zIksg*HOr8-$>t*ZOfTP~8vv?5qrB!-zV(iYKe$0=_FZapvEL48vE zShF8>uigs=bw%ovG_Q_R%uMkyS6|;g338%66nPPLdAFfj`um#uzu&%6iUUad64&xW zc%O-i-c=>S*-uoA%l0ZO)9edDuW&<8+(rQ*O4sz8nagE%g>5{H)ebFq)3;L5F+-;4LlC(sMb-waRB$-afM5D1A&MRkcpmR$@c1V(B>W{CoXd z(!U)_J|IZn`fkU5ZyrVWl=<>#FQx}d>sqPiMu5l;^^qMV0 z8q^;h7$k7o4bkFN6fz`uG+s6gNE~T^uAC^t5z1x%F?-cY97X(HNTx+)u?Mj9cUN&6 zF+|`1CJxy;hN0(%0E@x1BcFY%+haJeaccs8$go2uBIPq4A9A+nwP<5>4r|0BLj$pt zpf4VRj|V#L0}oq>sk$S^s)gvZO4^kc%$PlMQFD77%}z@r@;X}d=cHYZ!!utqs@(nP zXZ<*MZ}TZ;&C_#we!u#w`t%VBI(LpL3FXmGFeJplQm%d!>986wuif5H2sP#{ zjNa_+<`dt%GQu!+5^F%Ds&8rWu9Nuh5rbrux|fCiKF+2gTTC+A4t5C87%q0b#Fe;* zxDn{MonR1}GDGQ)eK-T|%1@w`1p=6-246Z7&|?YV<#UAZc8y~Sxq3#|2>Y1IbDj;C$N~-X~#I2XO zq6fJt5>B7vw~OzRhO6tuDmWDFFrFe*?zw2#vWV$bgig1^YPC|WQcAXnk)YD zG7;4X5dviF68V^8cX{`3U{Ulo;`u;t&5kB5XdKr8nS@T&vE^8QI@D-?#Cim-SX4w+ zb=P#Rq~&HFC)>gcCMeB9u4u5{w_p7$6Z|pcl1T$KVRR~0fNegTv|y( zYy$GKMN=KLGjA>_#VHtjk{V0*#BEha%Fh8CY#6sdFuY`6ezsL{obE!Bk43QUW%SIz zJ7leCI^l&)2Wo|?*c$AlUBjIQShvwh&wgFAK+=Xt>4B3An$SP$O282n?h~VWr<-g2 z9T^RJzZ?-gJO#p4^?jNrv!B)LTVhA<+!Q-3%Sf_Zjq>3Un#(!I9w5p0dr{A7(p(T+ zPLi+){?UAUi%m-8hBUbM>TjoJ8Qm~FvHQh~Dt}s_7>Lw)n5u(rlYnG6*lrXWTGaAe zH*65vq8|oL9GWN}p?CQhTR_YYL~mDzO9mE*KUsVgZM@Ij=$qQ+c-+`B$= z1ct-6kODvf=)lwG1yO)zCnGnn?y9)Z8+7aLp-%DIW6<@f>hBBoKd)#N+|hNb{5Mlk z^c<{5E%MQE;w|8@W?;}kWZx~;p>9TIX6%)&*Z>~nuKN2T!&!@$Evt;~iCH-}7gZg1 zumcihT2fbzj^KbyJ;&u#k_1j0n5G;7a}Y^r(4=D{X&QG)kBNRoYBbSGJaKWwCey5l zZ;gxVQ~O9!_T$I79JWf!kU7e>jy%JU=ARcY93tCWC%1*m-xHFYKb zdF#M6cKT$=ZqJ-`=nCgrp5!uU@WpfUY0@U0-31~sx@Y)FA2>NVt-rO(Rx=iFnVuf4 z3S@x&OUzHbO1x1G)~yA!KmJUsH= zqNA?n`Y@HlaEHbw@utkzZKUu+)F)5)J*F`-MJ>Fp-F|;swgz)C)QK186XnxP8Huvt z0ARR??Cq``!ouZsql3$ZBU+yzw{lZ^6Foi*sw?h96y(!&D^SAwTpK(d$)! z#`s`7tKAxfCL~!B>}mFq`1dJ5!1H+6?vpfSNiWI8IgR3c6r;OHAjZ+ zhUWDua9wk>*O^*+<+sI1vAkA&BC`?3iiz~%jl>^ zyp1C1Se~;>#Lv5ii<_s6pMYT0cwbWfN%ybmNhuaZ_ggdo->+8_B4PIWCuBQsEOfIH z=c-432@Xy}rq7sX_(xLpI+xVnmpRW~bmHBD;@en`pg@tIldXq}=J!qhxCnzz#HydI zA5ZA2fkJwkXmXR5S@bG-==%0vLtoVhtIO{#*a?gc?@5k>Zx-g`z5)*whzY!3evbU- z8ZgM_G-GBMq9~BrjZ6;&-Lph@FSXY|LwQc1IJ6 z1L@YaCE#%XY+~WF%o3r4El(JF03D(|)`=|pa&7z`F1SOAg%sch_@?xHrXi55{7M4) zgKgm?*X<;wGiPSLHcBE;K1>R1OL~Yd1BD3wLA!7-%fyUkDJ$zB5G{xT^;q!m|2>|_ zWLbH6_eVQEL+X9RH=)Na)ePL4-r-whb6;QUPg|h=7#}U#4>b7KlPue{!+oS|;Il8! z&li^s;egqqvw`ejWn~2{;|t(Hm{KQt8|)zbNVbP=Aq2T@?)p@%Li?~xAO11Vq>?3? zJ;pNbIoz0035|jM=loSUlB~jE<9sze+9tF2KlsGuK-m85btERqpA`|S z>_*@HB+^!iU_G9_fSu2?Mh$#ENT(T{gA`%nxndkSS8Q?Y4ma(&6|EochQ8{`Aa1yq zERo`h^Aj$;BEX?AGtNv;@7UL37_a`7j7TETtz2^* z+mby1XQ6TzH|L4#;yRJ=$|$Er12i46#dTfUN=l92)?okcB5&&B#~BJl8KSv7Z38L& z$={X54HCk5(pslSJEeo@l(M-Z<(P8j`xP!rWTXE2Ayrk?udG2kxifETTludMDPBeB z%U_k0lvtR~E^9=!h)RBV1@a2@{qdas)X)`VbB&QJLcMQ}2e0IL9@^$Kq4?z1Q>DaeK#k=9F?^Y+? 
zrJ;%aCZGFzsYz^FRqz?8CZ@@bScXX{d_*-4=ieUjBZmRhpb0*KssRNoq4RBtvADZJ z<|Oom<;bUfdu^b*TexHS-zAR}J?ZTH{o_$=f|1BMdbFzua1of-%FmxYafo3!9TZUK z-ExdLmMt?M^~_en^?D7b$e-?hLp}N+mf)6%LWVDtr8t=3>G6v*LDJ3BH;>FpVfewX zFYx;H>q@Pt4WPlghaLxI(&@3oD9u?k!=pECnx2`dL1O_uI%K))(uUH8*Z>MaJ>8DV z0SZ8D`Oi=a%H;5b{M{T{Cu}S(EUY@m0(zIr%Vz(t16t*6$6E%rQeVNb{0n^Clz;%> z-1Q$HK6z5`%^7wUqEJNA`-+v&^9J5?xqHtw$D{5%0}90hk!qx7Y(ogh^2O?L+pT3O zdQ^DVBs)Olo!}MQ)uCS}CYl23j9$lRD(J?{5N1vY+&^I{KiKt&;eO}9z+K>CjtBM- zb2t8#Yx3<`E-E6@gRm9AXfWB8M`8q)h?Eq&!@V_Kpq4+aCB-IQ-(vF~psldc7Nkx1 zgBsi6Gy6EYMBk43?g66cYa>!6uH9O~7EV z|FHXf*rsDlJp@NFP;MNbHhF!xtsLDnP-yp6w7L6;A;f3Uya5xCe$L3fNdX#tVC|SZ z)4X4>eNS7yG!=zNqErPz-WGZbPDK;whRD~M68c5eK%f<@3vXG&Zs;te;;BYb2SRa~1 z1mD}Ee&&~4=OA{W?DqA91oM&KP&wa&MKD;pp%Sc9u^%g$;_bheTaAT^28(p+H>o7z`@yY`qXTS*uK zA_a_n_1h$!XFyLtf?4|N`oG#UgJTk_hERy&lg01G1q;Wn@91yrguwCM4S0n!Duu#; z{mbtYOZp|jB%nIL7&fwiNb7q5MnWMh{cC6+*?oQDgbac#O5)Kfd5S znZ05(G=&R}2Y(U5S-vmq0*@pW-LX^R=FzmT%(5$j)5tusn8s@4Ghe^+1ARuN?u?6D39C^^;}a7(fH?m zgqM^=rm06AEk&_CF5MO(@`09cf8(7rZoM8gZzJRn(69ErkVxNg%J5qLUZDN=E!d>N zR^}AWYeiLZT{p9*MK____65^lP$dxdCQo5WhF#GU#+h!?Xv_csnB zma==N9Wf7yWr}OtODfPzD(?Ha)fp`j6e|NdEsM2FV_m#C-! z3iE)vm1MlES?<*k3Yq4<`=4Sv+nYDkHLfe;o=>3CmZj8tBU6{Y=<=*H=iv5&joLU`s)gr)P@x)LQAW0{D zAHWy>y^eFu8u9mji(B#R21ZuaH3-PWUJTfU=EpLH;G&Aq)ZLsi z`m0dh2;GYvCecYbWt{1c?cGl(+=^0$7VuI&f=8sHWh7=Qh`E1?;8qbdL`#Vzk|=Qj zkq)H~;9Rix+`>wPsFnv!z;90e*09zbB+P;%~N#FGdH~}xRzL3%fvV(9>m=OiO+a(nh`FfGOpsU^Qyq zq;nbry#rv@!Md#YvIi?8=kODDxe2)+-|efzwkH;ax84~MW)jO%yx;YbWk1{R5e z-1yM&DkN+uxc)e-v>g-?rq$XYtD>ak9n{2d0-W&c7Fa@xy<*q?0Q%zI{MPcYh8)VR zWQSe6*&UNq_ZbDxV*0SSxv3;mO?q2jKa1#wU}MlxoFGv$X|LQunzp^*^t6rAS0?3> z5%ZM%+~Ehu->ybyd>im<0R{wTWg!Uz0M&6E65#8DS?TG`gD?^7{JvxMe>Dm@^-J2q zxFu^sV`2(?#9|=AMiqi}bpH8NmMxy{)xrrO5|m(@k?@r;#l9US;9E>80LzkWz~hPj zLqHJ*Q>-R*myf(R)3XjmY`a zA3`SK`s`Gm6ijG3#h$tUP8j#wO(ss`&jzVphClJ{2z;5QemviMFK2_(*|Vi1L#>qi zR|KG1w1zMOZFR#3=KEL-j}aPA$#x6q#Met|`x%}7#(};UC|Hi^4)qPs{+3^-Nnxbi zMwrZY&IhZgO?cmgt$*|9**RBn9`u1semQvJ`Kz;WaG|KwobiYxk+4wiVFQ@io`zHe z1b063{^{~Cj7wbJbK>M2qXW1p_?(W`tr~j=ALfZySl9<*C2pW;#a9%R0;#@az;1{J z;Je~i%y70cc~x2p%>MH;7g|rZshKG^o4g#cJ1)RK?tE`=zo8PR8BWYWkl(&tdfx*P z=}=p#kCZ)&mLjD)HXZT;nCBFw6hCG=GvjgO@zBrexV5X~1FPdFf2OIM`uKd>RLWRV zHqxfD@aExVdD2&!dK|{s%A)mM{_}xGNvIEiDxhV3 zDV(b?AzmANdA~j3Faz7Rt6Xo1hl|8CBLjo`s?&bwVrno9@4(t~nQscD_v@G^&-j3C zi|Tf*;4wjUQ!=uyQ`*@k4IFCKp=2{%*gFWT%*{^sRdyuIFxa4zq71B2>~{a(luXn$ zQh#@Pp|b1jR9Q}k$wqmTI61vnu{!%>e)#$}-4k88t-^bGK8^Ve3Wb81LB7Zy1tSad zz+u?Dn>NAbh|PGTeug%Nax^o6>u6u8qO1&w{(1->v`-ruHG)6(4+ucs>tKJs1{K!_ zlu}ota&k5cTclz!cLW^-LG>-1Pvz1W0Yj(kEv~fpTgq4>^?6u4XGF;8)@6dgmKQej zk4mn&u2&Ipu!tFf3*|fp_?N%n5=3tuua4DeE<_mJV^r}>F(**>@k&|6A_`V0s^#w; zT$hD1*h=Wx=~#3RAHFgfm-VElP5<2pvB%vWJF$3t8IK$`n6&LkFpN&@EdZ)bg*u2}Wr25=Csqi?U#)Ob21 ze2zH}ycj?h*}@cHz73o!R(KQ(R8(MXp`J&+Nos1UQV%IpeI7jcSfq21?cn>uP7MpDoQTN2&oM-${Re`;gGy+h_Z~k`^Xh%T5~P$6Bj*mpbE%#q zKJeYUkx*=ic`>LZ{9@+Bdtp(*AN7Fa++|5}W0NtjyuNXfX`a{4cc%Z8@20CLK|DBv zvrU)D+*<7G5oUE~WiupaPmLEkq`#QnDR2U*@y8O=w2~tEfOXCKcu(h}C5G7#I!%Ws zYqnPne>IVU)rPC&AZC=24zTg^PA?2xUt+}ha^`%&t2YPcpL~C^E#!;xg}vpWp*m#A z(1MB;&+;a@wr>b6uozV8KawgCEEGylzn@@#5p8XI#OqlAkM|EONB4N#zAds6pX!#zgVcTol9;|?%o{;W)pNXd3lM{N6&6Pyq3(h<_MF6 za^TF(AJe01c|jbG^j^|22-!O~Bh}75(Xy^M9J>kjf&1IqnIcY5G3%XmF@VL}XYSzn zZ9Ap+QPTWqjFzgYQLbK%Vp&;#d2@5K@^is!G>!Z{`zdIOTQUtd_iVy%zkG&06uWPm z&8j_AS%Uj0<>zUPyqq=^pYUc~rHg0}&FdaQmxDufkp!V+4k>+1*l6?#BtdW(swx#9cRHU!@QD!NaRvqnLD-_e_T#x2pf)X=!uFJZY*hwz;^esrPQkC;pdj4#aKQ-*jJf|AEdp4+@RZihY#M zUEEYz7x)EtUnQ-i*};&@|JT>qvV)qL*}`aDF?0WU946ar>rVh-M3)8V 
zQIvrGgjBnRd+U}N4)Zck*?<^vJ#cx6rMzbd+SI-3z@BAP}@mVytv>y-ezrNMM#TnG>UXQxQI!f9@F-08uUD4_mB`G0!kQvTgTC<+EXbUuQBV* zoQ7vT>qw^0hr0oHhK0uIey@MOA2d_n^~xI#7ED^b=1Q1SsM97c0k0t@<|BHy)3&yu z^Ye&I!>8T^@DnBF0%#Z$6BD>;A~zc?PDK3=;da3?H7_Bb<_;U=ynvh>w`KT%?)>KwA)o)6@bI)gn{G zcC2@(JazKq3g|ReuTH$L|E~B|>#YMTA3I^uupMHCqLJ&mt>3?kE$Iq75yrC`ULWg^ zn}ozK9A81hZob#WMTUz0?Af#G>grBLA3l6Qgv`VId=+6HPR=0H{Ryj;c2aqD1S0n2 zNkfcxb#-JgePL|k;x1f!n-l<3m*VZ|QO?q(CA)qfNtpDyddN}OPI(`r<|6&>v+97ui zA8CALWhI(HVh4l*6ygE}Ly0j#R9f=`-^~WJ20m3?Sm`CX1KTHYS|;0(_Z|=9x0rDn z&kJMsE=fJ{`$0N@s)zj8&oVr_69X3EY-MH|S+*LY8HUxbxi%^o8ygccV8~;QrQpsL zKx!#nbT#YjrGNlxEuN)jW~~6b5-<4|^SSM2bf7#?&ALpYX@y5!VJ+?O@NkKV@-W35 zW$UoAW5`9r){&jA*~`8jDiPt=6gs0f$&{C$zg2)N!-xyNaihw2A^wneQ@RRMK+D6$ z4GY&z_x*cSp?B}fkIYkhUD(Y7l7-1Xuyl~Sko+t-P)uCBy)^5hQhqif8II+Zhtg=y z14x_fZ#w&}@-jqqs!7*lV=KFN2yfV6i^Kp(@l?|lao}L$u@@39m6Vixe0&U90*i_k zu)}OV{FuU!X#FjjU)I@AL#TvYa`eh=HY9VTJ+|Y1g;gkM(hHw>QnW$of{2LnH_jcGLnNJ}tY*X5fbHV@p3;psqIs#Tc(V zRz*I<1V&RD8e3>2z|{wDUduegWXL+42Ycam%C!K5SGI2QK}k7ecwjH05w8N76X zzsrpq*C8)C#?647go(Gl^ULL*Vm}7yQaJwMP^@_vX9xfTh~V>2yrc4AO5Lk)d2$`l z{pf;%9ZyH$??Q*;4|VZzSR@~Y|BikP^# zrgW_}$~#@p{4m+IXQ;)s^-Y4@ReVi$c6~rQhXdP(#>cIn?Ot)_2PL|f>jWSFQ(6;; z9hRG5=~*plLF2!=Fi@iB0EO%^c;s2V`ilec8(3Ck zls@cy5~iMWiKLFBl&?pR^6vR6g^a?L`;S{1eEW60UnK?f_n&;DEz+Y3It|h~pv8IE zQIK{U+sU0+UtbT3V7t8hy>%&}Xo>M}B7{Z*=w7^M+d>~bRm*SGF}fkZTu4ZC(2RaHbv6cUGCBuRUqC`O3OwoiAu?qE<2DbZqNjIz8mz3Un))88tncgUq~+w|y1Q_@|CGd{ ziGYPg5P3MbL#jV>){Gv8Hx>>hE9L!|+3fz$ z4~rF33RG)97`*GgB)YDc?!{x;#~FVeI2QC0kyfR2<4fP0`Er&EJV1eKE>?A88@Xcs z@mbZJ7-Lh$#Z!8omgjf{CPl?OJ$7NfqMo>XIeW4FB;qPHSiKuCR@Zu1G)Y*HBh2s;LRB z55&4hw*=6{VhXP9(W68RGoOVK*Yl6K{N`rlPG0Z|p;C$}ev~UMZRYgS!N7p)n9$B8 z>bKI<_dIFJdhlS&o8}T4qZzigK$`Ijf>0#4w=1a9EWS-Cp!Lau$~=BMkfoT_Ogrc$dI#D|dnKe_MsVt%AIE|wx*flxw6gLh01TnI z>^2P8kEZZ6En*Al3Y^;|v^;m0R>mmC zP3RgAO#jroUn}##uxEdOzdwrh)5ngXrZVhzN4~sWbXr>4`SYp>9gmcik(o&AN?k%; z%5sovlm7WlMV>L;^i+Rtgr@~lJbNid^YaR-*rKqZu==MR|F=8RI!eQ{3BzT2U(Y-O z@lI8=EFladn793G@T$GuCp{)aK6dlgt&$DUlIFvW3#6dr7@%~9C>8qGF_lYb_CgW1T_IGv$T@@;D_2{|l76EONguT-_wP68PzRd^FS)UmeM_3PH*kbQ}+9X=?BdRQbO&KYi+4%k2_bWus?S|KD0+yP9h)mPjh zVh#-oQaN!#M17O-y_>*}Z{50uFBT=vThE_Prq}|uICdx}x-Biyrgu2A*7V_muJ+D? zf`dzH2AiCz{Ym=i4F_qoQ_%#u?=}S%YBtj-$S)*X7VFYmrywpn+ayV6*p)&A3i z4PNptWjRAa<0Q0^^70-~pWE2j5RsDm_ltTZFK1`hh2anbPjamwU4#rBNN4qxC%fis z8gtvxglzpd=}`6c**1tQfbxR@M9r5+C!@Y+?_N3q8$&}wu@IyC0DmFvsn|=*E#F8J zOL4OV#>-P!BNaye5EJRt{bh%@qAmj&DBlH1~8%{+EK(TL+%C_lGA3yxkzGz2$-23HRE0dbcfk z`{7FiVRE`QnLH?M1#pG5SWY}YO$9w<#7d{x$0P4wpV!e*Yq+ym=O(tQXA0vqfqe?Q zKM+7Ioum;pP0eiGv{1Ae>bng9>ad07-jr#4g)^-O7B(NPUaWPCgAj;xeea&dfEp~P zH&s;@=Whe>>gP58mb!sT!+g{3Ej^EZ-F-;N+le1@buDwo2GQ3zSUALmglMlMUB0Z- zZvoka-QmN{g!1P8eQR;SU6R#-=tRAL$+^zyw;IdM+E|vBRV!LMjEqcY;7lA zU45&tBMEU0>(#k9md{a{Bx23dyBSzq%^khSC$}3$<^O&ed$)0{P8=*Y{A5mho^>PM z+#8=(UyPp0db@-|MvbBS34=>7D$Wp-O)+QzD1SSsS&8w;0QSD>Z)2CxSpHy+~#x0J=>24 z*R~mc$ocq^Y3{5NRdsRGM~D>&MK?|?ou#PbfDOR1*Qx&PB2EARNW|KO5fxbimkQ(? 
zU%q@<-pbyxaK;mRsol9{5D%!1Q|ya;`|ucATu>fPb`b#Gc@fcLp` zcLa~|+B)rLH>WF+?GLm#v}AWNN*OSLM0Vrt+qWxGgC%h2ok&H~}vu87B1T6z;V*Mm)K7U?$+UC@$ zmMmlGS=wXAj-i(Rs!qQrgh|5x>A>fxkmXU>BGg|9ES78h*GK_Zg`;?->+_gVOxI-Y zK|=w2nY4&XW>L*{C*2{XzBDT`jgvoEV5i@x6o5)pg4#JBoDlxPdd~N)!A=9ddYZjN zyMt=B%}?HG8iu8oX9}uDiv<@AN82|mYt+YU>V%An@C$#vuFJZVmgY?&Rrj~MM&3Py zRN(qYT;JWByRHP`mh_;KQl&9=zjY@T-Uxl=Jp42|k_)f_a)63II* z|Ns55Mo`b!r-#=X6nyeL&dmB7=7cmHR!RkNiKolVErIy|0xdPjJ~R4rtz(g*RXIg> zs1Fv(|MwME3Kyepb9CLg(4F-5-o5&F@s^hEg~KD)m{4~{U%ZGwN#TCSl0XKcf8g=E zX}FZ05b95*lxX`?B_w3Ixh0El^|vefr;U=N3?w^B#ZqE>%d* zBwTZ`yDORvgQ@>eaQJJIF|%!?<~5~XUOu#Zd0+gDD*ElXA@-|JDis8 zTkW}Yq_m`jxCvdVgxCzlmx|s`JNgcl3|7~#T6_TlOcH{Edl*<>x{O#26P-+jc=4-O zufW;c*4?_0pPzr>!W(rh_CHvc0QH`s5#9A#KC*5O`nyO+$eY9}MwlgNM$jrN3@0F; z3dPt^phRRO$?SQCqTnO@q$`OvXUotO{d!Rjg<3!VD>Af~7!AGM}? z1M@?NK8rdizYt*~M_dF|yNAppzJ!MsM+b8q{Yyw{GQhehuL?paR*{M4qK5-@~~|%rL~Lo?kGLk6fj`03Sc!dOopV z{~sEt;1agV5i!9fLZP9dkRW&4q;`JF7iQwJdK-Irt}mxaQgZn8Y1`R1-Mn%cCqqI) zdOp58JetK$D?SYt+jFevMZD+Q{rmP!{dhKrHfaL?b1gppkm3?KX!c|=Qq)Y7$Mgs# zuRdD5cc;_tJ?;^Ci`^`LAt?Y!HK$_^X4d<$hAsiUAz?t))Y95~=wUywZADmfQA0xu zRZ?BeJiH@mE&+yrKxU|@^pSTK#19ap!~*KKU8kh!=>-$6U)Mr({BJaH(?2&Fr}nM4 z*8>uwI5j6!k&(d%?rR8A#aAmvM8uqDW;pbC=uYNFHOGoquSAb$GoNQjL(Q5Mp%y5E zr0G7wep#2!YX2h$rKUA_6kLy(-mPj>{oRoBZ#$0-)7{ODzQ-hIQ!&IaT4igc-d48B zRs7Qj>CihEM9-MC#Ys1?4Tf!NTl4!S_o__8TL`^)Y-D7hmGZ9%HZQ|@OS^0-!li!D z%=q7`efu^CCLyf-QdhRf=55PxrUicn&>UJAeVkWLgnmNe(rsngU9uXoMhdShm6UMu zAg{=JgANSjV)S)%3fIGyJTx5Luc5)hs->mjbtyX91M32t`;7zAPuUBW@uh-PmUY0U z77|0hQCWTI!9I+7#J$px`T8Z-u5HI%0#bwR{#8|bT!x@vOg+C*1^r?Jr9&?fe9@3g zTsM02`S76x0RpxPpJJ3a=>pBOSjcLoiO+@Jp3&78p=FVml^qIf-*sOIqiD!-LxCJ^ zq;`qnKn~nEO6bu*jxb|?|7{j;bU_RE@)bw=4$4^FyWEp{d zl>HPTkWx$WmBXx?eORZ}%k;TxB0)@9a<*kA=GkPO zWamJy0P|NLz2A$}b}U4J5e)*1>j!)>;8#$~cRrc^!#p>(@pqNE7kNFA zH>n6&zL`fLnC5W?Pb!JY$7}zeN1#b5Ese5%UB9iE^ZOIfF9GJwRLsjXlI=HH?Nj=P zPdW12_I~n;Gl0_u7Bp;KS1g2fbWeYFv-9x20H!QHAet zn4NM2cnNLp9nd6|N~7;lD???ROIrqFj>d=zq+)Roab67%X~P-Lw<>DPDRCOq>E9&; zn-XP>f_8H=l)(*sPlrAy^N%Zb&rZ#`B67pYSs{*l@#pU6K+*|v;nOqM%}>`(S?%3h z0kR1g07{o#R2o!bWY8(*PE^dgdYo=Br`n#>t{e^j@8KE1a4=l3Aou-Sh)L)xEbSy-$nMOP`YB#gc$ALw~? 
zHRvh5#(#@G083&T2<$lfdGpV)-n8CfsD!>2!rp$iyMv~^bpy}uhv@~Q5cK6BxavQA zc!=%i(kmQOCdS4Hju;XOZ0Dq%g4|%-79oV=0SxOdK&A1E_G6>r>1N!Wj??(Y9%2~J zsn3zIJc^m5j;{@Q$xJ>Q)ClexwFGHVA8?MdEnrF1qt=?ch5dCImu;VGYWD8qQIENk zs>CJrY2ELK&Ck!DjN%BO2->BDxVX3{Po`lpnpU_h@{p8z<=GQamX-4Zhj+2Z{n>6` z(1I;RlLTsP{zH~?4~4OUyYiyI^`rkS$9N1VlRw@VQtTRZ&0MW4Q%%WCES;~p>4=A* zOjRuljVfW5w{Km|#Z_^iCbf|&oT}k&{LjP1OE8Fa#F}QYb}4S7LMH!6(L(3xx*?2X z&DC;O@VL2@Th}2+^M9T{|E}qw(&p;|KlQURG!nB;e#uf+9Sve)U~tfp5ZSQx4`?O3QJr^(ysKS>qJ1G;f|cAGDG#=km#Z`t zu4-p|>%R7X?i2eY9ZOYIxP~}E6Jjatj4@5;_`f&idBAa5S z6`RxCsAhXEOPxOa#6&_BycMzjz8fw&DAyK7Bj=W=j%`C-c5L=!fR#s-}iR zGc)hhO%o>o7W+Sa#!*@xWfjquXA5^#XImFIJUyMWDya}PzW6CMQe#Lt=-?l~QNd}} zE=S2#Q(i7a>zuZX|2=?|-b-uu@bU2h4tM);HkWX008 zRQl8Q?h*L^FZ;!>I)1w=p?~XHQT?Vt)i*4?keOH-PH#zjvK>}H3x;p81_`1KFuP5M z&nm^*+1P}1E@4@rpJQ481yJ|Bn2ZdO!1aJUTwGkBTTH}I)7sI=(=UADVL?KlioZLa z#g4zX)KMT;aNV$b!^v-}`1#>&rr>{}V4e$|fecR$Nl#3FEaK48Dl~eb;C%<9^E9sA zT$&J<)P4oW-oI#-pg)Ov>@qyetaBwkOrtJ=zaI2nVGO(Fq9PLW&~x2A{Nze^*B%3d z57;3A#jJ*z==$}6o*p9%t$tgiY6?ULy@*Mk1+A8yq(j>-P;j}q5@naqFI~0a$kPW9 z>LkUm0K&?{KfCy0YHKf$77qmo2-IgH#h6= zPm*uVnj(~1z}Ep}rvqMp`0ydZYhaDtlysf7(GUM5NaFrF1Ao(#kkN|0qWrQc!P^43 za5cuJmsmuzUdKq`E@0S3Qag6EynlZNTr#cM+{Q1Q=5*JpAD&v^S}$OuJRoueYi6wN z+Hxg>9>!cBq|Xixy(dO|r!w;-&llzFOXdwE32N@s_$K!WPE@c(ID7v=3@wd_rgPxE z!ChSp>w;)W^|hPxK1x_<9=*ef6DJ_m2RO3DrN0qnB%)4JFR9822&5na5*Bu`kQMLI z6+nY}e;xI^8wCY(D6at%HNng@yR3f2v}`9h1h@W1$%i~NJguuFEtDr(7qkL$O5G6G zeFFJI*4E>#3-hj!W(D7MJoBWR!cdlsr58wTCpFsf{kzZfy`Mi?ew^)k{Q1+JHQTpu z4biA(U&Fz2d`)Ob3757;DqUyEbWeYO^)2IDUKp>KfVc^Ox$H4)K-F|KIHHg5U$WgR zBRk{4Ir&Ia(+tfYhaNtDn}`TiTBc^->GGD(hu&;ncY<`}NK1E3!9(*G2Q|Lgvg&L> zjM)Q)sg33r%Gcu*X>4lRdZLyx$$JlK3(}&{j(KD8#nqz7xL&B+;d6M^_#NfNO#&Ud zvy&4uJw1d|-;9Fb=klByItr&4a;UGUQ`S~wID;!_f(i#}CBz!-y--4Dc~G{T2gU^Y{C68{k9P^k zOmH3h^2~Lk_&^TpmC7=)VWoG_YU@;Ecrj3BaBMI@poAkDshY$8S@ z;Iyz=UXyhcP%pd$JYkmw&`|So#Pc*2cy4=ro&-?Q5#1uKp+uwem+=MQiy3j~=M}_7 zzkmOpkqsQS(}q_`klL|>!JT30(&%}v6J};+sc!jG8~uNQiMp32b#9~JN$q^Ky`)H% zZ>LYfX+N3Mi=FsB(c;ysXSmuj4LR)xi_Up$Y7+iVEcrGne6+0)SjHaL*N=Si@M6H~i&mFgmlQ=5#g} zY`t#Z^JU9?0bYQ)dym2cZx?ftlVJD|&;$le=nOn%cpQh&k>NTJcbAk0Yf5X)uF^5v-4L&hva(gvBMFx376X4?Ofw_n@IhPMx3pePrYDx0?(MR3rXJHJ@s9 zH8EDeP0C^(lloCC*S3CfB&6d*L)68zG+C!uus<)v%%-kK38d>!?#LBv8;8qcuzR=T z+W>Omb@?NaEF$x#{?j@L=91;8zki^AE;#!-U&=vw;+yRhu0}hKpnSUby1e_$&RVC5zu1r0HIW1yIE{uS`yWxd)!@xG!wA-VagBy}XCP=(@rinFzwo9>^VK_=D( zjU%dckUOIZW!LUKft{$V}t{KFsXR!)8 zoxh9fA5NjZIC;3x$?o_mj{dFoiK%ZZ*&}{<>2QpHI!gO!rBB!Z$25A7$A-Thm=$I> z_Ubk%HKP}3=)g555wq`liqw?b&W#$Pe|pV)IdH+$PYp*teE8Fdim-sif>l*p#dcS! zQD27SOKkC!COd=u?{~ySNB@|e9ibbd={hmpKkxHh^;FLk&HDURb1CYc9z(B^MF%Fn z@r7oVPhTz7jNCM2V2l9nn+-ejh7Swrm{jdy47rk)CLkbiPbwaG2NVoXt)B$n`T-K{ z-yiKRd6B-aGR6@yC&~48!sxCEDID+xsyPRcwlMooxO^Gt6QK{q3$fb@tk2JH2}X#G z4GkY;nUOZ2<$|kaa{nPcCi=l^p?|q*D1!lE$=JM<);!sV4VC;;gQs48TzPx=;mNLh ze$QV_9GvWZe0zT>yov9ruROUDUwu`Qzeq@kE`ll7I3-cpu70`a0bA_)DimWY7u9OE zf?cePCD_KsjBT=~OnX2<02xl;e8r(x1g#7DWrFtp@jZhv3^vg>r>3TQE04x9x-KBH zc^^i<`VY!-gA4fnX&n>wDt&A_^!ow0<36AZLa49Dl)T%}u=vrVR=?9+18*_ll9H6f zl$RjIf|dOueFxvmY^XC@NLm&DYojH;HyYiEwqH$MN85EoZgn3q?cGt{MaE?Yk? 
z)~Z6ev#YD=-8;+GCAtXeSP$TPXA^bkO|!u%AIsM zdbEuc1!UB`|1RMJhk^ww9hzy@2f;o)Hj7f@cK#TFn~!$ORr z)uPp`fN5!7G00SJEqQoO*5BLQ!h*Vu6&%;1=GN8#uwYXBj7FLX2!t)%smGNH6#|g3^zwR>sQH^E=$$A&tq#2LfcL&DH3{@b{h6 zd=ac3n?5_?{i^)z@FkJKM`%|ES-}}PcC5^Q;Z%AnqSHUL{tFkz(_Zl@Y+~E62_CGW zy81eqfuB5egZ|)Lq4;$8Tu>C+Lh(qYe$QUx@9DOpA*UV~I220LSMuf{qHFC6_i?S9}~M~y?|V&)p6^%n+#W{m)xL(m;e zbG5+R0bC(4b{>=t{zb2*Q)u1w8gr}zO6T9XeXgyfOD1)X5ps**;FY)c8Ms<1=G~BnZgc_FmT0m2s;WTT zV%mDl4M)mqu{#zN&VMjVd>V)X^|`jB`ptw{h2AhjYahf<5zWN@m6Q`?>FsTC_H0#p z>fK5+Jx`y>4iR~u(O6#&reHQTb20`er|(ocBd3{d_%i_o6vIh{9= z%_c!ZL+}fLB}FG+`U%(0JC(I7X7XVE6PQHz+qd{x$`CNycY0|ba#sE=RR%ZAo+0wT zPKZdgGx&h(vz=(y9GE)DwuaRvCxY~3XSJD#aAKxl$pdy>m;6 zbrD@%@B=GYg~#s;r*G<G=S6bXMBn{`TAd2^bhCe&K$+bjeu;xc8GceiMTIZ;1B%1%w{QLjJM_Ls zJoE@`REPA=jhb^(Q|k8zQ{;*)87S)L7=La9%eV(l2L#UR$_R4ZIe2U6Q|tA-XRm&I zMvZu}qM|nYkR{0yV*M%j(6sdO(*BT8kH z7^*Po``rb@2O9bOl88Q15@=_h!&r^=2HZd3j;`b$|G7%{NQxpuLKLS+74Si{vhq(N z*jYP7tK-{lh^nQZ$3RGdQ#=yY~!(Tv83qgXz-KM$6Bwx}vWLs*CDW=$-v}yUg+K`q?N^ z5!t=V3fzzA)wa1LY-*WjSlc8`ahDVj8`D<|kj=77$$##x^?^y-FM+xj;*SJ*I0yXN z`grk}+J@bQxjGxRY+0!$cXM(&aJHwqEdR2C+ie9FT9p?EYU0VbIP6(4qPUBwdF1fn zl$4Z!*&Zj9IQPwq20^Ia*S#!*8OlF!^>mpfd83z2ChtYJ_swoe?7(?)M&;RUk~Q`x z-Y(v~`?f^yyPS2Nq32@@tnO8EQ1j9~N8?;UxiHtas{Ccz(l{^o7F(nZ1MPFS_tQ-g z8!j@u$+*(+lf#C$ptXcYH>hWE@%Ac*JteQLAN*ORCGD4l8u#7I-_=;+GS1=pi28}b zQvqGQnyfeOuK6r-x8k#&yeHH}Q189-lg~JCM<;N%iqfHbbm->R%Wrz$sJARlx5$V- z;dhK@R2+tD5}I>WtfYk4?B!B?SjfK_uKyuxe}t04Jz^WCfRf)M-oc9ihdaKf(8_%2 zW!A0pQ*bq%(y%Jj;AptDuj|gO2&Qje>}6i4ob7yGI5%}iA+1nt_ z4P1&ERjXU>k!=v$n5O(`*6L+bP4JX9dCjFo?D1Q#kpD_V|7I*V%ctQVPUgvDbt+t} zXk3J!kcph+ld;i&wnj@U1l{PV;s+0iSyLS+^WQcp6wE_UFdIj60t|w`0d5e?WW?j2D1uNFAyUkY1&Ek8-JLsMQ`4%#-G1>?6T=6V*nJ>fo zv=%KayZmk_oFKLz+1YoMc2hQoafmd1wECc%T<-U{Oy-@Ere@7m>8M4+8M`d0Y6)o7 zODnFPQ(Mq^a_3h5JK)qJ4CIYGuE*~%WsD56LNqP1uh8&!v)B(Q3d}h%b<28*8bB?Z zUrpQQKbg^ly!FNY&{@nZ%Evmp zUK`|TKBT2cT`!?nI~C2?m9uy6{DG4bWnbQQIq3@sbXh7m#g;+bvrizIiu&Wnj%`li zC%ZE*!tdcW_fVvmX#UlgF^LwTHt^hw;TwHq?O(WAM?xf)Vq9vx6D-+43B;^B%GCo= zQTy^dG~7O1tnfc1Kr;FrRQf4W6B9cjiy-WCQGKt+hDqEf*m^2|QbN7wb^7$DDEj}( z9AKM!FbSKGrvJPh!D?nQ(q&04J%u$pf=X-rMq3Z`Z3}az-@Mysh%-+?hqM1zb$Zz6 zty?Pttp&Um#}AyI+6o^Nd0(&kiJtM;GQWDF_0wDYyzj%x z)d7wcvFhqJlZ6$(Ppc~?fN?ubKEFCfzHi@xh4!WNST>zy6H@u<^-Jq_$3951EPE@y zZc>4K?o8Oa8jg`k_VRy(1RUAd9WqZgordW&YL?EzQB%crSU! 
zjTp+bYq#Dp6jIR^ey!e8yW;a>r@YlmG7aP#zwi%MtzWU1+d^PQ^ZOGcA^UCMH2b^+ zvHZ8<2;C+w$vy0E^j9=sm=_&f2PyDhT zg{yc^3qnZzpFrkG-so~s@YtZvPb;mu?MB2qXr^Es+GV`|>C?_7M_hGlR|aM}Y^L56 z-;Wec&A%(A*TzjiDO=5-;io0GV(EX&)*bH(IFs}IL;1L_{gZDqpT?K2%}(s$Sp4M^ za9Kv<%FdmpZmgAo8E@mER|V$0pN)_2>(EdGo36g!P|>CNJ-_7G1+Me5KT$qFVk2mME#BD<_T^g6#qTqI_#dtSimv{eCZA?}y#5b#Knm?!K+7@E7W8>71 zUfk3IA<+3xX)0ZN?!JFNze=0Mm#=OvbnKA#6JNhG7O8^+ET>m~O)2#ql-^jEG4+0r zx{SHM%%$Y!pM04wV`2_G-XyAH*)~;mXEnU@sWtBde*R8AHd8YIreF)4=HkmKtvvTN zwbwyK`8HQs#nVpdWA~rl_(K%cb9Np>_h~#*nt}`-7Sc3!4#UQRBAzQ0lXUq zJ@>&G1cw!&H7V$|3XO_Nc^t46u=t5(TzhgF!Hcl=mHhkIb_NB*y3(h(`1n;J>2T0# zcZASRBjIqaYrQU2U)I_>{!liO|Vk0)t*T2)8yg%u}@dl;TAEYPS_$FT<%;ePQAwP*Mif@ zeCERwPAV!tdszLuX|2rH8gpmv)~aHmG)c`5U(7rGBf5s;hfi%S-dW|1Gq-_`$gWxux znJr=P%aiOse5O7)&*Ro4VNYFNf?QcFz49V2hs~`;ti-<7tm)K*BebE>I z8c$1p*61+RVBa?=n|%=`TzE3iEMGv1T%tNAY2|$|8`xJ`w{P{n z<}1kgU_=p}M2KvRgDfp9Z1Q-?*I}pwAAlaOfc`I^o?lh>o0T+zdT93ZYbV%tL9G!T z9Zkn3F6Z2?@tSuCLBcwML!^XP(OvM7=#p61sHA+y7vh)GDA7}3OTyh|6sI#~a}fz(jHkus;fpe`XeJ1Yoy>g6ZV zWE>n_99Is0wetm;_><;R-RnS_4Vw+8hCI>7b&2+FF#f^@!8TTJk@UV(6Yd_2t6b<^VsXVSL=oX+f*Hw;sC z9oTj|+S)4FwA++| zpYvwrkJAvnZ#4=GZ1}MJIFeR-zirwcSMWm38e?A)`Wx($pg`jKEvbwkDT;~?M@d&g zV?rZ(2F7wg&s3on$4v#Wog(KVi&a;Z7HZIv;eQfF2$C0+-5@@6?hsDuBwNx|r(Kfj zuP{{Pgq^2WT+8bPXr0M;Nm1Z1P}d|-^P>c>_pjinyj;4Cp||2gT|trMn+#i z4Tp*V&kX|seFPH{kH3`~bj1&f+rbA~n(N}K!?kug2P>}{zo3{DI|do8<)2>&9U1mr zSexXI^ZXHtNTyZZda z3n0Hov1b~SURE}?I02Q$@M;4CgF~>+0(lE#h=$4rG{bt4FSewExC*}7l`WRNQ%Bt{ z-`JD+PbIK3=%Im$8PmG`)@{MndFE4{n-SRUE?v6hfu(Zo?cf8P)lW1<-VE^@Gvl)9 zEX|h?8_i{wr%7^dU`tEn?+T?j+-ngFgY84 zZK6KeR~3j(2P3gky9Rpmtt4+qs_-y$cB8iw96+=RaG4bT7p@c}Z*7R-8~03v^z!Fk z<7I%?a#-?&Yc8#ruE99a-(St9D=L3Lpx~C@*@jvQ;d_kh3#>X{@)_O8y=CT$&S3pq zLm$^B{uHT}^!K#Q!5c2NE_}b3`>Ah&tA&AU-Xc`6k4*k9%3BQ7~CP?J;Dz;0pFu^lX&?4)mBz|3*jbm{taKEwLgjcW~u zG4Pt0n3$Y=`5>A<5r(CdVnFX@+cqf`o{F?4-YhtH*Zs0Z|NA2wVZ!NMw0PmdfD&Op zV+x2R>mf)3FZ`4wgoJFdtqnQ^Y-ejh^#aSxDp-9&U{F|C*sjIK!?XCkEh~mE;?W7d z-drQ+w*}*Qo}=vt?`f%c(f*vKed%LlHE`VFvPBT{hIAmD}H8nQm~4|nG1zi2M4t-CuqT*XRUbri}Ei&sOdu}!|K998-D+x z!&G;!-qM*+mrmaD>a(Pz#HEakj5~K0dH1>)lJs>iK&EN*RL5NC@C@!IX6{i6b2re20w3Ux#09E4uC0!P~O??s(v?_^zWVv zEtu8cHs%P!cU0-Fi_1%@8`NQG(bF&Xia8lWQF10`CN`6unhp-4{K*?`2&$=BVNfN+ z4T=yr)iU&vm%nD8fHs1N)aSJR?N&7?erXL3?Wyac+4d9WGw#&^aH?=TcFY%K-t+^4 z4SXc5r_8l)djIjx0=CYp%{|{~So`@|9s}8AvBS$GWzsp$cPG-_nK2P0{_On-nmVJ=vLtvGE)2;X-OO6}YC8WgJ%ASM0p z4n}RLrq$Sx@KE7jO1~AM|+VKibPbzjXYCgF*2ii)4=# z*yrlX?&OqaO@dvIua$04&GORtOX{JfJ7;xhND_g8pWIpgk;@U*X~2x3wf{qyz(C9< zJBP#1@0`V-zq}`i=i0SM>Upnk|I6QSU$D({;1xN6axFJ4_<=^Ags|o@?u>F6X ztT4wxCwx`$zxs^xLMlfDn2qxO3<3U^Kd3th?CX}ziT*~hSLsBb_P>AG4RZUHyl6dG zOm60GOizFI?8^91jo+*w#qa{5&HFuK`+HpeC#4@yw<||#CYYLO+n`+x+fxmIb#*F@+OvpYPPi`Hnz*4qJ9@om!SJEh#)FBBjeSr zeJF%RL)RvT(k#g0$aFsmCL`8}+BZfIkS7Xhv~P!=7Ji>#3JY}bCHRMT)Yi7s|0`DF zmsB#8Lj~^pbNn2f?U3HDmYJU?4irU*0j5LR-zM<@Z7 zVSp!(!j)e<{YZ8uFLu90Dt!ANdB#FjOY2l0?Q)OPIClpRkAHABLVqG7sP3S&ufvPB zJ6p}RHwaIoo^5X-x@4FUL88OI!?*LRPx`p>a%3NzvNw-8-)tc@S6(G{x!SVSnin2xH!0*kjW;=<^okH<{KfkLRh- z$?TE!8t-%g`x9p;)U%&%V>JqsLAs6RmHs~H_kFbJ@oA1aIySQHb;mwhkaIp%$n9K3 zZv(^G0dV_NchN#j`iJg=%k{1K#eS@hM|4U^@NV7w`$@sG@EuAvocx=AytCYsK_fy< ziy{ZVfhmL7l70dhe_?cN4rS#5{I_VhkV1Rz>=|5-N8=vr> z%SPUNuw~Hf2M;)e_&mUtL4&=raXaMlW#FBIB)4R%N%$ ztIe61ZNxSW-`dmMz-T076h`Hq95nx!nX6dm?qG*N$N3G=7~xX@#!^4MpYfUG=jLuw&k0;CN|4oYx5Fl~?aoIRGa6(!ZeNu3e{&0;fhWUCF z#_Yk^QQ3z5KDZZ9nm5!lWNg&6CN_eiT7}a(Wgs93omi&zYW@U)8AczSaMy2a909}8 zKb#XD3@}={gExovZNsjV{mIwAGE#(`HZf^~H@nR~!p-E%m*-Bu(oi#_v9RUgZ-RR! 
zy;{1I;a2pwCHnwrq84Ut>!OHog}Yp2+@1}4#_{dxRuqYCS1hsDuV2r32AhbP_gukS z9PmQHd2Gq0$%f@$xMvgabc7u2e`qn`W$B5AGmxGAv17`%wP666)wEUn55t5Rm;v3H z6~OO8M);Y>deJk)9d+jiQ`8S~`m4RN-a=6THYx-^a`7VA%bUMt zk(`4&W~y20U2Qr9Ae;>%B}+)sHnFg49mlm={@$i8GNJ92zIjCZ6ztt-4g0Inw_xE9 zU=g$S)Nef@!PK;0_dh&#fZYjgR(n54*CQ8mSLTP-Iv+qLfg5FL|3^q}(VW`F#+~2S z+t5JO^Ot%JiEdOGu z>`@Bcy3gTu!mOVTy$Y7iw#$qL0bODQJ!5KG+NXj73BijSJ$QHsdBG_zwi8SjFU;z{ zAb>I04lz_P{`Mv@Hv>XI-0a5Y1jw4@| zwJs73W0p<*B0B^k6H9OQ9>*56$hE=eV?JaRBdh`xhEw<_hADb&c5{R)^qGajaPfdH zH^20iu%N^qtnWq_rOG_H5jTrL)q7fljT~^xLC?oGlKXFHT)_36!?s^8@YMvDetf1} zI`;AD0@17$#AD~`FutuKv*I}`iR%2zraJsw$p(A(vOd+F_6t9Bp})fUoBOj-j(*r$ zow43xgr&9ShYx2*OB%3iz#)jl-F00|s5-tI0iaNnvTI#d3-$$??-i*=SZB7`r4n>8 z%#%bTe3fj7U7I1uk$_+$i0F%HQJq0VPWU4V8(|>dLcD|oLg+bx(yFPg+bVIGZ&!~P zcX#Bk!&mzPmhQoaR)W?j z+ZlN&vj0#5fUGs#&A9!{n}}S27>*oWx3S)dflj#U7m0fl)u_HgT>_#avA!x+(um&3 z)zvkCB9)=D0^cc0y_R&_l$?Se4xh~ch=1mvQj7xwe$4`%jPlZyC%2lvVq!5MT9kcU z3!5s!@g*s$e7-yG>_yOYFYMM{fBsem$(K~3l#;Sm`#8RN!K%Pr%koF~V(n&0^|8cw zl=s-I2`9)aNpbB6SqI0C(ys(Q1JQNE?$l-siSBAUGs0ixFt7Af}MVzr5`!v07}j6GqvUJ=LZS-B%Mqv7p5M` zsyA2XhvLA$oa}iJ@sgE?_Cx?h^x{D**(HqTEz8&^8v$c`B*S^K#fiZf1?_0!qG0WEdb)%G=dpm7a8or5*_dsyObPw^uEub%369Yr#|&QiQIKVLYEGtK;6 zh~CzKTKF`nCgr-hO){-NIP5J6*)>}PGc6?P8srre`rz?qB+I;l>4l-^2R??ndbzpa zD7td0#q;*!4~i;PRaMPd*LYw(n4F)6dAC$M6=OImm80Fw=+ItKtvh4Qr!%Q|-TX;L z6i78VWh|pC67Xpt9Qv_yPs-{k_rdC`%p9&TRO9p3ii(Vk#BZ4Rwnyi2%NOsXjsJf*71PV03R35VE0zb}tT=jy`(ya6OIzA(-p*+E_N%7eZ!3h~@>I`e0p3nfD0UbTP z)>c@~nOa)=_FW`peqvE_2Hyo-VR{C;A8N?7$dw6G38@VL^gtLyccmi|(h z!WQ#FvH^m^@U0=0gO=ve8Q9@TusD2dZDqO{d`f5)=JLea6A|^gh^$E1>S}0W704ba z;lgHWD^(O!RsAuuL5$pZ;C3fu%I;4AsCFDjtfol@Y`N)fyHRlmnv9T z&r9k2y2$$T>j%Qe1^FzLAJV#nHGRvlhtFisI5*1^+xj(?%S%8#z+(6+U)?DPKBAXb zN3Od;0?l(h=BtdFS|!3Jo49d5!w#C0acsN35ANF<{3e@4qr!JI?d;*lCo5cgWRdtB zi1nkWeO-B)sgS%a{;eU0$ZP_K0UOlV*RGhGQEWN}L1=37ajWtB(p=xl`3J`tlwpHW>5XJcEI z-?XhQfq`E!k>%yakTr+a3Tt!_+d&ag`)N5kuz#NGG$Y{YbNFp0*suTns2 z?MimCs*x!65Md&gj~Qmx=F#OacFYvxk)~+%5cS?cJy{sHBMCe(h%45%A>X*uwGI|0VKy1Y&SU9yibSjUxK7W1(@-AHF zI`Ud;{*C-;-QC@&h7H)#oE#hkCqjd5wxn#RS-&<^{GnUl$KGCEF_cY8Os^3_;Y+#g zL~jWey4#TRi;M3!r?%a~4V#paq|@XOCpO;dC~?_$P`H0=EYgrltkrMTedo?*7?s&r zt4fI?rr=Q>71R1z7mRvNpf?ylhReqk>iz}7McSt+hHt}sZZsgE9L_AAd@~6aaV%D$ zAlGvb?>>L}+Mz>-rs+F(NXkI?{4HW{;E7-*bNn}FTNf8%FLEF@3(yA1Fc_$5Z!whv z(5!&!BDg=lCYzhQO4QDm^I?^#_Rj2fIBajKUDAAk4R1d@Hys zP5Hp0#|UJHatNg9@$5uq|4b7d+3IupD107G*nD9H^7p5~O?g0A^&8Psv_lv|yE3?F zHcS|h`OPL;1;Jxc!Dq%HJ)T$|=loGba5MBOXcW*vfXSfu9B@wE>o681Pg@X(MbRm zF6YrzU_gV9`sbggobhCA|NaZ}eHfMf{g?XV@B*~GYk?K&xYxpKjd^)^lpJ4hlSkY_ zkcEP}3_M{IKHiCZoPQUT8aPj?MpcSRN-+W|+m)45^5}lQW?BQp(lC3(cN8zjmr@BX zEBsh{l~5e`VIYt4f#4>cnLNBA{|#=DK64+=;uyERE+Z`gSqqRDW>77RAogQt1g>1m zTk{g7W1r7x8v452ygaxy+2P#Dyn|{uy)^>l5W>3;&IGEvn8P?xks!Rfd?@>AqQ1<| z!cyaTv<6nOF-|Bt!Jzq=t;@j2$EQ{|kH&+TFe6WQxDY<47P>z3(-5lfP~P`K>r0bo z%w>3Idp$&Cn}YQYsy;dG&mW(7*h6cAD{UKr#hxqOe6C3ymr_$v1URyj2e(52(Uzvt z>#x zw{!2^I{_fdT>{W3Hp9N0$=KmX2)0_407(hA%+P(n428A{G%rkrI$@wEcP3s~QB>zG zn#f5NS(q_|h_g~W1*!%sN7^k-RpRSFkX&8rTP=L3dd~>!?k-mBPv*L%udffE!$P=S zik-aPFoOCPcuy5>06whGL4zPu*Z`Bd#5ZolypHnrYveTar_rq!oLESgv>~%Ywk`Mi zTnxkG)Ln%)fx^xB>@9@)2|%z0`v5oIyNbskDZ};`R&%0ex{F(cx~dg8$e|IGu*@OI zV|+ATIAeg=oCA6!?Zc=jCB$n(QL3YPd*GkAjAa0y-#wQ0qKM#l&V8R>>$3a!%jB*0 zh{90esJy)60dfjl0%EnXk`*i-iTjr-n;67k>4p^KY1+6xU_J{~Jct;M4@~m`&E^Ct zY6Ia3#A>x6fyVb3-yYSu{s1TuL9j;Ht2p(Eeg+T46arF$Q!UHBXkFk+4-ahM)IQ>gxU~BVBw!Z($U_;9RBRbx-=Yl zIL;E9B~>XF_bgz$fX)1Wc9}U3n%{(<--Qcwe%om(q$a=>=fokl_l$|f`4ZZ$QQ{5; z2VAYGsi}*9;rwaP%I9uBhLSLDUtEPa&1>3=I0(-Ya2HYGaQFp`)}ChoaF9EqF3?D& 
zRSp)@tckm^Hid2IgchOFqhaKp2FWFrD&{)2ZXEl#A>uWg(;#*@npz=kAU>=6%#a#V z72K?Va5GasXrP6f%{Y>Y#7dDm_v$?ku-d}#WwZ%aShIu z>AlJZvMA$s9B5nIaRGk*c7NP+e|RIgZeg>LWai+IN6>!G55WG`IsF2-x#56fK|?6r zqfK6ph5IHIYBdL#Gi=GzTgj_nj`8|8T|^Ke?z8&Jw>Vt2%;;{y@!H!q>|!I=r`x7k zA+wY2(fDDogLO^cShS?><6j6tliIy~elqv0L2zh=_>qJxcvH=dgV)MdcRM{_5V-Hk~w*#O4uKvVfGA z6nGLkaZwtki)Ku$I+v<9Zb#Nt%! z%W+Mz6uA&y=x^%kLd9aZe$bBf%27m9uNM$Dy!e$eEmqu`0&AB;$}a9>r9OX>;lqxM zEQv@bW_#{qaxtu=fi)@65HRBM6KDzVU>IONqwkgP@ijGrSXP80Yk6*8IK?6l+cO}* z*Tm?4pPAh_$U@aAODS?#R4|kirxvmWG$%GS%Bp2(!ZYjPm_1-5VF)(Iv}H4Q3A2&i zR_Apooj``s$BzK~)RzwVMPQ3B*N9#_$50T}_!x<%H;z&BRUu=Ds5$?0&9hOdIB#6D z?@B$L@&$j;arWYbKO!T|BjVJI5DOA8ZFere1Mk24fy=}?)fB%XB=InLn}^Bgf7)Eb z7pz!aS#m%1a=99PiNtxw@7xo$iUAa_0V)T(RH62n7YfTm>85M^U5gwW2DL(?rf%lu ze!I<ZXubZyBqCrz+-bs(=-p z=q?nxSwt4KtK%E?I|w`K=F#Tpt*H@eEYi5u^OoGT=hZ7#^_l`#7>t)gD?DvGjF>@b zxsgI-@-|_%8&h)<8?J^Ak-qhYn(tGo)yt7X(!*7L=2gKiRSG*7AXgogS$Sh!ceoBy zs+rKY`?xSn18uZ&Hn~PK_yQ%NJc$eCVj2-pm|A(4 zS2~^>rjg5RZ_xHL4~mM5+wtgZM?7kH3=nH{a&k?Up`oENhFkyvLYWd`GCbzKS~{~P z%1cT{-=nE-Y=ppvW^dDbjuV`BC$PUy6*Jg3LGl>BRG%&~7z^$;v&oY&klcpZ6vhlC zzR-M{e^*~!Fo|J9E%pm2vR*5iT;!0<6JxKKt!w9)GX9A8@7D3+mJa|%c2F7_ z+8f*1AYqQs?J_-UY_B4e-~CR4DYh9PG-2alZ)QU-TJ+t6R1{GZ4p$nRw?bb#Dv$nN z_&SAwLUAA|^I-oMV!!)ATc5kz<>c7k)p0uZGxtwl@ZYyXbBrf7!CBp7U6JVEZWLW| zp|W%`hm9OV=cAI#aU2pX{h%l!^wQqjhJ5s0H91MXAr7JtL`n3zHb{`lEcQ>-Kn@2kQIkM zr#j}MxI?da=ImJ~CDgffvPixT21qH}J_anEm>kIIcoe%{ecd(ZBbOwj$wZvz$8zEX z{dxtkub6Mz({x{wj`dF<>DNlzdx;dPbh7hDoULSM>_#~C8@-_|CK?T!ac*rA21_6*Z3wFz7O@wK}j<3qHj$Vcsnj!3cveUfNf zMz+*j;L2pnzp2dIPtiWD6(patBPS!o#yNWu-VvETzwmJnz4F$~V;N_Igrj5lSCuMZEJj(K3S6x# z8ga3w?yx7=3XD7yJbGPSjD_pDxy><*ob>xbYl&_qa(4j5#Msz8{y2e)sHTO+#H=YX zeXW>%wC1emjvZ+Kk32b95OER2|H5Gujxv^oWcFJUux!o;F|tHuUH|ClTwLHUO1>8f z(`C|Czlu>c_a@x+8De6PAOcrp4@&-Sbw!@BE->@fdO9v!*pj$M)@q*n%x7pbFL86k zmx;0<>V-vHP?4FKi)$enxuf>UYAM@1L^-8rAs>(HF`@WEJ`t&G4wuv6pTz^hdRr5} zh#(7!Xlq+H-B(s^M71$#Yb|urZ}G=Q{&o86^-0fBz={hAX)dKp7>h6!78k#=V6!T5 zW;vsh>hmmdBSyjqETxag`ISzQ8m_6GQ#N5EUHa};j~rFFk1ilGoW<#~vD@eNO6$g^ zCYlo+C-3lgvF2Eujg;y>$$=98%0SC@CzoIjhbikCr1umqbNXBwDtASKjnk zhe*E2Ai2lKbEcvzzrLGejarxb!5;g*VtdQ&o0!h@UG@krhU)nd zs_n7Y4h3BvS15^7EU%z^k z?ZFwZIc-WhOZ9>qA1#lnuWGWj@oJWg?9>`9S)N2)UZ~T}`>vUl-DX0LeV?cOHp+{! zFNlFgo@aGi4L5^yb^I!7JDZHmX4*$`#QZ#d0Y=p}JoV;~D?KPFd9g5SJ@Pk#t)}{?^D2&RoiA>A7z#(ycu-si)=VLcz?=xGkOjFPz zoy%BQG5?uG6h}SfdHS^O+s|LW#_o`D?Us2`^v!C!j&pG3+WvUWB+Qv%26l}TJfgKb z?x&@F06Cw9L-6U=SF1<+WwM&wI>jJv%|M!Pey_szy%>?DhBrg4~iLj z84HE0WZEaQZk3))R7STLa1<7D$$Fvh0GnBOSi$v|kM|;-j4*W=d*r%q3u-U9w;cVo zw9|PR0T=taDk-AB@@Ge`u`!vEJX}IMbL~q&G9 z=EzCgM(kC6c%O`5P9b-{_<2%Uu4i7D&E|&oBOC6m|#Mtsj`7$&5{_`&#~n+8Sdy zf&<~f-{S`4CByV6$NKCXGBCFr#%E<@hBY(>!?0f~(pHv$~MV(;y{ z_ZoBmtXPCeDc+{omg;s<+QLVps%Wv$%$O%wn&@4Bhq@G4fVYoN5fz$Y8k^h9_Op!B zpf-H_GUiq?bpsWu)OL<@QbwG5K|R~5o#`L;Rd@h^yGv8mkTB6WfsXugu%qlDg5et1EyM7NZw}1XuSit4B5M`;GP6`TPvNGfra^Ubto~!kjfTSWk{| z^P2$b`Q^F3wx5iJ+t0K2qX7$8Sr~~xD6>Kyz5!!L zD}M+54eaLN8F?i6_cvc6BFS`PmnA8acxkS+gk_wXS2h&C8WWoc(!w zNHJt4*JP>GWA59K?-cAcP0CEt zDMQCnwYn_LS@*Hdq*|(|roCbAqhB1IOgEk|Aii$6WlT(?U$!rrpZDfhyc_d*8Rd{4 zhrthbQ?H3ti~$ED+@lK*%x{V1QYkhkR!u`e75lTUu4onJRs+-BU1NJNJi#hhz6s%7 zQy~`sXTgyX5HZ8^AscL$x5)rY=(sPhgH}I)jw@e|*8TxH!;FcnLn#%^P;<$9Q3=A0 z5Yh;5_%Yl8^W+bS3;>1BJA5?zHB|0y!)=NGEf|LWSgmvS(kp8l8=K4GyD}5eabi0? 
zJH>R{9@|cHS665LK-n`xa3Z2XZ z%LrY0xT?KTA+?0C9P7$Wk8O&43GV&$N>gt!sXv2^LhMXfU8-7rZE>=pv3e1hdlXu) z>`p4i(5+6b{H`}f0!h*67^}O~7so`ND9joFc~L(5HTuQ~y-ak<2eTJexy0msA|4}H z`?*@F9$B)3j8gg+Y)U2O(fvk4oe8G7F3Oq?(1tO+QE(basL>kEuqj?;Frv#Sk1-Gh*WH#`-3`2H?%Jd4hsrwZT= zEIJ@DgnYUF?x7`?lM@E6lzz0S1TIb+63+n~%}dZ#CiUstPgq-N6PjYCA_GfY!OO6; zk}QSx0l4t7@bjG`s8WccJ7u5*7)z&xVV65H9gr~XA{*=JI-t0*yz=yex0hY1@4BEq zZC8!_0+2jDY@7W<*OPE21^EThzA1sxl`oaB{(IqO`(e5O5DpZ?#15A`Az4}MrC(!8 z&rYp-UyPXD-H!u}jRWfqT)6{6q%VE4q*50-bLt5%OD$yn2y4me|Gx%OOSj( ztqPsd>bFhkE-EyfJWEj%44j3rrct(=15+D7Kh@Nui>o2!@|&4Pg_RlCrYa$Tv2{NW{A#l;GW#FU=V#;_QQW^k7Ov zTZ#%-<(ULAHQZ!YaQ9q$(Pjb-T7Hvj#C3Pgb9CK@mJXhfECi}Pd8g-}qn?PPu5+OX z^RdSteUAe@RkfrQte!TWJUcy}f{BZEFS1PH+l7b0wgEFO8+s{R*{l7k7RT>k`B!oI zEe5)$i^NQ*ud-tHdc;5A{sJZwk#?+{oM!-qk41iK~FY2=j~QR|CJ?du&sMdE}=D3{40JAH9dxfcXKy74sd?5zaobD=5!=sDyNO zOI6x9ftA+X#wIf_FK^GTVHZvRAs*g zS#e{v?iz|zqSkn}w8@ePg_sFuz)fxre^%pCAKP-)svZS35L{kedgT{`nS z;PH=*WZoxNexbL=EIRxeumf+*?p4lA^l!TuiABjMEN=L7DQJ#?<@)FAYeoE8_(e5^ zXl~zj-Qnm_j1&s+&Rv7e8~}*|WG9Rr?`=cVh%8~5e7q_eGyDKdT~e<8;<%2jyZ!*F zRW+IG?*O^|^Qgw+fkHrqWdZ`hJis^NtnoV+#>B*+Xt{a$@@2w^71&Jb(JP4JZoMTJ zb{rU@=c_%Oi(NQFmBs$2eV>t-ZlEkzu@sqyjKJD!PD%AX$J zKR=5I*X_L8osPm9%fG-_l!gI?hr7Ft(J}npZoDsuBEcrTQR>EHeIK1X$Q^>;0kA(? zDkERJb}b}8_>ahJ!;mq7~7_JRTFDJb{@xGxBM19(ftb*oXpQoq1@k2x>U=nqA@u5)bFQYJRwu+)p7NHBp*fC%D*iaaXCWhp#5Y@fz#@|UAQXAXouCS*6`epQ<-k`> z;x1$Owtd?+(A!8y*1hS)RE$=N^HE6&ig1x6gI&9%%nJB9`F8lqIc60i7Nb4=4BY-2 znY}Hor>SW>EUyZi3cs1N27-3+oE>9~+P^c#G3NOJyebqFUp{?0fKP{pZy7$lkSp6z z_yc)rhD)W?k#6Xa#9IoU)iSCt3={5>z#(wCuVEVs#8hZqt1rW7dK{4}Z$&x3b=VKF8e+Z66dXyA@^I3wY{ozT7i%(sIg)$zyfk`^x- z|B{%PgLT5Vg6H;iOQPVQL*G>u^{J?*`K*^$JtsAZWaHbKsw8o#@f1eGVgIMGYmI8U zjKgePP7bl@xFoB?xHM)EEJF^4M;tm%6gN5Hfdhj9AqER3qd|?0v0?&2f*?1mBN$YN zOI&n9AZ)-eP)u7OXc`_sLdIzV0tIN#pFY^f_Hm!~|L=XD_xJvu=l48mt#i{ar&;AN z@?ciJMcyhH=`YYGA-?&hu5JWw-&h*8UI@?$;HQwt-B4KI3^l{WI|AaMKT)3}maIOu z@+P+f4{ZP;f1#7M20$Q0hd|MWhGhKjczKd1k4#T$evo zE9i~-ONBxCxu6R&&HxyOky28SV&afDkyeOc|IF2sO|s}77H${|<*EJi69(Pqyfi=hTJ zo+E1GRK~}}T^|`iH$=)*8xH3spy)qe0uN8>RQAS{9bib~`rY1vU`I9k3aG7QlMc*6 z=_zb$H^`RcB;(@0aOCX+;|547j@*bs2dUcg!%x5^m6erYY982nuSZI6cy#BIi)=ZU zT&Oi%D26dVVa46pgx;#)+R*}_)-<6PCNQwZYt6ocr6Mu1fws`q2FP1?qulak4v(Mb z7+cb2e$0M$&`Av9F5Ji$St2~k{fTF@42j~?t5^}l9iVXu>kOMZO)Nf$28nDL2Ql8c zHbrByZq1>(BdP7*I`pPJbrOd`#{5{(2TmJ=Fg)bg%67J4bro_}DGn9P-59$!!Ds?v zhedlC^=(h8WzQ&7BA9r6gY-pV;XoXIo)9wh>5>`Y>B$}ot(3?R(v74-zc?wrye)q9@Y|B z59j0-{{T6t{Myc82$Hm5dr0Q-4Iygu2)0F%NrU1frCuR%MC}?KfWcq8HJ+tKvIzP$ zp>S>GZYs7I(NE1{4(cOKINg@dO9VB~z6jN+Z7?ChKxNU_-ygAS;ObS0MB=!Fw}psF z{m!U>!>;?PZD}+STwqmaz`i2L9qNF1Tl4s^%BQ$eOS7^-VWe5l@|!UlTv1_Dwkug) zJ)JO%M~+*c$L%!affon(`)fSL +
+ + + *Figure 1: Throughput-latency curve and effective throughput of Llama 2 7b using A6000. The client runs with 60 generation steps and an input prompt length of 2600.*
+ \ No newline at end of file diff --git a/benchmarks/inference/mii/plot_effective_throughput.py b/benchmarks/inference/mii/plot_effective_throughput.py index 357fc7f9e..350c269c3 100644 --- a/benchmarks/inference/mii/plot_effective_throughput.py +++ b/benchmarks/inference/mii/plot_effective_throughput.py @@ -12,21 +12,30 @@ SLA_GEN_TOKENS_PER_SEC = [1, 2, 3, 4, 6, 8] EMA_SPAN = 16 -tp_sizes = { +tp_sizes_all = { "7b": [1], - "70b": [4, 8], + "70b": [4, 8] } -prompt_gen_pairs = [ +tp_sizes_test = { + "7b": [1] +} + +prompt_gen_pairs_all = [ (1200, 60), (1200, 128), (2600, 60), (2600, 128), ] +prompt_gen_pairs_test = [ + (2600, 60) +] def get_args(): parser = argparse.ArgumentParser() + parser.add_argument("--test", action="store_true") + parser.add_argument("--no_vllm", action="store_true") parser.add_argument("--log_dir", type=Path, default=".") parser.add_argument("--out_dir", type=Path, default="charts/goodtput") args = parser.parse_args() @@ -96,7 +105,8 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out print(f"model: {model_size} Prompt: {prompt}, Generation: {gen}, TP: {tp} sla_token_gen: {sla_token_gen}") mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}/llama2-{model_size}-tp{tp}-b{bs}_c*_p{prompt}_g{gen}.json" - vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json" + if not args.no_vllm: + vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json" validate_funcs = [ (validate_token_cum_latency_SLA, (), "cum"), @@ -109,8 +119,9 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out client_num_list = sorted(list(mii_goodputs.keys())) mii_goodputs_list = [mii_goodputs[client_num] for client_num in client_num_list] - vllm_goodputs, vllm_good_ratios = extract_values(vllm_file_pattern, sla_token_gen, f) - vllm_goodputs_list = [vllm_goodputs[client_num] for client_num in client_num_list] + if not args.no_vllm: + vllm_goodputs, vllm_good_ratios = extract_values(vllm_file_pattern, sla_token_gen, f) + vllm_goodputs_list = [vllm_goodputs[client_num] for client_num in client_num_list] # print(f"MII {mii_goodputs_list} ratio={mii_good_ratios}") # print(f"vLLM {vllm_goodputs_list} ratio={vllm_good_ratios}") @@ -118,16 +129,18 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out # Plotting the scatter plot plt.figure(figsize=(7, 4)) plt.scatter(client_num_list, mii_goodputs_list, label=f"DeepSpeed-FastGen", marker="o", color="blue") - plt.scatter(client_num_list, vllm_goodputs_list, label=f"vLLM", marker="x", color="orange") + if not args.no_vllm: + plt.scatter(client_num_list, vllm_goodputs_list, label=f"vLLM", marker="x", color="orange") fit_x_list = np.arange(min(client_num_list), max(client_num_list), 0.1) mii_fit_model = np.polyfit(client_num_list, mii_goodputs_list, 4) mii_model_fn = np.poly1d(mii_fit_model) plt.plot(fit_x_list, mii_model_fn(fit_x_list), color="blue", alpha=0.5, linestyle="--") - vllm_fit_model = np.polyfit(client_num_list, vllm_goodputs_list, 4) - vllm_model_fn = np.poly1d(vllm_fit_model) - plt.plot(fit_x_list, vllm_model_fn(fit_x_list), color="orange", alpha=0.5, linestyle="--") + if not args.no_vllm: + vllm_fit_model = np.polyfit(client_num_list, vllm_goodputs_list, 4) + vllm_model_fn = np.poly1d(vllm_fit_model) + plt.plot(fit_x_list, vllm_model_fn(fit_x_list), color="orange", alpha=0.5, linestyle="--") title = 
f"Effective throughput (SLA prompt: {SLA_PROMPT_TOKENS_PER_SEC} tokens/s, generation: {sla_token_gen} tokens/s)\n" \ + f'Llama 2 {model_size.upper()} Prompt: {prompt}, Generation: {gen}, TP: {tp}' @@ -148,6 +161,13 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out if __name__ == "__main__": args = get_args() + if args.test: + tp_sizes = tp_sizes_test + prompt_gen_pairs = prompt_gen_pairs_test + else: + tp_sizes = tp_sizes_all + prompt_gen_pairs = prompt_gen_pairs_all + for model_size, tps in tp_sizes.items(): for tp in tps: for prompt, gen in prompt_gen_pairs: diff --git a/benchmarks/inference/mii/plot_th_lat.py b/benchmarks/inference/mii/plot_th_lat.py index 8ede6e818..e99dc5a3e 100644 --- a/benchmarks/inference/mii/plot_th_lat.py +++ b/benchmarks/inference/mii/plot_th_lat.py @@ -3,17 +3,25 @@ import argparse from pathlib import Path import numpy as np - +import pdb from postprocess_results import read_json, get_summary bs = 768 -tp_sizes = { +tp_sizes_test = { + "7b": [1] +} + +tp_sizes_all = { "7b": [1], "70b": [4, 8], } -prompt_gen_pairs = [ +prompt_gen_pairs_test = [ + (2600, 60) +] + +prompt_gen_pairs_all = [ (1200, 60), (1200, 128), (2600, 60), @@ -22,7 +30,9 @@ def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--log_dir", type=Path, default="logs.release") + parser.add_argument("--test", action="store_true") + parser.add_argument("--no_vllm", action="store_true") + parser.add_argument("--log_dir", type=Path, default=".") parser.add_argument("--out_dir", type=Path, default="charts/throughput_latency") args = parser.parse_args() return args @@ -56,19 +66,22 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): out_dir.mkdir(parents=True, exist_ok=True) mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}/llama2-{model_size}-tp{tp}-b{bs}_c*_p{prompt}_g{gen}.json" - vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json" + if not args.no_vllm: + vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json" _, mii_throughputs, mii_latencies = extract_values(mii_file_pattern) - _, vllm_throughputs, vllm_latencies = extract_values(vllm_file_pattern) + if not args.no_vllm: + _, vllm_throughputs, vllm_latencies = extract_values(vllm_file_pattern) # Plotting the scatter plot plt.figure(figsize=(6, 4)) - - plt.scatter(vllm_throughputs, vllm_latencies, label=f"vLLM", marker="x", color="orange") - fit_vllm_x_list = np.arange(min(vllm_throughputs), max(vllm_throughputs), 0.01) - vllm_vllm_model = np.polyfit(vllm_throughputs, vllm_latencies, 3) - vllm_model_fn = np.poly1d(vllm_vllm_model) - plt.plot(fit_vllm_x_list, vllm_model_fn(fit_vllm_x_list), color="orange", alpha=0.5, linestyle="--") + + if not args.no_vllm: + plt.scatter(vllm_throughputs, vllm_latencies, label=f"vLLM", marker="x", color="orange") + fit_vllm_x_list = np.arange(min(vllm_throughputs), max(vllm_throughputs), 0.01) + vllm_vllm_model = np.polyfit(vllm_throughputs, vllm_latencies, 3) + vllm_model_fn = np.poly1d(vllm_vllm_model) + plt.plot(fit_vllm_x_list, vllm_model_fn(fit_vllm_x_list), color="orange", alpha=0.5, linestyle="--") plt.scatter(mii_throughputs, mii_latencies, label=f"DeepSpeed FastGen", marker="o", color="blue") fit_mii_x_list = np.arange(min(mii_throughputs), max(mii_throughputs), 0.01) @@ -82,7 +95,6 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): plt.legend() 
     plt.grid(True)
     plt.tight_layout()
-    # plt.show()
     out_file = out_dir / f"th_lat_curve_llama{model_size}_tp{tp}_p{prompt}g{gen}.png"
     print(f"Saving {out_file}")
     plt.savefig(out_file)
@@ -90,7 +102,13 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir):
 
 if __name__ == "__main__":
     args = get_args()
-
+    if args.test:
+        tp_sizes = tp_sizes_test
+        prompt_gen_pairs = prompt_gen_pairs_test
+    else:
+        tp_sizes = tp_sizes_all
+        prompt_gen_pairs = prompt_gen_pairs_all
+
     for model_size, tps in tp_sizes.items():
         for tp in tps:
             for prompt, gen in prompt_gen_pairs:
diff --git a/benchmarks/inference/mii/run_benchmark_client.py b/benchmarks/inference/mii/run_benchmark_client.py
index 77377a93a..52fc0da50 100644
--- a/benchmarks/inference/mii/run_benchmark_client.py
+++ b/benchmarks/inference/mii/run_benchmark_client.py
@@ -80,35 +80,16 @@ def callback(response):
             token_gen_time.append(time_now - time_last_token)
             time_last_token = time_now
 
-    postprocess_config = {
-        "logit_processor": {
-            # "name": "TopP",
-            # "args": {
-            #     "top_p": 0.9
-            # }
-            "name": "Temperature",
-            "args": {
-                "temperature": 0.9
-            }
-        },
-        "sampler": {
-            "name": "Logits"
-        },
-        "stop_criterion": {
-            "name": "EosGeneration"
-        }
-    }
-
     time_last_token = start_time = time.time()
     token_gen_time = []
     if stream:
         output_tokens = []
         client.generate(
-            input_tokens, max_new_tokens=max_new_tokens, postprocess_config=postprocess_config,
+            input_tokens, max_new_tokens=max_new_tokens,
             streaming_fn=callback)
     else:
         result = client.generate(
-            input_tokens, max_new_tokens=max_new_tokens, postprocess_config=postprocess_config)
+            input_tokens, max_new_tokens=max_new_tokens)
         output_tokens = result.response[0]
 
     return ResponseDetails(
diff --git a/benchmarks/inference/mii/run_example.sh b/benchmarks/inference/mii/run_example.sh
new file mode 100644
index 000000000..ece8393ed
--- /dev/null
+++ b/benchmarks/inference/mii/run_example.sh
@@ -0,0 +1,19 @@
+### Run the server
+RAGGED_BATCH_SIZE=768
+PARAM_SIZES=(7b)
+DEPLOYMENT_NAME=llama2-7b-tp1-b768
+python server.py --model_name meta-llama/Llama-2-7b-hf -d llama2-7b-tp1-b768 -m 1 -b 768 start
+
+### This command will run the client with 60 generation steps and input prompt length of 2600
+DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=60 bash ./run_benchmark_client.sh
+
+### Stop the server
+echo "Stopping server"
+python server.py -d ${DEPLOYMENT_NAME} stop
+sleep 120
+
+### Generate the plots
+python plot_th_lat.py --log_dir . --test --no_vllm
+python plot_effective_throughput.py --log_dir . --test --no_vllm
+
+echo "Find the plots in the charts directory and the logs inside logs.llama2-7b-tp1-b768"

From 8c551d2f19ded1c2622f868bdfd1bfda5fa5760d Mon Sep 17 00:00:00 2001
From: Moshe Island
Date: Tue, 21 Nov 2023 22:38:26 +0200
Subject: [PATCH 08/58] deepspeed-chat: filter stage3 too long prompts (#782)

In case stage3 prompts are too long, the prompts are used, but they are arbitrarily sliced at the start to fit into the configured max prompt length. This arbitrary slicing sometimes causes prompts to be less meaningful, which in turn causes the generator to generate garbage. This phenomenon was observed to de-stabilize RLHF stage3. To fix it, we filter prompts that are too long.

In addition, the dataset rebuild flag is propagated to other required consumers.

Note that since generated datasets are cached on disk, this commit will have an effect only if we clean up the step3 cached datasets.
Change-Id: I440f09decf0784e4c2c8167a893006dff312281b Signed-off-by: Moshe Island Co-authored-by: Moshe Island --- .../dschat/utils/data/data_utils.py | 77 +++++++++++++------ 1 file changed, 53 insertions(+), 24 deletions(-) diff --git a/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py b/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py index 744ad7927..0b9d6df64 100644 --- a/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py +++ b/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py @@ -92,12 +92,19 @@ def get_shuffle_idx(seed, size): return shuffle_idx -def get_raw_dataset_split_index(local_rank, output_path, dataset_name, seed, - split_name, data_split, split_index, - data_size): +def get_raw_dataset_split_index(local_rank, + output_path, + dataset_name, + seed, + split_name, + data_split, + split_index, + data_size, + rebuild=False): index_file_name = f"{output_path}/{dataset_name}_seed{seed}_{split_name}_{data_split}_{split_index}.npy" # reindex each time when using local jsonfile since it's more likely to get modified - if (not os.path.isfile(index_file_name)) or (dataset_name == 'jsonfile'): + if rebuild or (not os.path.isfile(index_file_name)) or (dataset_name + == 'jsonfile'): splits = [float(s) for s in data_split.split(',')] splits_sum = sum(splits) splits = [split / splits_sum for split in splits] @@ -176,6 +183,9 @@ def create_dataset_split(current_dataset, raw_dataset, train_phase, tokenizer, chosen_token["attention_mask"] = chosen_token[ "attention_mask"].squeeze(0) chosen_dataset.append(chosen_token) + print( + f'Creating dataset {raw_dataset.dataset_name_clean} for {train_phase=} size={len(chosen_dataset)}' + ) elif train_phase == 2: for i, tmp_data in enumerate(current_dataset): @@ -204,39 +214,41 @@ def create_dataset_split(current_dataset, raw_dataset, train_phase, tokenizer, reject_token["input_ids"] = reject_token["input_ids"] reject_token["attention_mask"] = reject_token["attention_mask"] reject_dataset.append(reject_token) + print( + f'Creating dataset {raw_dataset.dataset_name_clean} for {train_phase=} size={len(chosen_dataset)}' + ) elif train_phase == 3: + filtered = 0 for i, tmp_data in enumerate(current_dataset): # tokenize the text prompt = raw_dataset.get_prompt(tmp_data) if prompt is not None: prompt_token = tokenizer(prompt, return_tensors="pt") - prompt_token["input_ids"] = prompt_token["input_ids"] - prompt_token["attention_mask"] = prompt_token["attention_mask"] - for key_word in ["input_ids", "attention_mask"]: - length = prompt_token[key_word].size()[-1] - if length > max_seq_len: - y = prompt_token[key_word].squeeze(0)[length - - (max_seq_len - - 1):].flip(0) - else: - y = prompt_token[key_word].squeeze(0).flip(0) - prompt_token[key_word] = y - prompt_dataset.append(prompt_token) + if prompt_token["input_ids"].size()[-1] <= max_seq_len: + for key_word in ["input_ids", "attention_mask"]: + prompt_token[key_word] = prompt_token[ + key_word].squeeze(0).flip(0) + prompt_dataset.append(prompt_token) + else: + filtered += 1 + print(f'Creating dataset {raw_dataset.dataset_name_clean} ' + f'for {train_phase=} size={len(prompt_dataset)} {filtered=}') + return PromptDataset(prompt_dataset, chosen_dataset, reject_dataset, tokenizer.pad_token_id, train_phase) def create_dataset(local_rank, dataset_name, data_split, output_path, train_phase, seed, tokenizer, end_of_conversation_token, - max_seq_len): + max_seq_len, rebuild): raw_dataset = get_raw_dataset(dataset_name, output_path, seed, local_rank) train_dataset = 
raw_dataset.get_train_data() train_index = get_raw_dataset_split_index(local_rank, output_path, raw_dataset.dataset_name_clean, seed, "train", data_split, train_phase - 1, - len(train_dataset)) + len(train_dataset), rebuild) train_dataset = Subset(train_dataset, train_index) train_dataset = create_dataset_split(train_dataset, raw_dataset, train_phase, tokenizer, @@ -248,7 +260,7 @@ def create_dataset(local_rank, dataset_name, data_split, output_path, raw_dataset.dataset_name_clean, seed, "eval", data_split, train_phase - 1, - len(eval_dataset)) + len(eval_dataset), rebuild) eval_dataset = Subset(eval_dataset, eval_index) eval_dataset = create_dataset_split(eval_dataset, raw_dataset, train_phase, tokenizer, end_of_conversation_token, @@ -287,10 +299,19 @@ def create_prompt_dataset(local_rank, torch.distributed.all_reduce(buf_create_cache) if local_rank <= 0 and (buf_create_cache.item() != 0 or reload): + print(f'Creating prompt dataset {data_path}, {reload=}') if len(data_path) == 1: # Single dataset. train_dataset, eval_dataset = create_dataset( - local_rank, data_path[0], data_split, output_path, train_phase, - seed, tokenizer, end_of_conversation_token, max_seq_len) + local_rank, + data_path[0], + data_split, + output_path, + train_phase, + seed, + tokenizer, + end_of_conversation_token, + max_seq_len, + rebuild=reload) else: # Blending datasets. train_datasets = [] eval_datasets = [] @@ -298,8 +319,16 @@ def create_prompt_dataset(local_rank, eval_size = 0 for d_path in data_path: train_dataset, eval_dataset = create_dataset( - local_rank, d_path, data_split, output_path, train_phase, - seed, tokenizer, end_of_conversation_token, max_seq_len) + local_rank, + d_path, + data_split, + output_path, + train_phase, + seed, + tokenizer, + end_of_conversation_token, + max_seq_len, + rebuild=reload) train_datasets.append(train_dataset) eval_datasets.append(eval_dataset) train_size += len(train_dataset) @@ -328,7 +357,7 @@ def create_prompt_dataset(local_rank, tokenizer, end_of_conversation_token, max_seq_len, - ) + rebuild=reload) sft_train_datasets.append(sft_train_dataset) sft_eval_datasets.append(sft_eval_dataset) sft_train_size += len(sft_train_dataset) From b116838b905430a5fbebe3713a68d90638478aa9 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Tue, 21 Nov 2023 15:16:38 -0800 Subject: [PATCH 09/58] update MII benchmark to reflect changes in output type (#812) --- benchmarks/inference/mii/run_benchmark_client.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/inference/mii/run_benchmark_client.py b/benchmarks/inference/mii/run_benchmark_client.py index 52fc0da50..caf20351e 100644 --- a/benchmarks/inference/mii/run_benchmark_client.py +++ b/benchmarks/inference/mii/run_benchmark_client.py @@ -74,8 +74,8 @@ def call_mii(client, input_tokens, max_new_tokens, stream): def callback(response): nonlocal time_last_token - # print(f"Received: {response.response} time_last_token={time_last_token}") - output_tokens.append(response.response[0]) + # print(f"Received: {response[0].generated_text} time_last_token={time_last_token}") + output_tokens.append(response[0].generated_text) time_now = time.time() token_gen_time.append(time_now - time_last_token) time_last_token = time_now @@ -90,7 +90,7 @@ def callback(response): else: result = client.generate( input_tokens, max_new_tokens=max_new_tokens) - output_tokens = result.response[0] + output_tokens = result[0].generated_text return ResponseDetails( generated_tokens=output_tokens, From 
0e10c4b0fc43ceb32b00bcfac2b1ae6bb8b781cd Mon Sep 17 00:00:00 2001 From: Pareesa Ameneh Golnari <120066333+PareesaMS@users.noreply.github.com> Date: Mon, 4 Dec 2023 14:15:08 -0800 Subject: [PATCH 10/58] Adding LoRA-Distillation SD training example (#788) Co-authored-by: Xiaoxia (Shirley) Wu <94406484+xiaoxiawu-microsoft@users.noreply.github.com> --- training/stable_diffusion/README.md | 44 + training/stable_diffusion/inf_txt2img_loop.py | 56 + .../local_pipeline_stable_diffusion.py | 705 +++++++++ training/stable_diffusion/mytrainbash.sh | 21 + training/stable_diffusion/requirements.txt | 6 + .../stable_diffusion/train_sd_distil_lora.py | 1288 +++++++++++++++++ 6 files changed, 2120 insertions(+) create mode 100644 training/stable_diffusion/README.md create mode 100644 training/stable_diffusion/inf_txt2img_loop.py create mode 100644 training/stable_diffusion/local_pipeline_stable_diffusion.py create mode 100644 training/stable_diffusion/mytrainbash.sh create mode 100644 training/stable_diffusion/requirements.txt create mode 100644 training/stable_diffusion/train_sd_distil_lora.py diff --git a/training/stable_diffusion/README.md b/training/stable_diffusion/README.md new file mode 100644 index 000000000..bdadff29b --- /dev/null +++ b/training/stable_diffusion/README.md @@ -0,0 +1,44 @@ +# Lora-enhanced distillation on Stable Diffusion model + +This repository contains the implementation of Lora-enhanced distillation applied to the Stable Diffusion (SD) model. By combining the LoRA technique with distillation, we've achieved remarkable results, including a significant reduction in inference time and a 50% decrease in memory consumption. Importantly, this integration of LoRA-enhanced distillation maintains image quality and alignment with the provided prompt. For additional details on this work, please consult our technical report [TODO: add link]. + +In this implementation, we have adapted the dreambooth finetuning [code](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth#dreambooth-training-example) as our baseline. Below, you'll find information regarding input data, training, and inference. + +## Installation + +You need to have huggingface [diffusers](https://github.com/huggingface/diffusers) installed on your machine. Then install the requirements: + +

+```bash
+pip install -r requirements.txt
+```
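+
+If `diffusers` itself is not already installed on your machine, it is typically available from PyPI, for example:
+
+```bash
+pip install diffusers
+```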
+
+## Training
+
+### Training Data
+Our training data includes a significant dataset of pre-generated images by [SD](https://github.com/poloclub/diffusiondb). You are not required to download the input data. Instead, you can specify or modify it within the training code (`train_sd_distil_lora.py`) as needed. To train the model, follow these steps:
+
+### Training Script
+
+1. Run the `mytrainbash.sh` file.
+2. The finetuned model will be saved inside the output directory.
+
+Here's an example command to run the training script:
+
+```bash
+bash mytrainbash.sh
+```
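+
+For example, to change where checkpoints are written or which base model is distilled, one option is to edit the exported variables at the top of `mytrainbash.sh` before launching (the values below are placeholders, not recommendations):
+
+```bash
+export MODEL_NAME="stabilityai/stable-diffusion-2-1-base"
+export OUTPUT_DIR="./sd-distill-v21-test"
+```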
+
+Make sure to customize the training parameters in the script to suit your specific requirements.
+
+## Inference
+
+For inference, you can use the `inf_txt2img_loop.py` Python script. Follow these steps:
+
+1. Provide your desired prompts as input in the script.
+2. Run the `inf_txt2img_loop.py` script.
+
+Here's an example command to run the inference script:
+
+```bash
+deepspeed inf_txt2img_loop.py
+```
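+
+The script also exposes a few command-line flags (defined in `inf_txt2img_loop.py`) for pointing it at a different fine-tuned checkpoint, baseline model, or output folder; the paths below are only placeholders:
+
+```bash
+deepspeed inf_txt2img_loop.py \
+    --ft_model ./sd-distill-v21 \
+    --b_model stabilityai/stable-diffusion-2-1-base \
+    --out_dir image_out/ \
+    --guidance_scale 7.5
+```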
\ No newline at end of file diff --git a/training/stable_diffusion/inf_txt2img_loop.py b/training/stable_diffusion/inf_txt2img_loop.py new file mode 100644 index 000000000..20482bff4 --- /dev/null +++ b/training/stable_diffusion/inf_txt2img_loop.py @@ -0,0 +1,56 @@ +import deepspeed +import torch +import os +from local_pipeline_stable_diffusion import StableDiffusionPipeline +from diffusers import StableDiffusionPipeline as StableDiffusionPipelineBaseline +import argparse + +seed = 123450011 +parser = argparse.ArgumentParser() +parser.add_argument("--ft_model", default="new_sd-distill-v21-10k-1e", type=str, help="Path to the fine-tuned model") +parser.add_argument("--b_model", default="stabilityai/stable-diffusion-2-1-base", type=str, help="Path to the baseline model") +parser.add_argument("--out_dir", default="image_out/", type=str, help="Path to the generated images") +parser.add_argument('--guidance_scale', type=float, default=7.5, help='Guidance Scale') +parser.add_argument("--use_local_pipe", action='store_true', help="Use local SD pipeline") +parser.add_argument("--local_rank", type=int, default=int(os.getenv("LOCAL_RANK", "0")), help="local rank") +args = parser.parse_args() + + +local_rank = int(os.getenv("LOCAL_RANK", "0")) +device = torch.device(f"cuda:{local_rank}") +world_size = int(os.getenv('WORLD_SIZE', '1')) + + +if not os.path.exists(args.out_dir): + os.makedirs(args.out_dir) + print(f"Directory '{args.out_dir}' has been created to store the generated images.") +else: + print(f"Directory '{args.out_dir}' already exists and stores the generated images.") + + +prompts = ["A boy is watching TV", + "A photo of a person dancing in the rain", + "A photo of a boy jumping over a fence", + "A photo of a boy is kicking a ball", + "A beach with a lot of waves on it", + "A road that is going down a hill", + "3d rendering of 5 tennis balls on top of a cake", + "A person holding a drink of soda", + "A person is squeezing a lemon", + "A person holding a cat"] + + +for prompt in prompts: + #--- new image + pipe_new = StableDiffusionPipeline.from_pretrained(args.ft_model, torch_dtype=torch.float16).to("cuda") + generator = torch.Generator("cuda").manual_seed(seed) + pipe_new = deepspeed.init_inference(pipe_new, mp_size=world_size, dtype=torch.half) + image_new = pipe_new(prompt, num_inference_steps=50, guidance_scale=args.guidance_scale, generator=generator).images[0] + image_new.save(args.out_dir+"/NEW__seed_"+str(seed)+"_"+prompt[0:100]+".png") + + #--- baseline image + pipe_baseline = StableDiffusionPipelineBaseline.from_pretrained(args.b_model, torch_dtype=torch.float16).to("cuda") + generator = torch.Generator("cuda").manual_seed(seed) + pipe_baseline = deepspeed.init_inference(pipe_baseline, mp_size=world_size, dtype=torch.half) + image_baseline = pipe_baseline(prompt, num_inference_steps=50, guidance_scale=args.guidance_scale, generator=generator).images[0] + image_baseline.save(args.out_dir+"/BASELINE_seed_"+str(seed)+"_"+prompt[0:100]+".png") diff --git a/training/stable_diffusion/local_pipeline_stable_diffusion.py b/training/stable_diffusion/local_pipeline_stable_diffusion.py new file mode 100644 index 000000000..64abf7d7f --- /dev/null +++ b/training/stable_diffusion/local_pipeline_stable_diffusion.py @@ -0,0 +1,705 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Union +import torch +from packaging import version +from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer + +from diffusers.configuration_utils import FrozenDict + +from diffusers.models import AutoencoderKL, UNet2DConditionModel + +from diffusers.schedulers import KarrasDiffusionSchedulers + +from diffusers.utils import ( + deprecate, + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) + +from diffusers.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput +from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableDiffusionPipeline + + >>> pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16) + >>> pipe = pipe.to("cuda") + + >>> prompt = "a photo of an astronaut riding a horse on mars" + >>> image = pipe(prompt).images[0] + ``` +""" + + +class StableDiffusionPipeline(DiffusionPipeline): + r""" + Pipeline for text-to-image generation using Stable Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. + feature_extractor ([`CLIPFeatureExtractor`]): + Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
+ """ + _optional_components = ["safety_checker", "feature_extractor"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: bool = True, + ): + super().__init__() + + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." + " `clip_sample` should be set to False in the configuration file. Please make sure to update the" + " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" + " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" + " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" + ) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["clip_sample"] = False + scheduler._internal_dict = FrozenDict(new_config) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( + version.parse(unet.config._diffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 + if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: + deprecation_message = ( + "The configuration file of the unet has set the default `sample_size` to smaller than" + " 64 which seems highly unlikely. 
If your checkpoint is a fine-tuned version of any of the" + " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" + " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" + " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" + " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" + " in the config might lead to incorrect results in future versions. If you have downloaded this" + " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(unet.config) + new_config["sample_size"] = 64 + unet._internal_dict = FrozenDict(new_config) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. + + When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several + steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. + + When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in + several steps. This is useful to save a large amount of memory and to allow the processing of larger images. + """ + self.vae.enable_tiling() + + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to + computing decoding in one step. + """ + self.vae.disable_tiling() + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, + text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. + Note that offloading happens on a submodule basis. Memory savings are higher than with + `enable_model_cpu_offload`, but performance is lower. 
+ """ + if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): + from accelerate import cpu_offload + else: + raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). 
+ prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + """ + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + else: + has_nsfw_concept = None + return image, has_nsfw_concept + + def decode_latents(self, latents): + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents).sample + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." 
+ ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. 
More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). 
+ + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds = self._encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + num_channels_latents = self.unet.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + _ , prompt_embeds_text = prompt_embeds.chunk(2) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # perform guidance + if do_classifier_free_guidance: + noise_pred = self.unet(latents, t, encoder_hidden_states=prompt_embeds_text).sample + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + if output_type == "latent": + image = latents + has_nsfw_concept = None + elif output_type == "pil": + # 8. Post-processing + image = self.decode_latents(latents) + + # 9. Run safety checker + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 10. Convert to PIL + image = self.numpy_to_pil(image) + else: + # 8. Post-processing + image = self.decode_latents(latents) + + # 9. 
Run safety checker + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/training/stable_diffusion/mytrainbash.sh b/training/stable_diffusion/mytrainbash.sh new file mode 100644 index 000000000..fcd2f8508 --- /dev/null +++ b/training/stable_diffusion/mytrainbash.sh @@ -0,0 +1,21 @@ +export MODEL_NAME="stabilityai/stable-diffusion-2-1-base" +export OUTPUT_DIR="./sd-distill-v21" + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir "$OUTPUT_DIR" + echo "Folder '$OUTPUT_DIR' created" +else + echo "Folder '$OUTPUT_DIR' already exists" +fi + + +accelerate launch train_sd_distil_lora.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --output_dir=$OUTPUT_DIR \ + --default_prompt="A man dancing" \ + --resolution=512 \ + --train_batch_size=1 \ + --gradient_accumulation_steps=1 \ + --learning_rate=5e-6 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 diff --git a/training/stable_diffusion/requirements.txt b/training/stable_diffusion/requirements.txt new file mode 100644 index 000000000..7a612982f --- /dev/null +++ b/training/stable_diffusion/requirements.txt @@ -0,0 +1,6 @@ +accelerate>=0.16.0 +torchvision +transformers>=4.25.1 +ftfy +tensorboard +Jinja2 diff --git a/training/stable_diffusion/train_sd_distil_lora.py b/training/stable_diffusion/train_sd_distil_lora.py new file mode 100644 index 000000000..012cb0e0f --- /dev/null +++ b/training/stable_diffusion/train_sd_distil_lora.py @@ -0,0 +1,1288 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +import argparse +import gc +import hashlib +import itertools +import logging +import math +import os +import warnings +from pathlib import Path + +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +import transformers +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.utils import ProjectConfiguration, set_seed +from huggingface_hub import create_repo, model_info, upload_folder +from packaging import version +from PIL import Image +from torch.utils.data import Dataset +from torchvision import transforms +from tqdm.auto import tqdm +from transformers import AutoTokenizer, PretrainedConfig + +import diffusers +from diffusers import ( + AutoencoderKL, + DDPMScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + UNet2DConditionModel, +) +from diffusers.optimization import get_scheduler +from diffusers.utils import check_min_version, is_wandb_available +from diffusers.utils.import_utils import is_xformers_available + + +if is_wandb_available(): + import wandb + +# Will error if the minimal version of diffusers is not installed. 
Remove at your own risks. +check_min_version("0.17.0.dev0") + +logger = get_logger(__name__) + + +def save_model_card(repo_id: str, images=None, base_model=str, train_text_encoder=False, prompt=str, repo_folder=None): + img_str = "" + for i, image in enumerate(images): + image.save(os.path.join(repo_folder, f"image_{i}.png")) + img_str += f"![img_{i}](./image_{i}.png)\n" + + yaml = f""" +--- +license: creativeml-openrail-m +base_model: {base_model} +instance_prompt: {prompt} +tags: +- stable-diffusion +- stable-diffusion-diffusers +- text-to-image +- diffusers +- dreambooth +inference: true +--- + """ + model_card = f""" +# DreamBooth - {repo_id} + +This is a dreambooth model derived from {base_model}. The weights were trained on {prompt} using [DreamBooth](https://dreambooth.github.io/). +You can find some example images in the following. \n +{img_str} + +DreamBooth for the text encoder was enabled: {train_text_encoder}. +""" + with open(os.path.join(repo_folder, "README.md"), "w") as f: + f.write(yaml + model_card) + + +def log_validation( + text_encoder, tokenizer, unet, vae, args, accelerator, weight_dtype, epoch, prompt_embeds, negative_prompt_embeds +): + logger.info( + f"Running validation... \n Generating {args.num_validation_images} images with prompt:" + f" {args.validation_prompt}." + ) + + pipeline_args = {} + + if text_encoder is not None: + pipeline_args["text_encoder"] = accelerator.unwrap_model(text_encoder) + + if vae is not None: + pipeline_args["vae"] = vae + + # create pipeline (note: unet and vae are loaded again in float32) + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + tokenizer=tokenizer, + unet=accelerator.unwrap_model(unet), + revision=args.revision, + torch_dtype=weight_dtype, + **pipeline_args, + ) + + # We train on the simplified learning objective. 
If we were previously predicting a variance, we need the scheduler to ignore it + scheduler_args = {} + + if "variance_type" in pipeline.scheduler.config: + variance_type = pipeline.scheduler.config.variance_type + + if variance_type in ["learned", "learned_range"]: + variance_type = "fixed_small" + + scheduler_args["variance_type"] = variance_type + + pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args) + pipeline = pipeline.to(accelerator.device) + pipeline.set_progress_bar_config(disable=True) + + if args.pre_compute_text_embeddings: + pipeline_args = { + "prompt_embeds": prompt_embeds, + "negative_prompt_embeds": negative_prompt_embeds, + } + else: + pipeline_args = {"prompt": args.validation_prompt} + + # run inference + generator = None if args.seed is None else torch.Generator(device=accelerator.device).manual_seed(args.seed) + images = [] + for _ in range(args.num_validation_images): + with torch.autocast("cuda"): + image = pipeline(**pipeline_args, num_inference_steps=25, generator=generator).images[0] + images.append(image) + + for tracker in accelerator.trackers: + if tracker.name == "tensorboard": + np_images = np.stack([np.asarray(img) for img in images]) + tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC") + if tracker.name == "wandb": + tracker.log( + { + "validation": [ + wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images) + ] + } + ) + + del pipeline + torch.cuda.empty_cache() + + return images + + +def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str): + text_encoder_config = PretrainedConfig.from_pretrained( + pretrained_model_name_or_path, + subfolder="text_encoder", + revision=revision, + ) + model_class = text_encoder_config.architectures[0] + + if model_class == "CLIPTextModel": + from transformers import CLIPTextModel + + return CLIPTextModel + elif model_class == "RobertaSeriesModelWithTransformation": + from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation + + return RobertaSeriesModelWithTransformation + elif model_class == "T5EncoderModel": + from transformers import T5EncoderModel + + return T5EncoderModel + else: + raise ValueError(f"{model_class} is not supported.") + + +def parse_args(input_args=None): + parser = argparse.ArgumentParser(description="Simple example of a training script.") + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default=None, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--revision", + type=str, + default=None, + required=False, + help=( + "Revision of pretrained model identifier from huggingface.co/models. Trainable model components should be" + " float32 precision." 
+ ), + ) + parser.add_argument( + "--tokenizer_name", + type=str, + default=None, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--class_data_dir", + type=str, + default=None, + required=False, + help="A folder containing the training data of class images.", + ) + parser.add_argument( + "--default_prompt", + type=str, + default=None, + required=True, + help="A default prompt to be used if pre-computed prompt version is selected", + ) + parser.add_argument( + "--class_prompt", + type=str, + default=None, + help="The prompt to specify images in the same class as provided instance images.", + ) + parser.add_argument( + "--with_prior_preservation", + default=False, + action="store_true", + help="Flag to add prior preservation loss.", + ) + parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.") + parser.add_argument( + "--num_class_images", + type=int, + default=100, + help=( + "Minimal class images for prior preservation loss. If there are not enough images already present in" + " class_data_dir, additional images will be sampled with class_prompt." + ), + ) + parser.add_argument( + "--output_dir", + type=str, + default="text-inversion-model", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--resolution", + type=int, + default=512, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument( + "--center_crop", + default=False, + action="store_true", + help=( + "Whether to center crop the input images to the resolution. If not set, the images will be randomly" + " cropped. The images will be resized to the resolution first before cropping." + ), + ) + parser.add_argument( + "--train_text_encoder", + action="store_true", + help="Whether to train the text encoder. If set, the text encoder should be float32 precision.", + ) + parser.add_argument( + "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." + ) + parser.add_argument( + "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images." + ) + parser.add_argument("--num_train_epochs", type=int, default=1) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--checkpointing_steps", + type=int, + default=500, + help=( + "Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. " + "In the case that the checkpoint is better than the final trained model, the checkpoint can also be used for inference." + "Using a checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components." + "See https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step by step" + "instructions." + ), + ) + parser.add_argument( + "--checkpoints_total_limit", + type=int, + default=None, + help=( + "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." 
+ " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" + " for more details" + ), + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help=( + "Whether training should be resumed from a previous checkpoint. Use a path saved by" + ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' + ), + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--gradient_checkpointing", + action="store_true", + help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-6, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument( + "--lr_num_cycles", + type=int, + default=1, + help="Number of hard resets of the lr in cosine_with_restarts scheduler.", + ) + parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.") + parser.add_argument( + "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." + ) + parser.add_argument( + "--dataloader_num_workers", + type=int, + default=0, + help=( + "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." + ), + ) + parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") + parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") + parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--hub_model_id", + type=str, + default=None, + help="The name of the repository to keep in sync with the local `output_dir`.", + ) + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help=( + "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." + ), + ) + parser.add_argument( + "--allow_tf32", + action="store_true", + help=( + "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. 
For more information, see" + " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" + ), + ) + parser.add_argument( + "--report_to", + type=str, + default="tensorboard", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' + ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' + ), + ) + parser.add_argument( + "--validation_prompt", + type=str, + default=None, + help="A prompt that is used during validation to verify that the model is learning.", + ) + parser.add_argument( + "--num_validation_images", + type=int, + default=4, + help="Number of images that should be generated during validation with `validation_prompt`.", + ) + parser.add_argument( + "--validation_steps", + type=int, + default=100, + help=( + "Run validation every X steps. Validation consists of running the prompt" + " `args.validation_prompt` multiple times: `args.num_validation_images`" + " and logging the images." + ), + ) + parser.add_argument( + "--mixed_precision", + type=str, + default=None, + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the" + " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." + ), + ) + parser.add_argument( + "--prior_generation_precision", + type=str, + default=None, + choices=["no", "fp32", "fp16", "bf16"], + help=( + "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU. Default to fp16 if a GPU is available else fp32." + ), + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument( + "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." + ) + parser.add_argument( + "--set_grads_to_none", + action="store_true", + help=( + "Save more memory by using setting grads to None instead of zero. Be aware, that this changes certain" + " behaviors, so disable this argument if it causes any problems. More info:" + " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html" + ), + ) + + parser.add_argument( + "--offset_noise", + action="store_true", + default=False, + help=( + "Fine-tuning against a modified noise" + " See: https://www.crosslabs.org//blog/diffusion-with-offset-noise for more information." + ), + ) + parser.add_argument( + "--pre_compute_text_embeddings", + action="store_true", + help="Whether or not to pre-compute text embeddings. If text embeddings are pre-computed, the text encoder will not be kept in memory during training and will leave more GPU memory available for training the rest of the model. This is not compatible with `--train_text_encoder`.", + ) + parser.add_argument( + "--tokenizer_max_length", + type=int, + default=None, + required=False, + help="The maximum length of the tokenizer. 
If not set, will default to the tokenizer's max length.", + ) + parser.add_argument( + "--text_encoder_use_attention_mask", + action="store_true", + required=False, + help="Whether to use attention mask for the text encoder", + ) + parser.add_argument( + "--skip_save_text_encoder", action="store_true", required=False, help="Set to not save text encoder" + ) + + if input_args is not None: + args = parser.parse_args(input_args) + else: + args = parser.parse_args() + + env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + if env_local_rank != -1 and env_local_rank != args.local_rank: + args.local_rank = env_local_rank + + if args.with_prior_preservation: + if args.class_data_dir is None: + raise ValueError("You must specify a data directory for class images.") + if args.class_prompt is None: + raise ValueError("You must specify prompt for class images.") + else: + # logger is not available yet + if args.class_data_dir is not None: + warnings.warn("You need not use --class_data_dir without --with_prior_preservation.") + if args.class_prompt is not None: + warnings.warn("You need not use --class_prompt without --with_prior_preservation.") + + if args.train_text_encoder and args.pre_compute_text_embeddings: + raise ValueError("`--train_text_encoder` cannot be used with `--pre_compute_text_embeddings`") + + return args + + +class DreamBoothDataset(Dataset): + """ + A dataset to prepare the instance and class images with the prompts for fine-tuning the model. + It pre-processes the images and the tokenizes prompts. + """ + + def __init__( + self, + instance_prompts, + instance_images, + tokenizer, + class_data_root=None, + class_prompt=None, + class_num=None, + size=512, + center_crop=False, + encoder_hidden_states=None, + instance_prompt_encoder_hidden_states=None, + tokenizer_max_length=None, + ): + self.size = size + self.center_crop = center_crop + self.tokenizer = tokenizer + self.encoder_hidden_states = encoder_hidden_states + self.instance_prompt_encoder_hidden_states = instance_prompt_encoder_hidden_states + self.tokenizer_max_length = tokenizer_max_length + self.num_instance_images = len(instance_prompts) + self.instance_images = instance_images + self.instance_prompts = instance_prompts + self._length = self.num_instance_images + self.image_transforms = transforms.Compose( + [ + transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) + + def __len__(self): + return self._length + + def __getitem__(self, index): + example = {} + instance_image = self.instance_images[index % self.num_instance_images] + uncond_tokens = [""] * args.train_batch_size + + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + example["instance_images"] = self.image_transforms(instance_image) + + text_inputs = tokenize_prompt( + self.tokenizer, self.instance_prompts[index % self.num_instance_images], tokenizer_max_length=self.tokenizer_max_length + ) + example["instance_prompt_ids"] = text_inputs.input_ids + example["instance_attention_mask"] = text_inputs.attention_mask + + # Compute the unconditional prompt + uncond_inputs = tokenize_prompt( + self.tokenizer, uncond_tokens, tokenizer_max_length=self.tokenizer_max_length + ) + example["uncond_prompt_ids"] = uncond_inputs.input_ids + example["uncond_attention_mask"] = uncond_inputs.attention_mask + return example + + +def collate_fn(examples, 
with_prior_preservation=False): + has_attention_mask = "instance_attention_mask" in examples[0] + + input_ids = [example["instance_prompt_ids"] for example in examples] + uncond_ids = [example["uncond_prompt_ids"] for example in examples] + pixel_values = [example["instance_images"] for example in examples] + + if has_attention_mask: + attention_mask = [example["instance_attention_mask"] for example in examples] + uncond_attention_mask = [example["uncond_attention_mask"] for example in examples] + + # Concat class and instance examples for prior preservation. + # We do this to avoid doing two forward passes. + if with_prior_preservation: + input_ids += [example["class_prompt_ids"] for example in examples] + uncond_ids += [example["uncond_prompt_ids"] for example in examples] + pixel_values += [example["class_images"] for example in examples] + + if has_attention_mask: + attention_mask += [example["class_attention_mask"] for example in examples] + uncond_attention_mask += [example["uncond_attention_mask"] for example in examples] + + pixel_values = torch.stack(pixel_values) + pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() + + input_ids = torch.cat(input_ids, dim=0) + uncond_ids = torch.cat(uncond_ids, dim=0) + + batch = { + "input_ids": input_ids, + "uncond_ids": uncond_ids, + "pixel_values": pixel_values, + } + + if has_attention_mask: + batch["attention_mask"] = attention_mask + batch["uncond_attention_mask"] = uncond_attention_mask + + return batch + + +class PromptDataset(Dataset): + "A simple dataset to prepare the prompts to generate class images on multiple GPUs." + + def __init__(self, prompt, num_samples): + self.prompt = prompt + self.num_samples = num_samples + + def __len__(self): + return self.num_samples + + def __getitem__(self, index): + example = {} + example["prompt"] = self.prompt + example["index"] = index + return example + + +def model_has_vae(args): + config_file_name = os.path.join("vae", AutoencoderKL.config_name) + if os.path.isdir(args.pretrained_model_name_or_path): + config_file_name = os.path.join(args.pretrained_model_name_or_path, config_file_name) + return os.path.isfile(config_file_name) + else: + files_in_repo = model_info(args.pretrained_model_name_or_path, revision=args.revision).siblings + return any(file.rfilename == config_file_name for file in files_in_repo) + + +def tokenize_prompt(tokenizer, prompt, tokenizer_max_length=None): + if tokenizer_max_length is not None: + max_length = tokenizer_max_length + else: + max_length = tokenizer.model_max_length + + text_inputs = tokenizer( + prompt, + truncation=True, + padding="max_length", + max_length=max_length, + return_tensors="pt", + ) + + return text_inputs + + +def encode_prompt(text_encoder, input_ids, attention_mask, text_encoder_use_attention_mask=None): + text_input_ids = input_ids.to(text_encoder.device) + + if text_encoder_use_attention_mask: + attention_mask = attention_mask.to(text_encoder.device) + else: + attention_mask = None + + prompt_embeds = text_encoder( + text_input_ids, + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + return prompt_embeds + + +def main(args): + logging_dir = Path(args.output_dir, args.logging_dir) + + accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit) + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, + log_with=args.report_to, + project_dir=logging_dir, + 
project_config=accelerator_project_config, + ) + + if args.report_to == "wandb": + if not is_wandb_available(): + raise ImportError("Make sure to install wandb if you want to use it for logging during training.") + + # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate + # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models. + # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate. + if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1: + raise ValueError( + "Gradient accumulation is not supported when training the text encoder in distributed training. " + "Please set gradient_accumulation_steps to 1. This feature will be supported in the future." + ) + + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + if accelerator.is_local_main_process: + transformers.utils.logging.set_verbosity_warning() + diffusers.utils.logging.set_verbosity_info() + else: + transformers.utils.logging.set_verbosity_error() + diffusers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. + if args.seed is not None: + set_seed(args.seed) + + # Generate class images if prior preservation is enabled. + if args.with_prior_preservation: + class_images_dir = Path(args.class_data_dir) + if not class_images_dir.exists(): + class_images_dir.mkdir(parents=True) + cur_class_images = len(list(class_images_dir.iterdir())) + + if cur_class_images < args.num_class_images: + torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32 + if args.prior_generation_precision == "fp32": + torch_dtype = torch.float32 + elif args.prior_generation_precision == "fp16": + torch_dtype = torch.float16 + elif args.prior_generation_precision == "bf16": + torch_dtype = torch.bfloat16 + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + torch_dtype=torch_dtype, + safety_checker=None, + revision=args.revision, + ) + pipeline.set_progress_bar_config(disable=True) + + num_new_images = args.num_class_images - cur_class_images + logger.info(f"Number of class images to sample: {num_new_images}.") + + sample_dataset = PromptDataset(args.class_prompt, num_new_images) + sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size) + + sample_dataloader = accelerator.prepare(sample_dataloader) + pipeline.to(accelerator.device) + + for example in tqdm( + sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process + ): + images = pipeline(example["prompt"]).images + + for i, image in enumerate(images): + hash_image = hashlib.sha1(image.tobytes()).hexdigest() + image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" + image.save(image_filename) + + del pipeline + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + # Handle the repository creation + if accelerator.is_main_process: + if args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + if args.push_to_hub: + repo_id = create_repo( + repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token + ).repo_id 
+ + # Load the tokenizer + if args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False) + elif args.pretrained_model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="tokenizer", + revision=args.revision, + use_fast=False, + ) + + # import correct text encoder class + text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision) + + # Load scheduler and models + noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") + text_encoder = text_encoder_cls.from_pretrained( + args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision + ) + + if model_has_vae(args): + vae = AutoencoderKL.from_pretrained( + args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision + ) + else: + vae = None + + unet = UNet2DConditionModel.from_pretrained( + args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision + ) + teacher_unet = UNet2DConditionModel.from_pretrained( + args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision + ) + + #Turn off gradients for the teacher + for param in teacher_unet.parameters(): + param.requires_grad = False + + # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format + def save_model_hook(models, weights, output_dir): + for model in models: + sub_dir = "unet" if isinstance(model, type(accelerator.unwrap_model(unet))) else "text_encoder" + model.save_pretrained(os.path.join(output_dir, sub_dir)) + + # make sure to pop weight so that corresponding model is not saved again + weights.pop() + + def load_model_hook(models, input_dir): + while len(models) > 0: + # pop models so that they are not loaded again + model = models.pop() + + if isinstance(model, type(accelerator.unwrap_model(text_encoder))): + # load transformers style into model + load_model = text_encoder_cls.from_pretrained(input_dir, subfolder="text_encoder") + model.config = load_model.config + else: + # load diffusers style into model + load_model = UNet2DConditionModel.from_pretrained(input_dir, subfolder="unet") + model.register_to_config(**load_model.config) + + model.load_state_dict(load_model.state_dict()) + del load_model + + accelerator.register_save_state_pre_hook(save_model_hook) + accelerator.register_load_state_pre_hook(load_model_hook) + + if vae is not None: + vae.requires_grad_(False) + + if not args.train_text_encoder: + text_encoder.requires_grad_(False) + + if args.enable_xformers_memory_efficient_attention: + if is_xformers_available(): + import xformers + + xformers_version = version.parse(xformers.__version__) + if xformers_version == version.parse("0.0.16"): + logger.warn( + "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." + ) + unet.enable_xformers_memory_efficient_attention() + else: + raise ValueError("xformers is not available. 
Make sure it is installed correctly") + + if args.gradient_checkpointing: + unet.enable_gradient_checkpointing() + if args.train_text_encoder: + text_encoder.gradient_checkpointing_enable() + + # Check that all trainable models are in full precision + low_precision_error_string = ( + "Please make sure to always have all model weights in full float32 precision when starting training - even if" + " doing mixed precision training. copy of the weights should still be float32." + ) + + if accelerator.unwrap_model(unet).dtype != torch.float32: + raise ValueError( + f"Unet loaded as datatype {accelerator.unwrap_model(unet).dtype}. {low_precision_error_string}" + ) + + if args.train_text_encoder and accelerator.unwrap_model(text_encoder).dtype != torch.float32: + raise ValueError( + f"Text encoder loaded as datatype {accelerator.unwrap_model(text_encoder).dtype}." + f" {low_precision_error_string}" + ) + + # Enable TF32 for faster training on Ampere GPUs, + # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices + if args.allow_tf32: + torch.backends.cuda.matmul.allow_tf32 = True + + if args.scale_lr: + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes + ) + + # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs + if args.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError( + "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." + ) + + optimizer_class = bnb.optim.AdamW8bit + else: + optimizer_class = torch.optim.AdamW + + # Optimizer creation + params_to_optimize = ( + itertools.chain(unet.parameters(), text_encoder.parameters()) if args.train_text_encoder else unet.parameters() + ) + optimizer = optimizer_class( + params_to_optimize, + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) + + if args.pre_compute_text_embeddings: + + def compute_text_embeddings(prompt): + with torch.no_grad(): + text_inputs = tokenize_prompt(tokenizer, prompt, tokenizer_max_length=args.tokenizer_max_length) + prompt_embeds = encode_prompt( + text_encoder, + text_inputs.input_ids, + text_inputs.attention_mask, + text_encoder_use_attention_mask=args.text_encoder_use_attention_mask, + ) + + return prompt_embeds + + pre_computed_encoder_hidden_states = compute_text_embeddings(args.default_prompt) + validation_prompt_negative_prompt_embeds = compute_text_embeddings("") + + if args.validation_prompt is not None: + validation_prompt_encoder_hidden_states = compute_text_embeddings(args.validation_prompt) + else: + validation_prompt_encoder_hidden_states = None + + if args.default_prompt is not None: + pre_computed_instance_prompt_encoder_hidden_states = compute_text_embeddings(args.default_prompt) + else: + pre_computed_instance_prompt_encoder_hidden_states = None + + text_encoder = None + tokenizer = None + + gc.collect() + torch.cuda.empty_cache() + else: + pre_computed_encoder_hidden_states = None + validation_prompt_encoder_hidden_states = None + validation_prompt_negative_prompt_embeds = None + pre_computed_instance_prompt_encoder_hidden_states = None + + from datasets import load_dataset + dataset_hf = load_dataset('poloclub/diffusiondb', '2m_first_10k') + raw_train_dataset = dataset_hf['train'] + + #Dataset and DataLoaders creation: + train_dataset = DreamBoothDataset( + instance_prompts=raw_train_dataset['prompt'], + 
instance_images=raw_train_dataset['image'], + tokenizer=tokenizer, + size=args.resolution, + center_crop=args.center_crop, + encoder_hidden_states=pre_computed_encoder_hidden_states, + instance_prompt_encoder_hidden_states=pre_computed_instance_prompt_encoder_hidden_states, + tokenizer_max_length=args.tokenizer_max_length, + ) + + train_dataloader = torch.utils.data.DataLoader( + train_dataset, + batch_size=args.train_batch_size, + shuffle=True, + collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation), + num_workers=args.dataloader_num_workers, + ) + + # Scheduler and math around the number of training steps. + overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + args.lr_scheduler, + optimizer=optimizer, + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, + num_cycles=args.lr_num_cycles, + power=args.lr_power, + ) + + # Prepare everything with our `accelerator`. + if args.train_text_encoder: + unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, text_encoder, optimizer, train_dataloader, lr_scheduler + ) + else: + unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, optimizer, train_dataloader, lr_scheduler + ) + + teacher_unet.to(accelerator.device) + + # For mixed precision training we cast the text_encoder and vae weights to half-precision + # as these models are only used for inference, keeping weights in full precision is not required. + weight_dtype = torch.float32 + if accelerator.mixed_precision == "fp16": + weight_dtype = torch.float16 + elif accelerator.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + + # Move vae and text_encoder to device and cast to weight_dtype + if vae is not None: + vae.to(accelerator.device, dtype=weight_dtype) + + if not args.train_text_encoder and text_encoder is not None: + text_encoder.to(accelerator.device, dtype=weight_dtype) + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. + if accelerator.is_main_process: + accelerator.init_trackers("dreambooth", config=vars(args)) + + # Train! + total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num batches each epoch = {len(train_dataloader)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + global_step = 0 + first_epoch = 0 + + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint != "latest": + path = os.path.basename(args.resume_from_checkpoint) + else: + # Get the mos recent checkpoint + dirs = os.listdir(args.output_dir) + dirs = [d for d in dirs if d.startswith("checkpoint")] + dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) + path = dirs[-1] if len(dirs) > 0 else None + + if path is None: + accelerator.print( + f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run." + ) + args.resume_from_checkpoint = None + else: + accelerator.print(f"Resuming from checkpoint {path}") + accelerator.load_state(os.path.join(args.output_dir, path)) + global_step = int(path.split("-")[1]) + + resume_global_step = global_step * args.gradient_accumulation_steps + first_epoch = global_step // num_update_steps_per_epoch + resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps) + + # Only show the progress bar once on each machine. + progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process) + progress_bar.set_description("Steps") + + for epoch in range(first_epoch, args.num_train_epochs): + + unet.train() + print("epoch:", epoch) + if args.train_text_encoder: + text_encoder.train() + + # For each prompt* + for step, batch in enumerate(train_dataloader): + # Skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step: + if step % args.gradient_accumulation_steps == 0: + progress_bar.update(1) + continue + + with accelerator.accumulate(unet): + pixel_values = batch["pixel_values"].to(dtype=weight_dtype) + + if vae is not None: + # Convert images to latent space + model_input = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample() + model_input = model_input * vae.config.scaling_factor + else: + model_input = pixel_values + + # Sample noise that we'll add to the model input + if args.offset_noise: + noise = torch.randn_like(model_input) + 0.1 * torch.randn( + model_input.shape[0], model_input.shape[1], 1, 1, device=model_input.device + ) + else: + noise = torch.randn_like(model_input) + bsz = model_input.shape[0] + # Sample a random timestep for each image + timesteps = torch.randint( + 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device + ) + timesteps = timesteps.long() + + # Add noise to the model input according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps) + + # Get the text embedding for conditioning + if args.pre_compute_text_embeddings: + encoder_hidden_states = batch["input_ids"] + else: + encoder_hidden_states = encode_prompt( + text_encoder, + batch["input_ids"], + batch["attention_mask"], + text_encoder_use_attention_mask=args.text_encoder_use_attention_mask, + ) + + encoder_hidden_states_uncond = encode_prompt( + text_encoder, + batch["uncond_ids"], + batch["uncond_attention_mask"], + text_encoder_use_attention_mask=args.text_encoder_use_attention_mask, + ) + + # Predict the student noise residual + model_pred = 
unet(noisy_model_input, timesteps, encoder_hidden_states).sample #student_noise_pred + # The teacher noise residual is based on the inference pipeline: uncond_noise +gc * (cond_noise - uncond_noise) + gc = 7.5 + teacher_cond_noise = teacher_unet(noisy_model_input, timesteps, encoder_hidden_states).sample + teacher_uncond_noise = teacher_unet(noisy_model_input, timesteps, encoder_hidden_states_uncond).sample + teacher_noise_pred = teacher_uncond_noise + gc * (teacher_cond_noise - teacher_uncond_noise) + + if model_pred.shape[1] == 6: + model_pred, _ = torch.chunk(model_pred, 2, dim=1) + + # Get the target for loss depending on the prediction type + if noise_scheduler.config.prediction_type == "epsilon": + target = noise + elif noise_scheduler.config.prediction_type == "v_prediction": + target = noise_scheduler.get_velocity(model_input, noise, timesteps) + else: + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") + loss = F.mse_loss(model_pred.float(), teacher_noise_pred.float(), reduction="mean") + + accelerator.backward(loss) + if accelerator.sync_gradients: + params_to_clip = ( + itertools.chain(unet.parameters(), text_encoder.parameters()) + if args.train_text_encoder + else unet.parameters() + ) + accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad(set_to_none=args.set_grads_to_none) + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + global_step += 1 + + if accelerator.is_main_process: + images = [] + if global_step % args.checkpointing_steps == 0: + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") + accelerator.save_state(save_path) + logger.info(f"Saved state to {save_path}") + + if args.validation_prompt is not None and global_step % args.validation_steps == 0: + images = log_validation( + text_encoder, + tokenizer, + unet, + vae, + args, + accelerator, + weight_dtype, + epoch, + validation_prompt_encoder_hidden_states, + validation_prompt_negative_prompt_embeds, + ) + + logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) + accelerator.log(logs, step=global_step) + + if global_step >= args.max_train_steps: + break + + # Create the pipeline using using the trained modules and save it. + accelerator.wait_for_everyone() + if accelerator.is_main_process: + pipeline_args = {} + + if text_encoder is not None: + pipeline_args["text_encoder"] = accelerator.unwrap_model(text_encoder) + + if args.skip_save_text_encoder: + pipeline_args["text_encoder"] = None + + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + unet=accelerator.unwrap_model(unet), + revision=args.revision, + **pipeline_args, + ) + + # We train on the simplified learning objective. 
If we were previously predicting a variance, we need the scheduler to ignore it + scheduler_args = {} + + if "variance_type" in pipeline.scheduler.config: + variance_type = pipeline.scheduler.config.variance_type + + if variance_type in ["learned", "learned_range"]: + variance_type = "fixed_small" + + scheduler_args["variance_type"] = variance_type + + pipeline.scheduler = pipeline.scheduler.from_config(pipeline.scheduler.config, **scheduler_args) + + pipeline.save_pretrained(args.output_dir) + + if args.push_to_hub: + save_model_card( + repo_id, + images=images, + base_model=args.pretrained_model_name_or_path, + train_text_encoder=args.train_text_encoder, + prompt=args.default_prompt, + repo_folder=args.output_dir, + ) + upload_folder( + repo_id=repo_id, + folder_path=args.output_dir, + commit_message="End of training", + ignore_patterns=["step_*", "epoch_*"], + ) + + accelerator.end_training() + +if __name__ == "__main__": + args = parse_args() + main(args) From dd0f181bad814e5c8cb57e38620e766eccba79bf Mon Sep 17 00:00:00 2001 From: SuperHacker UEFI Date: Tue, 12 Dec 2023 07:05:40 +0800 Subject: [PATCH 11/58] Correction training script filename in README and Fix Bug for Step Running Time Display (#815) Co-authored-by: Lev Kurilenko <113481193+lekurile@users.noreply.github.com> --- applications/DeepSpeed-Chat/README.md | 6 +++--- applications/DeepSpeed-Chat/e2e_rlhf.py | 21 +++++++++++---------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/applications/DeepSpeed-Chat/README.md b/applications/DeepSpeed-Chat/README.md index ac9f3ab6a..91e4854fb 100644 --- a/applications/DeepSpeed-Chat/README.md +++ b/applications/DeepSpeed-Chat/README.md @@ -136,7 +136,7 @@ pip install -e . If you only have around **1-2 hour** for coffee or lunch break, you can also try to train a small/toy model with DeepSpeed-Chat. For example, we prepared a training example for a **1.3B** model with a single dataset to test our framework on your consumer-grade GPUs. The best part is that you will have your model checkpoint ready to play with when you are back from your lunch break! ```bash - python train.py --actor-model facebook/opt-1.3b --reward-model facebook/opt-350m --deployment-type single_gpu + python e2e_rlhf.py --actor-model facebook/opt-1.3b --reward-model facebook/opt-350m --deployment-type single_gpu ``` See the following table for the E2E time breakdown for training a 1.3 billion parameter ChatGPT model via DeepSpeed-Chat on a single commodity NVIDIA A6000 GPU with 48GB memory. @@ -156,7 +156,7 @@ If you only have around **1-2 hour** for coffee or lunch break, you can also try If you only have around **half a day** and only a single server node, we suggest using an example of pretrained **OPT-13B** as the actor model and OPT-350M as the reward model in the following single script to generate a final 13B ChatGPT-style model: ```bash - python train.py --actor-model facebook/opt-13b --reward-model facebook/opt-350m --deployment-type single_node + python e2e_rlhf.py --actor-model facebook/opt-13b --reward-model facebook/opt-350m --deployment-type single_node ``` See the following table for the E2E time breakdown for training a 13 billion parameter ChatGPT model via DeepSpeed-Chat on a single DGX node with 8 NVIDIA A100-40G GPUs. @@ -175,7 +175,7 @@ If you only have around **half a day** and only a single server node, we suggest Want to try different model sizes and configurations? You got it! With DeepSpeed-Chat, users can easily do that. 
For example, if you have access to multi-nodes cluster or cloud resources and prefer to train a larger and higher-quality model for your research or business, you can simply use a similar script with your desired model sizes, e.g., **66B** and GPU counts=64: ```bash - python train.py --actor-model facebook/opt-66b --reward-model facebook/opt-350m --deployment-type multi_node + python e2e_rlhf.py --actor-model facebook/opt-66b --reward-model facebook/opt-350m --deployment-type multi_node ``` See the following table for E2E time breakdown for training a 66 billion parameter ChatGPT model via DeepSpeed-Chat on 8 DGX nodes with 8 NVIDIA A100-80G GPUs/node. diff --git a/applications/DeepSpeed-Chat/e2e_rlhf.py b/applications/DeepSpeed-Chat/e2e_rlhf.py index 38db655ce..8fb928ca5 100644 --- a/applications/DeepSpeed-Chat/e2e_rlhf.py +++ b/applications/DeepSpeed-Chat/e2e_rlhf.py @@ -4,27 +4,27 @@ # DeepSpeed Team """ Run all steps with default settings: -$ python3 train.py +$ python3 e2e_rlhf.py Change the model used for each step: -$ python3 train.py --actor-model 350m --reward-model 1.3b +$ python3 e2e_rlhf.py --actor-model 350m --reward-model 1.3b Change the ZeRO stage used for actor/reward models: -$ python3 train.py --actor-zero-stage 1 --reward-zero-stage 3 +$ python3 e2e_rlhf.py --actor-zero-stage 1 --reward-zero-stage 3 Run a subset of the steps: -$ python3 train.py --step 1 2 +$ python3 e2e_rlhf.py --step 1 2 Note: Step 3 relies on models trained in Steps 1 & 2. If you have already trained these models, you can run just Step 3 and select which models from Steps 1 & 2 to use. For example, let's train models for Steps 1 & 2 using 125m and 350m models: -$ python3 train.py --step 1 2 --actor-model 125m --reward-model 125m -$ python3 train.py --step 1 2 --actor-model 350m --reward-model 350m +$ python3 e2e_rlhf.py --step 1 2 --actor-model 125m --reward-model 125m +$ python3 e2e_rlhf.py --step 1 2 --actor-model 350m --reward-model 350m Now we can run Step 3 with any combination of these models: -$ python3 train.py --step 3 --actor-model 125m --reward-model 350m -$ python3 train.py --step 3 --actor-model 350m --reward-model 125m +$ python3 e2e_rlhf.py --step 3 --actor-model 125m --reward-model 350m +$ python3 e2e_rlhf.py --step 3 --actor-model 350m --reward-model 125m """ import argparse @@ -33,6 +33,7 @@ import os import datetime import time +import sys step_dirs = { 1: "training/step1_supervised_finetuning", @@ -144,7 +145,7 @@ def verify_model(args, step_num): model_file = os.path.join(output_dir, "pytorch_model.bin") if not os.path.isfile(model_file): error_str = f"Step {step_num} model has not been trained. 
Train it with:\n" - error_str += f"python3 train.py --step {step_num}" + error_str += f"{sys.executable.split('/')[-1]} {sys.argv[0]} --step {step_num}" error_str += f" --{model_type[step_num]}-model {model_size}" raise RuntimeError(error_str) @@ -194,7 +195,7 @@ def main(args): cmd = get_cmd(args, step_num) launch_cmd(args, step_num, cmd) - step_time = int(time.time() - start_time) + step_time = int(time.time() - step_start_time) time_str = str(datetime.timedelta(seconds=step_time)) print(f"---=== Finished Step {step_num} in {time_str} ===---") From 8e4cdd88574a1b3ce2f9f73c30b7ad3434ff1866 Mon Sep 17 00:00:00 2001 From: Quentin Anthony Date: Tue, 19 Dec 2023 17:20:42 -0800 Subject: [PATCH 12/58] Improve Comms Benchmark Timing (#833) * Change to cuda event-based timing * Add event args to called funcs * Add missing comma to args --- benchmarks/communication/all_gather.py | 14 +++++++++----- benchmarks/communication/all_reduce.py | 14 +++++++++----- benchmarks/communication/all_to_all.py | 14 +++++++++----- benchmarks/communication/broadcast.py | 14 +++++++++----- benchmarks/communication/pt2pt.py | 14 +++++++++----- 5 files changed, 45 insertions(+), 25 deletions(-) diff --git a/benchmarks/communication/all_gather.py b/benchmarks/communication/all_gather.py index 7a34c0d1c..8aa33581d 100644 --- a/benchmarks/communication/all_gather.py +++ b/benchmarks/communication/all_gather.py @@ -16,7 +16,7 @@ # Run all_gather and print metrics -def timed_all_gather(input, output, args): +def timed_all_gather(input, output, start_event, end_event, args): if args.dist == 'torch': import torch.distributed as dist @@ -33,11 +33,12 @@ def timed_all_gather(input, output, args): sync_all() # time the actual comm op trials times and average it - pre = time.perf_counter() + start_event.record() for i in range(args.trials): all_gather_func(output, input, group=None, async_op=args.async_op) + end_event.record() sync_all() - duration = time.perf_counter() - pre + duration = start_event.elapsed_time(end_event) / 1000 # maintain and clean performance data avg_duration = duration / args.trials @@ -63,6 +64,9 @@ def run_all_gather(local_rank, args): global_rank = dist.get_rank() world_size = dist.get_world_size() + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + if args.scan: # Create list of message sizes M_LIST = [] @@ -92,7 +96,7 @@ def run_all_gather(local_rank, args): else: raise e sync_all() - timed_all_gather(input, output, args) + timed_all_gather(input, output, start_event, end_event, args) else: # all_gather_into_tensor saves memory if ((args.dist == 'torch' or args.dist == 'deepspeed') and dist.has_all_gather_into_tensor()): @@ -126,7 +130,7 @@ def run_all_gather(local_rank, args): raise e sync_all() - timed_all_gather(input, output, args) + timed_all_gather(input, output, start_event, end_event, args) if __name__ == "__main__": diff --git a/benchmarks/communication/all_reduce.py b/benchmarks/communication/all_reduce.py index a474a704f..b9decfd98 100644 --- a/benchmarks/communication/all_reduce.py +++ b/benchmarks/communication/all_reduce.py @@ -14,7 +14,7 @@ from deepspeed.accelerator import get_accelerator -def timed_all_reduce(input, args): +def timed_all_reduce(input, start_event, end_event, args): if args.dist == 'torch': import torch.distributed as dist elif args.dist == 'deepspeed': @@ -27,11 +27,12 @@ def timed_all_reduce(input, args): sync_all() # time the actual comm op trials times and average it - pre = time.perf_counter() + 
start_event.record() for i in range(args.trials): dist.all_reduce(input, async_op=args.async_op) + end_event.record() sync_all() - duration = time.perf_counter() - pre + duration = start_event.elapsed_time(end_event) / 1000 # maintain and clean performance data avg_duration = duration / args.trials @@ -59,6 +60,9 @@ def run_all_reduce(local_rank, args): world_size = dist.get_world_size() global_rank = dist.get_rank() + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + if args.scan: M_LIST = [] for x in (2**p for p in range(1, args.maxsize)): @@ -82,7 +86,7 @@ def run_all_reduce(local_rank, args): else: raise e sync_all() - timed_all_reduce(input, args) + timed_all_reduce(input, start_event, end_event, args) else: # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor # Don't need output tensor, so we double mem_factor @@ -104,7 +108,7 @@ def run_all_reduce(local_rank, args): else: raise e sync_all() - timed_all_reduce(input, args) + timed_all_reduce(input, start_event, end_event, args) if __name__ == "__main__": diff --git a/benchmarks/communication/all_to_all.py b/benchmarks/communication/all_to_all.py index 8735b1b4a..7eccfa824 100644 --- a/benchmarks/communication/all_to_all.py +++ b/benchmarks/communication/all_to_all.py @@ -14,7 +14,7 @@ from deepspeed.accelerator import get_accelerator -def timed_all_to_all(input, output, args): +def timed_all_to_all(input, output, start_event, end_event, args): if args.dist == 'torch': import torch.distributed as dist elif args.dist == 'deepspeed': @@ -27,11 +27,12 @@ def timed_all_to_all(input, output, args): sync_all() # time the actual comm op trials times and average it - pre = time.perf_counter() + start_event.record() for i in range(args.trials): dist.all_to_all_single(output, input, async_op=args.async_op) + end_event.record() sync_all() - duration = time.perf_counter() - pre + duration = start_event.elapsed_time(end_event) / 1000 # maintain and clean performance data avg_duration = duration / args.trials @@ -58,6 +59,9 @@ def run_all_to_all(local_rank, args): # Prepare benchmark header print_header(args, 'all_to_all') + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + if args.scan: M_LIST = [] for x in (2**p for p in range(1, args.maxsize)): @@ -83,7 +87,7 @@ def run_all_to_all(local_rank, args): else: raise e sync_all() - timed_all_to_all(input, output, args) + timed_all_to_all(input, output, start_event, end_event, args) else: # Send the biggest message size our GPUs can fit. 
If you're facing OOM errors, reduce the mem_factor elements_per_gpu = max_numel(comm_op='all_to_all', @@ -118,7 +122,7 @@ def run_all_to_all(local_rank, args): print(f"Before AllToAll Input List at rank {global_rank}: {input}") dist.barrier() - timed_all_to_all(input, output, args) + timed_all_to_all(input, output, start_event, end_event, args) if args.debug: for i in range(world_size): diff --git a/benchmarks/communication/broadcast.py b/benchmarks/communication/broadcast.py index 551c71f94..860c9555b 100644 --- a/benchmarks/communication/broadcast.py +++ b/benchmarks/communication/broadcast.py @@ -14,7 +14,7 @@ from deepspeed.accelerator import get_accelerator -def timed_broadcast(input, args): +def timed_broadcast(input, start_event, end_event, args): if args.dist == 'torch': import torch.distributed as dist elif args.dist == 'deepspeed': @@ -27,11 +27,12 @@ def timed_broadcast(input, args): sync_all() # time the actual comm op trials times and average it - pre = time.perf_counter() + start_event.record() for i in range(args.trials): dist.broadcast(input, 0, async_op=args.async_op) + end_event.record() sync_all() - duration = time.perf_counter() - pre + duration = start_event.elapsed_time(end_event) / 1000 # maintain and clean performance data avg_duration = duration / args.trials @@ -59,6 +60,9 @@ def run_broadcast(local_rank, args): world_size = dist.get_world_size() global_rank = dist.get_rank() + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + if args.scan: M_LIST = [] for x in (2**p for p in range(1, args.maxsize)): @@ -82,7 +86,7 @@ def run_broadcast(local_rank, args): else: raise e sync_all() - timed_broadcast(input, args) + timed_broadcast(input, start_event, end_event, args) else: # Send the biggest message size our GPUs can fit. 
If you're facing OOM errors, reduce the mem_factor # Don't need output tensor, so we double mem_factor @@ -102,7 +106,7 @@ def run_broadcast(local_rank, args): sync_all() return sync_all() - timed_broadcast(input, args) + timed_broadcast(input, start_event, end_event, args) if __name__ == "__main__": diff --git a/benchmarks/communication/pt2pt.py b/benchmarks/communication/pt2pt.py index 31028f99e..57eab9a66 100644 --- a/benchmarks/communication/pt2pt.py +++ b/benchmarks/communication/pt2pt.py @@ -14,7 +14,7 @@ from deepspeed.accelerator import get_accelerator -def timed_pt2pt(input, args): +def timed_pt2pt(input, start_event, end_event, args): if args.dist == 'torch': import torch.distributed as dist elif args.dist == 'deepspeed': @@ -36,7 +36,7 @@ def timed_pt2pt(input, args): sync_all() # time the actual comm op trials times and average it - pre = time.perf_counter() + start_event.record() for i in range(args.trials): if dist.get_rank() == 0: if args.async_op: @@ -49,8 +49,9 @@ def timed_pt2pt(input, args): else: dist.recv(input, src=0) + end_event.record() sync_all() - duration = time.perf_counter() - pre + duration = start_event.elapsed_time(end_event) / 1000 # maintain and clean performance data avg_duration = duration / args.trials @@ -77,6 +78,9 @@ def run_pt2pt(local_rank, args): global_rank = dist.get_rank() world_size = dist.get_world_size() + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + if args.scan: # Create list of message sizes M_LIST = [] @@ -101,7 +105,7 @@ def run_pt2pt(local_rank, args): else: raise e sync_all() - timed_pt2pt(input, args) + timed_pt2pt(input, start_event, end_event, args) else: # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor # Don't need output tensor, so double mem_factor @@ -121,7 +125,7 @@ def run_pt2pt(local_rank, args): sync_all() return sync_all() - timed_pt2pt(input, args) + timed_pt2pt(input, start_event, end_event, args) if __name__ == "__main__": From abd7502ccaab72d78ebf229cc670836acf55b9e4 Mon Sep 17 00:00:00 2001 From: A-Cepheus <60658915+A-Cepheus@users.noreply.github.com> Date: Thu, 4 Jan 2024 04:00:15 +0800 Subject: [PATCH 13/58] fix: typo in sa (#838) --- training/bing_bert/nvidia/modelingpreln.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/bing_bert/nvidia/modelingpreln.py b/training/bing_bert/nvidia/modelingpreln.py index a7e398e26..9856f0607 100755 --- a/training/bing_bert/nvidia/modelingpreln.py +++ b/training/bing_bert/nvidia/modelingpreln.py @@ -1041,7 +1041,7 @@ def forward(self, position_ids=None, inputs_embeds=None, pad_token_id=self.pad_token_id, - model_mbeddings=self.embeddings) + model_embeddings=self.embeddings) embedding_output = self.embeddings(input_ids, token_type_ids) encoded_layers = self.encoder( From ff9a0234cf22dd9af03c5c7aa8037fb9143adca6 Mon Sep 17 00:00:00 2001 From: chinainfant <38291328+chinainfant@users.noreply.github.com> Date: Thu, 4 Jan 2024 04:01:16 +0800 Subject: [PATCH 14/58] Update README.md (#827) Modify the hyperlink path, change 'training' to 'dschat' Co-authored-by: Michael Wyatt --- applications/DeepSpeed-Chat/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/applications/DeepSpeed-Chat/README.md b/applications/DeepSpeed-Chat/README.md index 91e4854fb..fa8fa9d38 100644 --- a/applications/DeepSpeed-Chat/README.md +++ b/applications/DeepSpeed-Chat/README.md @@ -251,9 +251,9 @@ bash training_scripts/opt/single_gpu/run_1.3b.sh ### 
🐼 Adding and using your own datasets in DeepSpeed-Chat -In addition to the datasets used in our example scripts, you can also add and use your own datasets. To do so, first you need to add a new Class in [training/utils/data/raw_datasets.py](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/utils/data/raw_datasets.py) to define the format when using your data. You need to make sure to follow the APIs and format defined in the PromptRawDataset class to ensure a consistent data format that DeepSpeed-Chat relies on. You can look at the existing classes to learn how to do so. +In addition to the datasets used in our example scripts, you can also add and use your own datasets. To do so, first you need to add a new Class in [dschat/utils/data/raw_datasets.py](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/dschat/utils/data/raw_datasets.py) to define the format when using your data. You need to make sure to follow the APIs and format defined in the PromptRawDataset class to ensure a consistent data format that DeepSpeed-Chat relies on. You can look at the existing classes to learn how to do so. -Second, you need to add an if condition in function get_raw_dataset in [training/utils/data/data_utils.py](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/utils/data/data_utils.py) corresponding to your new dataset. The dataset_name string in the if condition should be the dataset name you will provide as a arg for the training scripts. Last, you need to add your new dataset's dataset_name into your "--data_path" arg in your training scripts. +Second, you need to add an if condition in function get_raw_dataset in [dschat/utils/data/data_utils.py](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py) corresponding to your new dataset. The dataset_name string in the if condition should be the dataset name you will provide as a arg for the training scripts. Last, you need to add your new dataset's dataset_name into your "--data_path" arg in your training scripts. If you have downloaded huggingface datasets manually, you can add your local path into "--data_path", such as "--data_path ./relative/Dahoas/rm-static" and "--data_path /absolute/Dahoas/rm-static". Remember you should not make `data/` in your local path, it may cause an exception to `load_dataset`. One thing to note is that some datasets may only have one response instead of two responses. For those datasets, you can only use them in step 1. And in such case, you should add the dataset_name as part of the "--sft_only_data_path" arg instead of the "--data_path" arg. One thing to note is that: If you plan to only do step 1 SFT, adding more single-response datasets is definitely beneficial. However, if you do plan to do steps 2 and 3, then adding too many single-response datasets during SFT could backfire: these data could be different from the data used for steps 2/3, generating different distributions which could cause training instability/worse model quality during step 2/3. That is part of the reason why we focused on trying the datasets with two responses and the preference, and always split a dataset into all 3 steps. 
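To make the required pieces concrete, here is a minimal, illustrative sketch of such a dataset class. It follows the pattern of the existing PromptRawDataset subclasses; the dataset name, the field names, and the exact constructor signature are assumptions for illustration, so check the class definitions in `dschat/utils/data/raw_datasets.py` before copying.

```python
# Hypothetical example only: names, fields, and signatures are illustrative.
# Verify them against the PromptRawDataset class in dschat/utils/data/raw_datasets.py.
from dschat.utils.data.raw_datasets import PromptRawDataset


class MyCustomDataset(PromptRawDataset):

    def __init__(self, output_path, seed, local_rank, dataset_name):
        super().__init__(output_path, seed, local_rank, dataset_name)
        self.dataset_name = "my_org/my_dataset"        # name passed via --data_path
        self.dataset_name_clean = "my_org_my_dataset"  # used for cached split file names

    def get_train_data(self):
        return self.raw_datasets["train"]

    def get_eval_data(self):
        return self.raw_datasets["test"]

    # Steps 2 and 3 expect a prompt plus a preferred (chosen) and a
    # dispreferred (rejected) response for each sample.
    def get_prompt(self, sample):
        return " Human: " + sample["prompt"] + " Assistant:"

    def get_chosen(self, sample):
        return " " + sample["chosen"]

    def get_rejected(self, sample):
        return " " + sample["rejected"]

    def get_prompt_and_chosen(self, sample):
        return self.get_prompt(sample) + self.get_chosen(sample)

    def get_prompt_and_rejected(self, sample):
        return self.get_prompt(sample) + self.get_rejected(sample)
```

With the class in place, the corresponding branch added to get_raw_dataset in `dschat/utils/data/data_utils.py` simply constructs and returns `MyCustomDataset(output_path, seed, local_rank, dataset_name)` when dataset_name matches "my_org/my_dataset".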
From 05120bbd2c42e619787d20020d64857c7ffc500f Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Wed, 10 Jan 2024 11:41:57 -0800 Subject: [PATCH 15/58] Update MII Inference Examples (#837) --- inference/mii/README.md | 2 +- inference/mii/client.py | 6 ----- inference/mii/non-persistent/README.md | 28 ++++++++++++++++++++++++ inference/mii/non-persistent/falcon.py | 6 +++++ inference/mii/non-persistent/llama2.py | 6 +++++ inference/mii/non-persistent/mixtral.py | 6 +++++ inference/mii/non-persistent/pipeline.py | 19 ++++++++++++++++ inference/mii/persistent/README.md | 28 ++++++++++++++++++++++++ inference/mii/persistent/client.py | 18 +++++++++++++++ inference/mii/persistent/serve.py | 13 +++++++++++ inference/mii/persistent/terminate.py | 11 ++++++++++ inference/mii/pipeline.py | 6 ----- inference/mii/requirements.txt | 2 +- inference/mii/serve.py | 3 --- inference/mii/terminate.py | 4 ---- 15 files changed, 137 insertions(+), 21 deletions(-) delete mode 100644 inference/mii/client.py create mode 100644 inference/mii/non-persistent/README.md create mode 100644 inference/mii/non-persistent/falcon.py create mode 100644 inference/mii/non-persistent/llama2.py create mode 100644 inference/mii/non-persistent/mixtral.py create mode 100644 inference/mii/non-persistent/pipeline.py create mode 100644 inference/mii/persistent/README.md create mode 100644 inference/mii/persistent/client.py create mode 100644 inference/mii/persistent/serve.py create mode 100644 inference/mii/persistent/terminate.py delete mode 100644 inference/mii/pipeline.py delete mode 100644 inference/mii/serve.py delete mode 100644 inference/mii/terminate.py diff --git a/inference/mii/README.md b/inference/mii/README.md index d701d5537..dfc9fda2b 100644 --- a/inference/mii/README.md +++ b/inference/mii/README.md @@ -2,4 +2,4 @@ Install the requirements by running `pip install -r requirements.txt`. -Once [DeepSpeed-MII](https://github.com/microsoft/deepspeed-mii) is installed you have two options for deployment: an interactive non-persistent pipeline or a persistent serving deployment. For details on these files please refer to the [Getting Started guide for MII](https://github.com/microsoft/deepspeed-mii#getting-started-with-mii). +Once [DeepSpeed-MII](https://github.com/microsoft/deepspeed-mii) is installed you have two options for deployment: an interactive non-persistent pipeline or a persistent serving deployment. See the scripts in [non-persistent](./non-persistent/) and [persistent](./persistent/) for examples. Details on the code implemented in these scripts can be found on our [Getting Started guide for MII](https://github.com/microsoft/deepspeed-mii#getting-started-with-mii). diff --git a/inference/mii/client.py b/inference/mii/client.py deleted file mode 100644 index 6d19fec3a..000000000 --- a/inference/mii/client.py +++ /dev/null @@ -1,6 +0,0 @@ -import mii - -client = mii.client("mistralai/Mistral-7B-v0.1") -output = client.generate("Deepspeed is", max_new_tokens=128) - -print(output) diff --git a/inference/mii/non-persistent/README.md b/inference/mii/non-persistent/README.md new file mode 100644 index 000000000..b9ca31acb --- /dev/null +++ b/inference/mii/non-persistent/README.md @@ -0,0 +1,28 @@ +# Non-Persistent Pipeline Examples + +The `pipeline.py` script can be used to run any of the [supported +models](https://github.com/microsoft/DeepSpeed-mii#supported-models). Provide +the HuggingFace model name, maximum generated tokens, and prompt(s). 
The +generated responses will be printed in the terminal: + +```shell +$ python pipeline.py --model "mistralai/Mistral-7B-v0.1" --max-new-tokens 128 --prompts "DeepSpeed is" "Seattle is" +``` + +Tensor-parallelism can be controlled using the `deepspeed` launcher and setting +`--num_gpus`: + +```shell +$ deepspeed --num_gpus 2 pipeline.py +``` + +## Model-Specific Examples + +For convenience, we also provide a set of scripts to quickly test the MII +Pipeline with some popular text-generation models: + +| Model | Launch command | +|-------|----------------| +| [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b) | `$ python llama2.py` | +| [tiiuae/falcon-7b](https://huggingface.co/tiiuae/falcon-7b) | `$ python falcon.py` | +| [mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) | `$ deepspeed --num_gpus 2 mixtral.py` | \ No newline at end of file diff --git a/inference/mii/non-persistent/falcon.py b/inference/mii/non-persistent/falcon.py new file mode 100644 index 000000000..7dfc05ecb --- /dev/null +++ b/inference/mii/non-persistent/falcon.py @@ -0,0 +1,6 @@ +import mii + +pipe = mii.pipeline("tiiuae/falcon-7b") +responses = pipe("DeepSpeed is", max_new_tokens=128, return_full_text=True) +if pipe.is_rank_0: + print(responses[0]) diff --git a/inference/mii/non-persistent/llama2.py b/inference/mii/non-persistent/llama2.py new file mode 100644 index 000000000..1c519204e --- /dev/null +++ b/inference/mii/non-persistent/llama2.py @@ -0,0 +1,6 @@ +import mii + +pipe = mii.pipeline("meta-llama/Llama-2-7b-hf") +responses = pipe("DeepSpeed is", max_new_tokens=128, return_full_text=True) +if pipe.is_rank_0: + print(responses[0]) diff --git a/inference/mii/non-persistent/mixtral.py b/inference/mii/non-persistent/mixtral.py new file mode 100644 index 000000000..a429ea5e1 --- /dev/null +++ b/inference/mii/non-persistent/mixtral.py @@ -0,0 +1,6 @@ +import mii + +pipe = mii.pipeline("mistralai/Mixtral-8x7B-v0.1") +responses = pipe("DeepSpeed is", max_new_tokens=128, return_full_text=True) +if pipe.is_rank_0: + print(responses[0]) diff --git a/inference/mii/non-persistent/pipeline.py b/inference/mii/non-persistent/pipeline.py new file mode 100644 index 000000000..c7baa6716 --- /dev/null +++ b/inference/mii/non-persistent/pipeline.py @@ -0,0 +1,19 @@ +import argparse +import mii + +parser = argparse.ArgumentParser() +parser.add_argument("--model", type=str, default="mistralai/Mistral-7B-v0.1") +parser.add_argument( + "--prompts", type=str, nargs="+", default=["DeepSpeed is", "Seattle is"] +) +parser.add_argument("--max-new-tokens", type=int, default=128) +args = parser.parse_args() + +pipe = mii.pipeline(args.model) +responses = pipe( + args.prompts, max_new_tokens=args.max_new_tokens, return_full_text=True +) + +if pipe.is_rank_0: + for r in responses: + print(r, "\n", "-" * 80, "\n") diff --git a/inference/mii/persistent/README.md b/inference/mii/persistent/README.md new file mode 100644 index 000000000..e9cb2dc20 --- /dev/null +++ b/inference/mii/persistent/README.md @@ -0,0 +1,28 @@ +# Persistent Deployment Examples + +The `serve.py` script can be used to create an inference server for any of the +[supported models](https://github.com/microsoft/DeepSpeed-mii#supported-models). 
+Provide the HuggingFace model name and tensor-parallelism (use the default
+values and run `$ python serve.py` for a single-GPU
+[mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
+deployment):
+
+```shell
+$ python serve.py --model "mistralai/Mistral-7B-v0.1" --tensor-parallel 1
+```
+
+Connect to the persistent deployment and generate text with `client.py`. Provide
+the HuggingFace model name, maximum generated tokens, and prompt(s) (or if you
+are using the default values, run `$ python client.py`):
+
+```shell
+$ python client.py --model "mistralai/Mistral-7B-v0.1" --max-new-tokens 128 --prompts "DeepSpeed is" "Seattle is"
+```
+
+Shut down the persistent deployment with `terminate.py`. Provide the HuggingFace
+model name (or if you are using the default values, run `$ python
+terminate.py`):
+
+```shell
+$ python terminate.py --model "mistralai/Mistral-7B-v0.1"
+```
\ No newline at end of file
diff --git a/inference/mii/persistent/client.py b/inference/mii/persistent/client.py
new file mode 100644
index 000000000..561744a8f
--- /dev/null
+++ b/inference/mii/persistent/client.py
@@ -0,0 +1,18 @@
+import argparse
+import mii
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--model", type=str, default="mistralai/Mistral-7B-v0.1")
+parser.add_argument(
+    "--prompts", type=str, nargs="+", default=["DeepSpeed is", "Seattle is"]
+)
+parser.add_argument("--max-new-tokens", type=int, default=128)
+args = parser.parse_args()
+
+client = mii.client(args.model)
+responses = client(
+    args.prompts, max_new_tokens=args.max_new_tokens, return_full_text=True
+)
+
+for r in responses:
+    print(r, "\n", "-" * 80, "\n")
diff --git a/inference/mii/persistent/serve.py b/inference/mii/persistent/serve.py
new file mode 100644
index 000000000..dd31f983a
--- /dev/null
+++ b/inference/mii/persistent/serve.py
@@ -0,0 +1,13 @@
+import argparse
+import mii
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--model", type=str, default="mistralai/Mistral-7B-v0.1")
+parser.add_argument("--tensor-parallel", type=int, default=1)
+args = parser.parse_args()
+
+mii.serve(args.model, tensor_parallel=args.tensor_parallel)
+
+print(f"Serving model {args.model} on {args.tensor_parallel} GPU(s).")
+print(f"Run `python client.py --model {args.model}` to connect.")
+print(f"Run `python terminate.py --model {args.model}` to terminate.")
diff --git a/inference/mii/persistent/terminate.py b/inference/mii/persistent/terminate.py
new file mode 100644
index 000000000..3c430d934
--- /dev/null
+++ b/inference/mii/persistent/terminate.py
@@ -0,0 +1,11 @@
+import argparse
+import mii
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--model", type=str, default="mistralai/Mistral-7B-v0.1")
+args = parser.parse_args()
+
+client = mii.client(args.model)
+client.terminate_server()
+
+print(f"Terminated server for model {args.model}.")
diff --git a/inference/mii/pipeline.py b/inference/mii/pipeline.py
deleted file mode 100644
index dcf9e8b03..000000000
--- a/inference/mii/pipeline.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from mii import pipeline
-
-pipe = pipeline("mistralai/Mistral-7B-v0.1")
-output = pipe(["Hello, my name is", "DeepSpeed is"], max_new_tokens=128)
-
-print(output)
diff --git a/inference/mii/requirements.txt b/inference/mii/requirements.txt
index 07d9f7e16..48f92a784 100644
--- a/inference/mii/requirements.txt
+++ b/inference/mii/requirements.txt
@@ -1 +1 @@
-mii>=0.1.0
+deepspeed-mii>=0.1.3
diff --git a/inference/mii/serve.py b/inference/mii/serve.py
deleted file mode 100644
index 09c0c306c..000000000 --- a/inference/mii/serve.py +++ /dev/null @@ -1,3 +0,0 @@ -import mii - -mii.serve("mistralai/Mistral-7B-v0.1") diff --git a/inference/mii/terminate.py b/inference/mii/terminate.py deleted file mode 100644 index 2a7ed3211..000000000 --- a/inference/mii/terminate.py +++ /dev/null @@ -1,4 +0,0 @@ -import mii - -client = mii.client("mistralai/Mistral-7B-v0.1") -client.terminate_server() From 6c31d8ddee9e57f6202aeb4ee3c86f2fbd93d4c6 Mon Sep 17 00:00:00 2001 From: foin6 <61218792+foin6@users.noreply.github.com> Date: Fri, 12 Jan 2024 02:22:40 +0800 Subject: [PATCH 16/58] Modify codes so that different accelerators can be called according to specific device conditions (#844) * modify inference-test.py to meet with the requirement of using Intel's device * modify ds-hf-compare.py to meet with the requirement of using Intel's device * use deepspeed.accelerator.get_accelerator() to replace the original hard code about cuda to access and enable the accelerators available(not just Nvidia's GPU) in the current device * column 117: self.model.xpu().to(self.device)--->self.model.to(self.device) for generalization. * For upstream, use get_accelerator() to hide backend. Add bf16 dtype for cpu. * Update README.md * Delete redundant comment code * Delete +123 in README title * delete checkpoints.json * modify inference-test.py * modify inference-test.py v2 * modify inference.py v3 * add bfloat16 for cpu * fix an error in setup commands with conda --------- Co-authored-by: Olatunji Ruwase --- inference/huggingface/text-generation/README.md | 2 +- inference/huggingface/text-generation/arguments.py | 2 +- inference/huggingface/text-generation/ds-hf-compare.py | 5 +++-- inference/huggingface/text-generation/inference-test.py | 5 +++-- inference/huggingface/text-generation/utils.py | 5 +++-- 5 files changed, 11 insertions(+), 8 deletions(-) diff --git a/inference/huggingface/text-generation/README.md b/inference/huggingface/text-generation/README.md index 8019aa298..318e37416 100644 --- a/inference/huggingface/text-generation/README.md +++ b/inference/huggingface/text-generation/README.md @@ -20,7 +20,7 @@ If you are using conda, the following works: conda create -c conda-forge -n deepspeed python=3.10 conda activate deepspeed pip install -r requirements.txt -deepspeed --num_gpus 1 inference-test.py --name bigscience/bloom-3b --batch_size 2 +deepspeed --num_gpus 1 inference-test.py --model bigscience/bloom-3b --batch_size 2 # Inference Test diff --git a/inference/huggingface/text-generation/arguments.py b/inference/huggingface/text-generation/arguments.py index b50198ff9..a6dade23f 100644 --- a/inference/huggingface/text-generation/arguments.py +++ b/inference/huggingface/text-generation/arguments.py @@ -7,7 +7,7 @@ parser.add_argument("--checkpoint_path", required=False, default=None, type=str, help="model checkpoint path") parser.add_argument("--save_mp_checkpoint_path", required=False, default=None, type=str, help="save-path to store the new model checkpoint") parser.add_argument("--batch_size", default=1, type=int, help="batch size") -parser.add_argument("--dtype", default="float16", type=str, choices=["float32", "float16", "int8"], help="data-type") +parser.add_argument("--dtype", default="float16", type=str, choices=["float32", "float16", "int8", "bfloat16"], help="data-type") parser.add_argument("--hf_baseline", action='store_true', help="disable DeepSpeed inference") parser.add_argument("--use_kernel", action='store_true', help="enable kernel-injection") 
parser.add_argument("--max_tokens", default=1024, type=int, help="maximum tokens used for the text-generation KV-cache") diff --git a/inference/huggingface/text-generation/ds-hf-compare.py b/inference/huggingface/text-generation/ds-hf-compare.py index 378a13940..27f307a32 100644 --- a/inference/huggingface/text-generation/ds-hf-compare.py +++ b/inference/huggingface/text-generation/ds-hf-compare.py @@ -3,11 +3,12 @@ from transformers import pipeline from difflib import SequenceMatcher from argparse import ArgumentParser +from deepspeed.accelerator import get_accelerator parser = ArgumentParser() parser.add_argument("--model", required=True, type=str, help="model_name") -parser.add_argument("--dtype", default="float16", type=str, choices=["float32", "float16", "int8"], help="data-type") +parser.add_argument("--dtype", default="float16", type=str, choices=["float32", "float16", "int8", "bfloat16"], help="data-type") parser.add_argument("--num_inputs", default=1, type=int, help="number of test inputs") parser.add_argument("--min_length", default=200, type=int, help="minimum tokens generated") parser.add_argument("--max_length", default=300, type=int, help="maximum tokens generated") @@ -73,7 +74,7 @@ def string_similarity(str1, str2): inputs = test_inputs data_type = getattr(torch, args.dtype) -pipe = pipeline('text-generation', args.model, torch_dtype=data_type, device=0) +pipe = pipeline('text-generation', args.model, torch_dtype=data_type, device=torch.device(get_accelerator().device_name(0))) base_out_list = [] match_count=0 diff --git a/inference/huggingface/text-generation/inference-test.py b/inference/huggingface/text-generation/inference-test.py index 827d8db35..0ba3b20cd 100644 --- a/inference/huggingface/text-generation/inference-test.py +++ b/inference/huggingface/text-generation/inference-test.py @@ -6,6 +6,7 @@ import time from utils import DSPipeline, Performance from deepspeed.runtime.utils import see_memory_usage +from deepspeed.accelerator import get_accelerator from arguments import parser args = parser.parse_args() @@ -76,12 +77,12 @@ iters = 30 if args.test_performance else 2 #warmup times = [] for i in range(iters): - torch.cuda.synchronize() + get_accelerator().synchronize() start = time.time() outputs = pipe(inputs, num_tokens=args.max_new_tokens, do_sample=(not args.greedy)) - torch.cuda.synchronize() + get_accelerator().synchronize() end = time.time() times.append(end - start) print(f"generation time is {times[1]} sec") diff --git a/inference/huggingface/text-generation/utils.py b/inference/huggingface/text-generation/utils.py index 173eac039..bf727fefc 100644 --- a/inference/huggingface/text-generation/utils.py +++ b/inference/huggingface/text-generation/utils.py @@ -10,6 +10,7 @@ import torch from huggingface_hub import snapshot_download from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizerFast +from deepspeed.accelerator import get_accelerator class DSPipeline(): ''' @@ -34,7 +35,7 @@ def __init__(self, elif device < 0: self.device = torch.device("cpu") else: - self.device = torch.device(f"cuda:{device}") + self.device = torch.device(get_accelerator().device_name(device)) # the Deepspeed team made these so it's super fast to load (~1 minute), rather than wait 10-20min loading time. 
self.tp_presharded_models = ["microsoft/bloom-deepspeed-inference-int8", "microsoft/bloom-deepspeed-inference-fp16"] @@ -110,7 +111,7 @@ def generate_outputs(self, if torch.is_tensor(input_tokens[t]): input_tokens[t] = input_tokens[t].to(self.device) - self.model.cuda().to(self.device) + self.model.to(self.device) if isinstance(self.tokenizer, LlamaTokenizerFast): # NOTE: Check if Llamma can work w/ **input_tokens From 57dd8fbc58ffba181b2c5bd94ba3d602d7c9c830 Mon Sep 17 00:00:00 2001 From: Max Kovalenko <75629718+deepcharm@users.noreply.github.com> Date: Wed, 17 Jan 2024 02:59:15 +0200 Subject: [PATCH 17/58] deepspeed-chat: Support zero3 params initialization in the last LN (#839) Zero3 requires that gathering partitioned parameters before they can be accessed. We enable that mechanism for initialization of the last LN weight and bias. Co-authored-by: Olatunji Ruwase --- .../training/step2_reward_model_finetuning/main.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py index 04f178504..8cdf5644d 100644 --- a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py +++ b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py @@ -258,8 +258,17 @@ def main(): # the LN that precedes it. force_optimize_params = [] if "bigscience/bloom-" in args.model_name_or_path: - torch.nn.init.ones_(rm_model.rwtransformer.ln_f.weight) - torch.nn.init.zeros_(rm_model.rwtransformer.ln_f.bias) + zero_init_enabled = (args.zero_stage == 3) + params = [ + rm_model.rwtranrsformer.ln_f.weight, + rm_model.rwtranrsformer.ln_f.bias + ] + with deepspeed.zero.GatheredParameters(params, + modifier_rank=0, + enabled=zero_init_enabled): + if deepspeed.comm.get_rank() == 0 or not zero_init_enabled: + torch.nn.init.ones_(rm_model.rwtransformer.ln_f.weight) + torch.nn.init.zeros_(rm_model.rwtransformer.ln_f.bias) force_optimize_params.extend( ['rwtransformer.ln_f.weight', 'rwtransformer.ln_f.bias']) From 8216f5f18f8b5b5d09e920de55347f41970bdb8f Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Fri, 19 Jan 2024 15:05:10 -0800 Subject: [PATCH 18/58] Generalize MII benchmark for any model (#851) --- benchmarks/inference/mii/README.md | 49 ++-- benchmarks/inference/mii/plot_th_lat.py | 116 --------- benchmarks/inference/mii/requirements.txt | 5 + benchmarks/inference/mii/run_all.sh | 32 +-- benchmarks/inference/mii/run_all_replica.sh | 25 -- benchmarks/inference/mii/run_all_vllm.sh | 26 -- benchmarks/inference/mii/run_benchmark.py | 40 +++ .../inference/mii/run_benchmark_client.sh | 23 -- benchmarks/inference/mii/run_example.sh | 28 +-- benchmarks/inference/mii/server.py | 83 ------- benchmarks/inference/mii/src/__init__.py | 4 + .../client.py} | 215 ++++++++-------- benchmarks/inference/mii/src/defaults.py | 58 +++++ .../{ => src}/plot_effective_throughput.py | 170 +++++++++---- .../mii/{ => src}/plot_latency_percentile.py | 46 ++-- .../mii/{ => src}/plot_repl_scale.py | 30 ++- benchmarks/inference/mii/src/plot_th_lat.py | 130 ++++++++++ .../inference/mii/{ => src}/plot_tp_sizes.py | 45 +++- .../mii/{ => src}/postprocess_results.py | 81 ++++-- .../mii/{ => src}/random_query_generator.py | 17 +- .../inference/mii/{ => src}/sample_input.py | 8 +- benchmarks/inference/mii/src/server.py | 122 +++++++++ benchmarks/inference/mii/src/utils.py | 235 ++++++++++++++++++ 23 files changed, 1037 insertions(+), 551 
deletions(-) delete mode 100644 benchmarks/inference/mii/plot_th_lat.py create mode 100644 benchmarks/inference/mii/requirements.txt delete mode 100644 benchmarks/inference/mii/run_all_replica.sh delete mode 100644 benchmarks/inference/mii/run_all_vllm.sh create mode 100644 benchmarks/inference/mii/run_benchmark.py delete mode 100644 benchmarks/inference/mii/run_benchmark_client.sh delete mode 100644 benchmarks/inference/mii/server.py create mode 100644 benchmarks/inference/mii/src/__init__.py rename benchmarks/inference/mii/{run_benchmark_client.py => src/client.py} (51%) create mode 100644 benchmarks/inference/mii/src/defaults.py rename benchmarks/inference/mii/{ => src}/plot_effective_throughput.py (53%) rename benchmarks/inference/mii/{ => src}/plot_latency_percentile.py (72%) rename benchmarks/inference/mii/{ => src}/plot_repl_scale.py (81%) create mode 100644 benchmarks/inference/mii/src/plot_th_lat.py rename benchmarks/inference/mii/{ => src}/plot_tp_sizes.py (73%) rename benchmarks/inference/mii/{ => src}/postprocess_results.py (53%) rename benchmarks/inference/mii/{ => src}/random_query_generator.py (72%) rename benchmarks/inference/mii/{ => src}/sample_input.py (99%) create mode 100644 benchmarks/inference/mii/src/server.py create mode 100644 benchmarks/inference/mii/src/utils.py diff --git a/benchmarks/inference/mii/README.md b/benchmarks/inference/mii/README.md index d9e475cdb..092ac4867 100644 --- a/benchmarks/inference/mii/README.md +++ b/benchmarks/inference/mii/README.md @@ -2,38 +2,59 @@ ## Run the Benchmark -The benchmarking scripts use DeepSpeed-FastGen in the persistent mode. -You can start the server with the command below: +The benchmarking scripts use DeepSpeed-FastGen in the persistent mode. You can +run the benchmark using `run_benchmark.py`. This script will run several +combinations of inference servers and clients with different tensor parallel +size, number of model replicas (MII only), number of clients, prompt length, and +max new tokens values. By default, the benchmark will run with the `meta-llama/Llama-2-7b-hf` model. ```bash -python server.py [options] start +python run_benchmark.py ``` -Use the -h option to view all available options. To stop the server, use this command: +Use the -h option to view all available options. Several models have pre-defined +default values, including `meta-llama/Llama-2-{7|13|70}b-hf`, +`tiiuae/falcon-{40|180}B`, `microsoft/phi-2`, and `mistralai/Mixtral-8x7B-v0.1`. +These defaults can be overridden if provided to the `run_benchmark.py` script. +For example, to run `meta-llama/Llama-13b-hf` with a tensor parallel size of `1` +and `2` (instead of the default `1`, `2`, and `4`): -```bash -python server.py stop +```bash +python run_benchmark.py --tp_size 1 2 ``` -Once the server is up and running, initiate the client using the command below. The -h option will display all the possible options. +By default the benchmark runs with DeepSpeed-MII as the backend inference +server. To change the backend to vLLM, provide the `--vllm` flag: ```bash -python run_benchmark_client.py [options] +python run_benchmark.py --vllm ``` -The run_all.sh script performs benchmarks across various model sizes and client numbers. For VLLM benchmarks, use the run_all_vllm.sh script. Results are logged in a directory named logs.[BENCHMARK_PARAMETERS]. +The run_all.sh script performs benchmarks across various models, client numbers, +tensor parallel sizes, etc. This script is intended to be run on a system with +8xA100 (80GB) GPUs available. 
It will run all the benchmarks (including vLLM) +and collect the data used in our [DeepSpeed-Fastgen +blogs](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen). +Results are collected in `./results/`. ## Analyze the Benchmark Results -The scripts mentioned below were used for generating the plots featured in our blog. Specify the root directory for log files using --log_dir. +The scripts mentioned below were used for generating the plots featured in our +blog. Specify the root directory for log files using `--log_dir`. The generated +figures will be saved to `./plots/` -- `plot_th_lat.py`: This script generates charts for throughput and latency across different model sizes and client counts. -- `plot_effective_throughput.py`: Use this to chart effective throughput. -- `plot_latency_percentile.py`: This script will plot the 50th, 90th, and 95th percentile latencies. +- `src/plot_th_lat.py`: This script generates charts for throughput and latency across different model sizes and client counts. +- `src/plot_effective_throughput.py`: Use this to chart effective throughput. +- `src/plot_latency_percentile.py`: This script will plot the 50th, 90th, and 95th percentile latencies. ## Running an End-to-End Example -To quickly experience the end-to-end process of running our benchmark and getting results, you can use the `run_example.sh`. This script is designed to execute the benchmark with a specific configuration. The plots below will be generated in the charts directory. These plots show the performance as depicted in figure 8 of our blog [post.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen#f-other-hardware-platforms) +To quickly experience the end-to-end process of running our benchmark and +getting results, you can use the `run_example.sh`. This script is designed to +execute the benchmark with a specific configuration. The plots below will be +generated in the `./plots/` directory. 
These plots show the performance as +depicted in figure 8 of our blog +[post.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen#f-other-hardware-platforms) ```bash bash run_example.sh diff --git a/benchmarks/inference/mii/plot_th_lat.py b/benchmarks/inference/mii/plot_th_lat.py deleted file mode 100644 index e99dc5a3e..000000000 --- a/benchmarks/inference/mii/plot_th_lat.py +++ /dev/null @@ -1,116 +0,0 @@ -import glob -import matplotlib.pyplot as plt -import argparse -from pathlib import Path -import numpy as np -import pdb -from postprocess_results import read_json, get_summary - -bs = 768 - -tp_sizes_test = { - "7b": [1] -} - -tp_sizes_all = { - "7b": [1], - "70b": [4, 8], -} - -prompt_gen_pairs_test = [ - (2600, 60) -] - -prompt_gen_pairs_all = [ - (1200, 60), - (1200, 128), - (2600, 60), - (2600, 128), -] - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--test", action="store_true") - parser.add_argument("--no_vllm", action="store_true") - parser.add_argument("--log_dir", type=Path, default=".") - parser.add_argument("--out_dir", type=Path, default="charts/throughput_latency") - args = parser.parse_args() - return args - - -def extract_values(file_pattern): - files = glob.glob(file_pattern) - - print(f"Found {len(files)}") - print('\n'.join(files)) - - clients = [] - throughputs = [] - latencies = [] - for f in files: - prof_args, response_details = read_json(f) - summary = get_summary(prof_args, response_details) - clients.append(prof_args["client_num"]) - throughputs.append(summary.throughput) - latencies.append(summary.latency) - - return clients, throughputs, latencies - - -def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): - if not log_dir.exists(): - print(f"Log directory {log_dir} does not exist") - return - - if not out_dir.exists(): - out_dir.mkdir(parents=True, exist_ok=True) - - mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}/llama2-{model_size}-tp{tp}-b{bs}_c*_p{prompt}_g{gen}.json" - if not args.no_vllm: - vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json" - - _, mii_throughputs, mii_latencies = extract_values(mii_file_pattern) - if not args.no_vllm: - _, vllm_throughputs, vllm_latencies = extract_values(vllm_file_pattern) - - # Plotting the scatter plot - plt.figure(figsize=(6, 4)) - - if not args.no_vllm: - plt.scatter(vllm_throughputs, vllm_latencies, label=f"vLLM", marker="x", color="orange") - fit_vllm_x_list = np.arange(min(vllm_throughputs), max(vllm_throughputs), 0.01) - vllm_vllm_model = np.polyfit(vllm_throughputs, vllm_latencies, 3) - vllm_model_fn = np.poly1d(vllm_vllm_model) - plt.plot(fit_vllm_x_list, vllm_model_fn(fit_vllm_x_list), color="orange", alpha=0.5, linestyle="--") - - plt.scatter(mii_throughputs, mii_latencies, label=f"DeepSpeed FastGen", marker="o", color="blue") - fit_mii_x_list = np.arange(min(mii_throughputs), max(mii_throughputs), 0.01) - mii_fit_model = np.polyfit(mii_throughputs, mii_latencies, 3) - mii_model_fn = np.poly1d(mii_fit_model) - plt.plot(fit_mii_x_list, mii_model_fn(fit_mii_x_list), color="blue", alpha=0.5, linestyle="--") - - plt.title(f'Model Llama 2 {model_size.upper()}, Prompt: {prompt}, Generation: {gen}, TP: {tp}') - plt.xlabel('Throughput (queries/s)', fontsize=14) - plt.ylabel('Latency', fontsize=14) - plt.legend() - plt.grid(True) - plt.tight_layout() - out_file = out_dir / f"th_lat_curve_llama{model_size}_tp{tp}_p{prompt}g{gen}.png" - print(f"Saving 
{out_file}") - plt.savefig(out_file) - - -if __name__ == "__main__": - args = get_args() - if args.test: - tp_sizes = tp_sizes_test - prompt_gen_pairs = prompt_gen_pairs_test - else: - tp_sizes = tp_sizes_all - prompt_gen_pairs = prompt_gen_pairs_test_all - - for model_size, tps in tp_sizes.items(): - for tp in tps: - for prompt, gen in prompt_gen_pairs: - output_charts(model_size, tp, bs, prompt, gen, args.log_dir, args.out_dir) - diff --git a/benchmarks/inference/mii/requirements.txt b/benchmarks/inference/mii/requirements.txt new file mode 100644 index 000000000..7ac014ef8 --- /dev/null +++ b/benchmarks/inference/mii/requirements.txt @@ -0,0 +1,5 @@ +transformers +matplotlib +deepspeed-mii>=0.2.0 +vllm>=0.2.7 +numpy \ No newline at end of file diff --git a/benchmarks/inference/mii/run_all.sh b/benchmarks/inference/mii/run_all.sh index ca504a6c9..095b3ae12 100644 --- a/benchmarks/inference/mii/run_all.sh +++ b/benchmarks/inference/mii/run_all.sh @@ -1,25 +1,15 @@ -RAGGED_BATCH_SIZE=768 -PARAM_SIZES=(7b 13b 70b) +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 -declare -A TP_SIZES -TP_SIZES["7b"]="1" -TP_SIZES["13b"]="1:2:4" -TP_SIZES["70b"]="4:8" +# DeepSpeed Team -for PARAM_SIZE in ${PARAM_SIZES[@]}; do - - IFS=':' read -ra TP_VALUES <<< ${TP_SIZES[${PARAM_SIZE}]} - for TP in ${TP_VALUES[@]}; do - DEPLOYMENT_NAME=llama2-${PARAM_SIZE}-tp${TP}-b${RAGGED_BATCH_SIZE} - python server.py --model_name meta-llama/Llama-2-${PARAM_SIZE}-hf -d ${DEPLOYMENT_NAME} -m ${TP} -b ${RAGGED_BATCH_SIZE} start +MODELS=(meta-llama/Llama-2-7b-hf meta-llama/Llama-2-13b-hf meta-llama/Llama-2-70b-hf tiiuae/falcon-40B tiiuae/falcon-180B microsoft/phi-2 mistralai/Mixtral-8x7B-v0.1) - DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=60 bash ./run_benchmark_client.sh - DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=128 bash ./run_benchmark_client.sh - DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=1200 MAX_NEW_TOKENS=60 bash ./run_benchmark_client.sh - DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=1200 MAX_NEW_TOKENS=128 bash ./run_benchmark_client.sh - - echo "Stopping server" - python server.py -d ${DEPLOYMENT_NAME} stop - sleep 120 - done +for MODEL in ${MODELS[@]}; do + python ./run_benchmark.py --model ${MODEL} --stream + python ./run_benchmark.py --model ${MODEL} --stream --vllm done + +# Extra runs for Mixtral with non-default settings +python ./run_benchmark.py --model mistralai/Mixtral-8x7B-v0.1 --stream --tp_size 4 --mean_prompt_length 500 --mean_max_new_tokens 150 500 1024 +python ./run_benchmark.py --model mistralai/Mixtral-8x7B-v0.1 --stream --tp_size 4 --mean_prompt_length 500 --mean_max_new_tokens 150 500 1024 --vllm \ No newline at end of file diff --git a/benchmarks/inference/mii/run_all_replica.sh b/benchmarks/inference/mii/run_all_replica.sh deleted file mode 100644 index b3fba0408..000000000 --- a/benchmarks/inference/mii/run_all_replica.sh +++ /dev/null @@ -1,25 +0,0 @@ -RAGGED_BATCH_SIZE=768 -PARAM_SIZES=(7b) -REPLICA_NUMS=(1) - -declare -A TP_SIZES -TP_SIZES["7b"]="4" -TP_SIZES["13b"]="1" -TP_SIZES["70b"]="4" - -for PARAM_SIZE in ${PARAM_SIZES[@]}; do - IFS=':' read -ra TP_VALUES <<< ${TP_SIZES[${PARAM_SIZE}]} - for TP in ${TP_VALUES[@]}; do - for REPL in ${REPLICA_NUMS[@]}; do - DEPLOYMENT_NAME=llama2-${PARAM_SIZE}-tp${TP}-b${RAGGED_BATCH_SIZE}_repl${REPL} - python server.py --model_name meta-llama/Llama-2-${PARAM_SIZE}-hf -d ${DEPLOYMENT_NAME} -m ${TP} -r ${REPL} -b ${RAGGED_BATCH_SIZE} start - - REQUEST_NUM=$((256 
* ${REPL})) - DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=60 CLIENT_NUMS=$((16 * ${REPL})) REQUEST_NUM=$((256 * ${REPL})) bash ./run_bench_client_num.sh - - echo "Stopping server" - python server.py -d ${DEPLOYMENT_NAME} stop - sleep 120 - done - done -done diff --git a/benchmarks/inference/mii/run_all_vllm.sh b/benchmarks/inference/mii/run_all_vllm.sh deleted file mode 100644 index 572377f13..000000000 --- a/benchmarks/inference/mii/run_all_vllm.sh +++ /dev/null @@ -1,26 +0,0 @@ -RAGGED_BATCH_SIZE=768 -PARAM_SIZES=(7b 13b 70b) - -declare -A TP_SIZES -TP_SIZES["7b"]="1" -TP_SIZES["13b"]="1:2:4" -TP_SIZES["70b"]="4:8" - -for PARAM_SIZE in ${PARAM_SIZES[@]}; do - - IFS=':' read -ra TP_VALUES <<< ${TP_SIZES[${PARAM_SIZE}]} - for TP in ${TP_VALUES[@]}; do - DEPLOYMENT_NAME=vllm-llama2-${PARAM_SIZE}-tp${TP} - python -m vllm.entrypoints.api_server --host 127.0.0.1 --port 26500 --tensor-parallel-size ${TP} --model meta-llama/Llama-2-${PARAM_SIZE}-hf & - sleep 60 - - DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=60 VLLM="--vllm" bash ./run_benchmark_client.sh - DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=128 VLLM="--vllm" bash ./run_benchmark_client.sh - DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=1200 MAX_NEW_TOKENS=60 VLLM="--vllm" bash ./run_benchmark_client.sh - DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=1200 MAX_NEW_TOKENS=128 VLLM="--vllm" bash ./run_benchmark_client.sh - - echo "Stopping server" - pkill -u ${USER} -f vllm.entrypoints.api_server - sleep 30 - done -done diff --git a/benchmarks/inference/mii/run_benchmark.py b/benchmarks/inference/mii/run_benchmark.py new file mode 100644 index 000000000..96e88155f --- /dev/null +++ b/benchmarks/inference/mii/run_benchmark.py @@ -0,0 +1,40 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from src.client import run_client +from src.server import start_server, stop_server +from src.utils import ( + get_args_product, + parse_args, + print_summary, + results_exist, + save_json_results, + CLIENT_PARAMS, + SERVER_PARAMS, +) + + +def run_benchmark() -> None: + args = parse_args(server_args=True, client_args=True) + + for server_args in get_args_product(args, which=SERVER_PARAMS): + start_server(server_args) + + for client_args in get_args_product(server_args, which=CLIENT_PARAMS): + if results_exist(client_args) and not args.overwrite_results: + print( + f"Found existing results and skipping current setting. 
To ignore existing results, use --overwrite_results" + ) + continue + + response_details = run_client(client_args) + print_summary(client_args, response_details) + save_json_results(client_args, response_details) + + stop_server(server_args) + + +if __name__ == "__main__": + run_benchmark() diff --git a/benchmarks/inference/mii/run_benchmark_client.sh b/benchmarks/inference/mii/run_benchmark_client.sh deleted file mode 100644 index 318e9092e..000000000 --- a/benchmarks/inference/mii/run_benchmark_client.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -DEPLOYMENT_NAME=${DEPLOYMENT_NAME:-llama2-7b} -VLLM=${VLLM:-""} - -CLIENT_NUMS=${CLIENT_NUMS:-1 2 4 6 8 12 16 20 24 28 32} -MAX_NEW_TOKENS=${MAX_NEW_TOKENS:-60} -PROMPT_LENGTH=${PROMPT_LENGTH:-3072} -REQUEST_NUM=${REQUEST_NUM:-512} - -LOG_DIR=logs.${DEPLOYMENT_NAME} -mkdir -p ${LOG_DIR} - -for client_num in ${CLIENT_NUMS[@]}; do - RESULT_FILE=${DEPLOYMENT_NAME}_c${client_num}_p${PROMPT_LENGTH}_g${MAX_NEW_TOKENS}.json - - python run_benchmark_client.py -w 1 \ - -d ${DEPLOYMENT_NAME} -n ${REQUEST_NUM} -c ${client_num} \ - -k ${MAX_NEW_TOKENS} -l ${PROMPT_LENGTH} \ - -o ${LOG_DIR}/${RESULT_FILE} \ - ${VLLM} --stream \ - 2>&1 | tee ${LOG_DIR}/bench_client_num_c${client_num}_p${PROMPT_LENGTH}_g${MAX_NEW_TOKENS}.log -done diff --git a/benchmarks/inference/mii/run_example.sh b/benchmarks/inference/mii/run_example.sh index ece8393ed..e80253828 100644 --- a/benchmarks/inference/mii/run_example.sh +++ b/benchmarks/inference/mii/run_example.sh @@ -1,19 +1,19 @@ -### Run the server -RAGGED_BATCH_SIZE=768 -PARAM_SIZES=(7b) -DEPLOYMENT_NAME=llama2-7b-tp1-b768 -python server.py --model_name meta-llama/Llama-2-7b-hf -d llama2-7b-tp1-b768 -m 1 -b 768 start +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 -### This command will run the client with 60 generation steps and input prompt length of 2600 -DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=60 bash ./run_benchmark_client.sh +# DeepSpeed Team -### Stop the server -echo "Stopping server" -python server.py -d ${DEPLOYMENT_NAME} stop -sleep 120 +# Run benchmark +python ./run_benchmark.py \ + --model meta-llama/Llama-2-7b-hf \ + --tp_size 1 \ + --num_replicas 1 \ + --max_ragged_batch_size 768 \ + --mean_prompt_length 2600 \ + --mean_max_new_tokens 60 \ + --stream ### Gernerate the plots -python plot_th_lat.py --log_dir . --test --no_vllm -python plot_effective_throughput.py --log_dir . --test --no_vllm +python ./src/plot_th_lat.py -echo "Find the plots in the charts directory and the logs inside logs.llama2-7b-tp1-b768" +echo "Find figures in ./plots/ and log outputs in ./results/" \ No newline at end of file diff --git a/benchmarks/inference/mii/server.py b/benchmarks/inference/mii/server.py deleted file mode 100644 index 2e6164187..000000000 --- a/benchmarks/inference/mii/server.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) Microsoft Corporation. 
-# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team -import mii -import argparse - -from mii.constants import DeploymentType - -from deepspeed.inference import RaggedInferenceEngineConfig, DeepSpeedTPConfig -from deepspeed.inference.v2.ragged import DSStateManagerConfig - -def start_server(model_name, - deployment_name, - task, - tensor_parallel, - replica_num, - max_ragged_batch_size): - tp_config = DeepSpeedTPConfig(tp_size=tensor_parallel) - mgr_config = DSStateManagerConfig(max_ragged_batch_size=max_ragged_batch_size, max_ragged_sequence_count=max_ragged_batch_size) - inference_config = RaggedInferenceEngineConfig(tensor_parallel=tp_config, - state_manager=mgr_config) - - mii.serve( - model_name, - deployment_name=deployment_name, - tensor_parallel=tensor_parallel, - task=task, - inference_engine_config=inference_config, - replica_num=replica_num - ) - -def stop_server(deployment_name): - mii.client(deployment_name).terminate_server() - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--model_name", - type=str, - default="meta-llama/Llama-2-7b-hf", - help="Name of the model in the model_files to benchmark") - parser.add_argument("-d", - "--deployment_name", - type=str, - default="benchmark_deployment") - parser.add_argument("-t", "--task", type=str, - help="Task type. Currently only text-generation is supported", - default="text-generation") - parser.add_argument("-m", - "--tensor_parallel", - type=int, - help="Degree of tensor (model) parallelism", - default=1) - parser.add_argument("-b", - "--ragged_batch_size", - type=int, - help="Max batch size for ragged batching", - default=768) - parser.add_argument("-r", - "--replica_num", - type=int, - help="Number of replicas for load balancing", - default=1) - parser.add_argument("cmd", help="start, stop, or restart") - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - - if args.cmd == "start": - start_server(args.model_name, - args.deployment_name, - args.task, - args.tensor_parallel, - args.replica_num, - args.ragged_batch_size) - elif args.cmd == "stop": - print("running stop") - stop_server(args.deployment_name) - else: - raise ValueError(f"Unknown command: {args.cmd}") diff --git a/benchmarks/inference/mii/src/__init__.py b/benchmarks/inference/mii/src/__init__.py new file mode 100644 index 000000000..208299fb8 --- /dev/null +++ b/benchmarks/inference/mii/src/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team diff --git a/benchmarks/inference/mii/run_benchmark_client.py b/benchmarks/inference/mii/src/client.py similarity index 51% rename from benchmarks/inference/mii/run_benchmark_client.py rename to benchmarks/inference/mii/src/client.py index caf20351e..c440d0b63 100644 --- a/benchmarks/inference/mii/run_benchmark_client.py +++ b/benchmarks/inference/mii/src/client.py @@ -1,70 +1,26 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import asyncio +import json +import multiprocessing import os -import time -import random -import argparse import queue -import multiprocessing +import random +import requests import threading -from statistics import mean -from dataclasses import dataclass, asdict +import time from typing import List, Iterable -from pathlib import Path -from datetime import datetime -import numpy as np +import numpy as np from transformers import AutoTokenizer -from random_query_generator import RandomQueryGenerator -from sample_input import all_text -import time -import json -import asyncio -import requests -from postprocess_results import get_summary, ResponseDetails - -MAX_PROMPT_LENGTH = 4000 -PROMPT_LENGTH_VAR = 0.3 -MAX_NEW_TOKENS_VAR = 0.3 - -def parse_args(): - parser = argparse.ArgumentParser(description="Benchmark MII services") - parser.add_argument("-k", - "--max_new_tokens", - type=int, - default=60, - help="min and max num tokens argument for huggingface") - parser.add_argument("-d", - "--deployment_name", - type=str, - default="benchmark_deployment") - parser.add_argument("-n", - "--num_queries", - type=int, - help="number of queries to run", - default=10) - parser.add_argument("-w", - "--warmup", - type=int, - help="number of queries for warming up", - default=1) - parser.add_argument("-c", - "--client_num", - type=int, - help="number of parallel client processes", - default=2) - parser.add_argument("-l", - "--prompt_length", - type=int, - default=2600) - parser.add_argument('--use_thread', action='store_true', - help='use thread to run parallel clients, otherwise use multiprocessing', - default=False) - parser.add_argument('--stream', action='store_true', default=True) - parser.add_argument('--vllm', action='store_true', default=False) - parser.add_argument('-o', '--out_json_path', type=Path, default=None) - - args = parser.parse_args() - return args +from .postprocess_results import ResponseDetails +from .random_query_generator import RandomQueryGenerator +from .sample_input import all_text +from .utils import parse_args, print_summary, get_args_product, CLIENT_PARAMS def call_mii(client, input_tokens, max_new_tokens, stream): @@ -85,11 +41,10 @@ def callback(response): if stream: output_tokens = [] client.generate( - input_tokens, max_new_tokens=max_new_tokens, - streaming_fn=callback) + input_tokens, max_new_tokens=max_new_tokens, streaming_fn=callback + ) else: - result = client.generate( - input_tokens, max_new_tokens=max_new_tokens) + result = client.generate(input_tokens, max_new_tokens=max_new_tokens) output_tokens = result[0].generated_text return ResponseDetails( @@ -98,7 +53,8 @@ def callback(response): start_time=start_time, end_time=time.time(), model_time=0, - token_gen_time=token_gen_time) + token_gen_time=token_gen_time, + ) def call_vllm(input_tokens, max_new_tokens, stream=True): @@ -114,15 +70,19 @@ def call_vllm(input_tokens, max_new_tokens, stream=True): "ignore_eos": False, "stream": stream, } + def clear_line(n: int = 1) -> None: - LINE_UP = '\033[1A' - LINE_CLEAR = '\x1b[2K' + LINE_UP = "\033[1A" + LINE_CLEAR = "\x1b[2K" for _ in range(n): print(LINE_UP, end=LINE_CLEAR, flush=True) - def get_streaming_response(response: requests.Response, time_last_token) -> Iterable[List[str]]: - for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, - delimiter=b"\0"): + def get_streaming_response( + response: requests.Response, time_last_token + ) -> Iterable[List[str]]: + for chunk in 
response.iter_lines( + chunk_size=8192, decode_unicode=False, delimiter=b"\0" + ): if chunk: data = json.loads(chunk.decode("utf-8")) output = data["text"][0] @@ -149,13 +109,23 @@ def get_response(response: requests.Response) -> List[str]: start_time=start_time, end_time=time.time(), model_time=0, - token_gen_time=token_gen_time) + token_gen_time=token_gen_time, + ) else: output = get_response(response) raise NotImplementedError("Not implemented for non-streaming") -def _run_parallel(deployment_name, warmup, barrier, query_queue, result_queue, client_num, stream, vllm): +def _run_parallel( + deployment_name, + warmup, + barrier, + query_queue, + result_queue, + num_clients, + stream, + vllm, +): pid = os.getpid() session_id = f"test_session_p{pid}_t{threading.get_ident()}" @@ -163,6 +133,7 @@ def _run_parallel(deployment_name, warmup, barrier, query_queue, result_queue, c asyncio.set_event_loop(event_loop) if not vllm: import mii + client = mii.client(deployment_name) barrier.wait() @@ -178,7 +149,7 @@ def _run_parallel(deployment_name, warmup, barrier, query_queue, result_queue, c barrier.wait() - time.sleep(random.uniform(0, client_num) * 0.01) + time.sleep(random.uniform(0, num_clients) * 0.01) try: while not query_queue.empty(): print(f"queue size: {query_queue.qsize()} ({pid})", flush=True) @@ -197,18 +168,33 @@ def _run_parallel(deployment_name, warmup, barrier, query_queue, result_queue, c print(f"Worker ({pid}) finished. session_id: {session_id}") -def run_client(client_num, deployment_name, prompt_length, max_new_tokens, num_queries, warmup, stream, vllm, use_thread=False): +def run_client(args): """ Run MII client for benchmarking. The scenario is a bit complicated: - 1. The main process puts `num_queries` queries into the input queue + 1. The main process puts `num_requests` queries into the input queue 2. Each client runs `warmup` iterations () taking the queries from the input queue 3. --- barrier --- 4. The main process marks the start time - 5a. All clients send `num_queries' query in total and put the results into the result queue + 5a. All clients send `num_requests' query in total and put the results into the result queue 5b. The main process takes the results from the result queue (in parallel with 5a) - 6. The main process marks the end time after receiving `num_queries' results + 6. 
The main process marks the end time after receiving `num_requests' results """ + # Unpack arguments + model = args.model + deployment_name = args.deployment_name + mean_prompt_length = args.mean_prompt_length + mean_max_new_tokens = args.mean_max_new_tokens + num_clients = args.num_clients + num_requests = args.num_requests + warmup = args.warmup + max_prompt_length = args.max_prompt_length + prompt_length_var = args.prompt_length_var + max_new_tokens_var = args.max_new_tokens_var + stream = args.stream + vllm = args.vllm + use_thread = args.use_thread + if use_thread: runnable_cls = threading.Thread barrier_cls = threading.Barrier @@ -218,23 +204,44 @@ def run_client(client_num, deployment_name, prompt_length, max_new_tokens, num_q barrier_cls = multiprocessing.Barrier queue_cls = multiprocessing.Queue - barrier = barrier_cls(client_num + 1) + barrier = barrier_cls(num_clients + 1) query_queue = queue_cls() result_queue = queue_cls() - processes = [runnable_cls(target=_run_parallel, - args=(deployment_name, warmup, barrier, query_queue, result_queue, client_num, stream, vllm)) - for i in range(client_num)] + processes = [ + runnable_cls( + target=_run_parallel, + args=( + deployment_name, + warmup, + barrier, + query_queue, + result_queue, + num_clients, + stream, + vllm, + ), + ) + for i in range(num_clients) + ] for p in processes: p.start() - tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") + tokenizer = AutoTokenizer.from_pretrained(model) query_generator = RandomQueryGenerator(all_text, tokenizer, seed=42) - MAX_PROMPT_LENGTH = 4000 - request_text = query_generator.get_random_request_text(prompt_length, prompt_length*PROMPT_LENGTH_VAR, MAX_PROMPT_LENGTH, num_queries + warmup*client_num) + request_text = query_generator.get_random_request_text( + mean_prompt_length, + mean_prompt_length * prompt_length_var, + max_prompt_length, + num_requests + warmup * num_clients, + ) for t in request_text: - req_max_new_tokens = int(np.random.normal(max_new_tokens, MAX_NEW_TOKENS_VAR*max_new_tokens)) + req_max_new_tokens = int( + np.random.normal( + mean_max_new_tokens, max_new_tokens_var * mean_max_new_tokens + ) + ) query_queue.put((t, req_max_new_tokens)) # Tokenizers must be initialized after fork. 
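The docstring above is the core of the benchmark client: worker processes synchronize on a barrier, drain a shared query queue, and push per-request timings into a result queue that the main process collects. A stripped-down sketch of that coordination (illustrative only; `time.sleep` stands in for the real MII/vLLM call, and the names here are not the benchmark's own):

```python
import multiprocessing
import queue
import time


def worker(barrier, query_queue, result_queue):
    # (2) per-client warmup would go here; then wait until every client is ready
    barrier.wait()
    while True:
        try:
            prompt = query_queue.get(timeout=1.0)
        except queue.Empty:
            break  # queue drained, this client is done
        start = time.time()
        time.sleep(0.01)  # placeholder for client.generate(...) / HTTP request
        result_queue.put((prompt, time.time() - start))


if __name__ == "__main__":
    num_clients, num_requests = 4, 32
    barrier = multiprocessing.Barrier(num_clients + 1)
    query_queue = multiprocessing.Queue()
    result_queue = multiprocessing.Queue()

    workers = [
        multiprocessing.Process(target=worker, args=(barrier, query_queue, result_queue))
        for _ in range(num_clients)
    ]
    for p in workers:
        p.start()

    for i in range(num_requests):  # (1) enqueue every request up front
        query_queue.put(f"prompt {i}")

    barrier.wait()  # (3)/(4) release all clients at once and start the clock
    start = time.time()
    results = [result_queue.get() for _ in range(num_requests)]  # (5b)/(6)
    print(f"{len(results)} requests finished in {time.time() - start:.2f}s")

    for p in workers:
        p.join()
```

The same structure is why the real client can swap `multiprocessing` for `threading` via `--use_thread`: only the Barrier, Queue, and worker classes change, not the coordination logic.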
@@ -245,41 +252,21 @@ def run_client(client_num, deployment_name, prompt_length, max_new_tokens, num_q barrier.wait() response_details = [] - while len(response_details) < num_queries: + while len(response_details) < num_requests: res = result_queue.get() # vLLM returns concatinated tokens if vllm: all_tokens = tokenizer.tokenize(res.generated_tokens) - res.generated_tokens = all_tokens[len(tokenizer.tokenize(res.prompt)):] + res.generated_tokens = all_tokens[len(tokenizer.tokenize(res.prompt)) :] response_details.append(res) return response_details + if __name__ == "__main__": - args = parse_args() - print(args) - - if args.out_json_path is not None and not args.out_json_path.parent.exists(): - raise ValueError(f"Parent directory of {args.out_json_path}") - - response_details = run_client(args.client_num, args.deployment_name, - args.prompt_length, - args.max_new_tokens, args.num_queries, args.warmup, - args.stream, args.vllm, args.use_thread) - - args_dict = vars(args) - ps = get_summary(args_dict, response_details) - print(f"Deployment: {args.deployment_name} Clients: {args.client_num}, " - + f"Prompt (mean): {args.prompt_length} tokens, " - + f"Generation (mean): {args.max_new_tokens} tokens, " - + f"Query throughput: {ps.throughput:.3f} queries/s, " - + f"Token throughput (total): {ps.tokens_per_sec:.3f} tokens/s, " - + f"Query latency: {ps.latency:.3f} s, " - + f"Token generation latency: {ps.token_gen_latency:.3f} s/token, " - + f"First token received: {ps.first_token_latency:.3f} s") - - if args.out_json_path is not None: - with open(args.out_json_path, "w") as f: - args_dict["out_json_path"] = str(args.out_json_path) # Path is not JSON serializable - data = {"args": args_dict, "time": str(datetime.now()), "response_details": [asdict(r) for r in response_details]} - json.dump(data, f, indent=2) + args = parse_args(client_args=True) + + for client_args in get_args_product(args, which=CLIENT_PARAMS): + response_details = run_client(client_args) + + print_summary(client_args, response_details) diff --git a/benchmarks/inference/mii/src/defaults.py b/benchmarks/inference/mii/src/defaults.py new file mode 100644 index 000000000..79ce91c97 --- /dev/null +++ b/benchmarks/inference/mii/src/defaults.py @@ -0,0 +1,58 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +ARG_DEFAULTS = { + "tp_size": 1, + "max_ragged_batch_size": 768, + "num_replicas": 1, + "max_prompt_length": 4000, + "mean_prompt_length": 2600, + "mean_max_new_tokens": 60, +} + +MODEL_DEFAULTS = { + "meta-llama/Llama-2-7b-hf": { + "max_prompt_length": 4000, + "mean_prompt_length": (1200, 2600), + "mean_max_new_tokens": (60, 128), + "tp_size": 1, + }, + "meta-llama/Llama-13b-hf": { + "max_prompt_length": 4000, + "mean_prompt_length": (1200, 2600), + "mean_max_new_tokens": (60, 128), + "tp_size": (1, 2, 4), + }, + "meta-llama/Llama-2-70b-hf": { + "max_prompt_length": 4000, + "mean_prompt_length": (1200, 2600), + "mean_max_new_tokens": (60, 128), + "tp_size": (4, 8), + }, + "tiiuae/falcon-40B": { + "max_prompt_length": 2000, + "mean_prompt_length": (1200, 1900), + "mean_max_new_tokens": (60, 128), + "tp_size": (2, 4), + }, + "tiiuae/falcon-180B": { + "max_prompt_length": 2000, + "mean_prompt_length": (1200, 1900), + "mean_max_new_tokens": (60, 128), + "tp_size": 8, + }, + "microsoft/phi-2": { + "max_prompt_length": 2000, + "mean_prompt_length": (1200, 1900), + "mean_max_new_tokens": (60, 128), + "tp_size": 1, + }, + "mistralai/Mixtral-8x7B-v0.1": { + "max_prompt_length": 4000, + "mean_prompt_length": (1200, 2600), + "mean_max_new_tokens": (60, 128), + "tp_size": 4, + }, +} diff --git a/benchmarks/inference/mii/plot_effective_throughput.py b/benchmarks/inference/mii/src/plot_effective_throughput.py similarity index 53% rename from benchmarks/inference/mii/plot_effective_throughput.py rename to benchmarks/inference/mii/src/plot_effective_throughput.py index 350c269c3..efa471c76 100644 --- a/benchmarks/inference/mii/plot_effective_throughput.py +++ b/benchmarks/inference/mii/src/plot_effective_throughput.py @@ -1,3 +1,8 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import argparse from pathlib import Path import glob @@ -5,21 +10,16 @@ import numpy as np import pandas as pd -from postprocess_results import read_json, get_tokenizer +from .postprocess_results import read_json, get_tokenizer RAGGED_BATCH_SIZE = 768 SLA_PROMPT_TOKENS_PER_SEC = 512 SLA_GEN_TOKENS_PER_SEC = [1, 2, 3, 4, 6, 8] EMA_SPAN = 16 -tp_sizes_all = { - "7b": [1], - "70b": [4, 8] -} +tp_sizes_all = {"7b": [1], "70b": [4, 8]} -tp_sizes_test = { - "7b": [1] -} +tp_sizes_test = {"7b": [1]} prompt_gen_pairs_all = [ (1200, 60), @@ -28,9 +28,8 @@ (2600, 128), ] -prompt_gen_pairs_test = [ - (2600, 60) -] +prompt_gen_pairs_test = [(2600, 60)] + def get_args(): parser = argparse.ArgumentParser() @@ -43,23 +42,54 @@ def get_args(): def check_token_latency_step(response_details, token_index): - P50_token_latency = np.percentile([r.token_gen_time[token_index] for r in response_details if len(r.token_gen_time) > token_index], 50) - P90_token_latency = np.percentile([r.token_gen_time[token_index] for r in response_details if len(r.token_gen_time) > token_index], 90) - P99_token_latency = np.percentile([r.token_gen_time[token_index] for r in response_details if len(r.token_gen_time) > token_index], 99) + P50_token_latency = np.percentile( + [ + r.token_gen_time[token_index] + for r in response_details + if len(r.token_gen_time) > token_index + ], + 50, + ) + P90_token_latency = np.percentile( + [ + r.token_gen_time[token_index] + for r in response_details + if len(r.token_gen_time) > token_index + ], + 90, + ) + P99_token_latency = np.percentile( + [ + r.token_gen_time[token_index] + for r in response_details + if len(r.token_gen_time) > token_index + ], + 99, + ) return P50_token_latency, P90_token_latency, P99_token_latency def validate_token_cum_latency_SLA(response_detail, sla_token_gen): cumsum_latencies = np.cumsum(np.array(response_detail.token_gen_time[1:])) - return all([cumsum_latencies[i] <= (1 / sla_token_gen) * (i + 1) for i in range(len(cumsum_latencies))]) + return all( + [ + cumsum_latencies[i] <= (1 / sla_token_gen) * (i + 1) + for i in range(len(cumsum_latencies)) + ] + ) def validate_token_ema_latency_SLA(response_detail, sla_token_gen, ema_span): - ema_latency = pd.Series(response_detail.token_gen_time[1:]).ewm(span=ema_span).mean().values.tolist() - return all([t < 1. 
/ sla_token_gen for t in ema_latency]) + ema_latency = ( + pd.Series(response_detail.token_gen_time[1:]) + .ewm(span=ema_span) + .mean() + .values.tolist() + ) + return all([t < 1.0 / sla_token_gen for t in ema_latency]) + - def validate_prompt_latency_SLA(response_detail, sla_token_gen, f): tokenizer = get_tokenizer() prompt_length = len(tokenizer.tokenize(response_detail.prompt)) @@ -71,14 +101,14 @@ def validate_prompt_latency_SLA(response_detail, sla_token_gen, f): return True return f[0](response_detail, sla_token_gen, *f[1]) - + def calc_throughput(response_details): start_time = min([r.start_time for r in response_details]) end_time = max([r.end_time for r in response_details]) return len(response_details) / (end_time - start_time) - + def extract_values(file_pattern, sla_token_gen, validate_func): files = glob.glob(file_pattern) print(f"Found {len(files)} files") @@ -87,8 +117,16 @@ def extract_values(file_pattern, sla_token_gen, validate_func): for f in files: prof_args, response_details = read_json(f) client_num = prof_args["client_num"] - num_req_ok = len([r for r in response_details if validate_prompt_latency_SLA(r, sla_token_gen, validate_func)]) - goodputs[client_num] = calc_throughput(response_details) * (num_req_ok / len(response_details)) + num_req_ok = len( + [ + r + for r in response_details + if validate_prompt_latency_SLA(r, sla_token_gen, validate_func) + ] + ) + goodputs[client_num] = calc_throughput(response_details) * ( + num_req_ok / len(response_details) + ) good_ratios[client_num] = num_req_ok / len(response_details) return goodputs, good_ratios @@ -98,11 +136,13 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out if not log_dir.exists(): print(f"Log directory {log_dir} does not exist") return - + if not out_dir.exists(): out_dir.mkdir(parents=True, exist_ok=True) - - print(f"model: {model_size} Prompt: {prompt}, Generation: {gen}, TP: {tp} sla_token_gen: {sla_token_gen}") + + print( + f"model: {model_size} Prompt: {prompt}, Generation: {gen}, TP: {tp} sla_token_gen: {sla_token_gen}" + ) mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}/llama2-{model_size}-tp{tp}-b{bs}_c*_p{prompt}_g{gen}.json" if not args.no_vllm: @@ -110,55 +150,91 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out validate_funcs = [ (validate_token_cum_latency_SLA, (), "cum"), - (validate_token_ema_latency_SLA, (EMA_SPAN, ), f"ema{EMA_SPAN}"), + (validate_token_ema_latency_SLA, (EMA_SPAN,), f"ema{EMA_SPAN}"), ] for f in validate_funcs: - - mii_goodputs, mii_good_ratios = extract_values(mii_file_pattern, sla_token_gen, f) + + mii_goodputs, mii_good_ratios = extract_values( + mii_file_pattern, sla_token_gen, f + ) client_num_list = sorted(list(mii_goodputs.keys())) mii_goodputs_list = [mii_goodputs[client_num] for client_num in client_num_list] if not args.no_vllm: - vllm_goodputs, vllm_good_ratios = extract_values(vllm_file_pattern, sla_token_gen, f) - vllm_goodputs_list = [vllm_goodputs[client_num] for client_num in client_num_list] + vllm_goodputs, vllm_good_ratios = extract_values( + vllm_file_pattern, sla_token_gen, f + ) + vllm_goodputs_list = [ + vllm_goodputs[client_num] for client_num in client_num_list + ] # print(f"MII {mii_goodputs_list} ratio={mii_good_ratios}") # print(f"vLLM {vllm_goodputs_list} ratio={vllm_good_ratios}") # Plotting the scatter plot plt.figure(figsize=(7, 4)) - plt.scatter(client_num_list, mii_goodputs_list, label=f"DeepSpeed-FastGen", marker="o", color="blue") + plt.scatter( + 
client_num_list, + mii_goodputs_list, + label=f"DeepSpeed-FastGen", + marker="o", + color="blue", + ) if not args.no_vllm: - plt.scatter(client_num_list, vllm_goodputs_list, label=f"vLLM", marker="x", color="orange") + plt.scatter( + client_num_list, + vllm_goodputs_list, + label=f"vLLM", + marker="x", + color="orange", + ) fit_x_list = np.arange(min(client_num_list), max(client_num_list), 0.1) mii_fit_model = np.polyfit(client_num_list, mii_goodputs_list, 4) mii_model_fn = np.poly1d(mii_fit_model) - plt.plot(fit_x_list, mii_model_fn(fit_x_list), color="blue", alpha=0.5, linestyle="--") + plt.plot( + fit_x_list, + mii_model_fn(fit_x_list), + color="blue", + alpha=0.5, + linestyle="--", + ) if not args.no_vllm: vllm_fit_model = np.polyfit(client_num_list, vllm_goodputs_list, 4) vllm_model_fn = np.poly1d(vllm_fit_model) - plt.plot(fit_x_list, vllm_model_fn(fit_x_list), color="orange", alpha=0.5, linestyle="--") - - title = f"Effective throughput (SLA prompt: {SLA_PROMPT_TOKENS_PER_SEC} tokens/s, generation: {sla_token_gen} tokens/s)\n" \ - + f'Llama 2 {model_size.upper()} Prompt: {prompt}, Generation: {gen}, TP: {tp}' + plt.plot( + fit_x_list, + vllm_model_fn(fit_x_list), + color="orange", + alpha=0.5, + linestyle="--", + ) + + title = ( + f"Effective throughput (SLA prompt: {SLA_PROMPT_TOKENS_PER_SEC} tokens/s, generation: {sla_token_gen} tokens/s)\n" + + f"Llama 2 {model_size.upper()} Prompt: {prompt}, Generation: {gen}, TP: {tp}" + ) plt.title(title, fontsize=10) - plt.xlabel('Number of clients', fontsize=10) - plt.ylabel('Effective throughput (queries/s)', fontsize=10) + plt.xlabel("Number of clients", fontsize=10) + plt.ylabel("Effective throughput (queries/s)", fontsize=10) # plt.rcParams['figure.subplot.bottom'] = 0.30 plt.ylim(bottom=-0.05) plt.legend() plt.grid(True) # plt.show() - out_file = out_dir / f"goodput_llama{model_size}_SLAp{SLA_PROMPT_TOKENS_PER_SEC}g{sla_token_gen}_tp{tp}_b{bs}_p{prompt}g{gen}_{f[2]}.png" + out_file = ( + out_dir + / f"goodput_llama{model_size}_SLAp{SLA_PROMPT_TOKENS_PER_SEC}g{sla_token_gen}_tp{tp}_b{bs}_p{prompt}g{gen}_{f[2]}.png" + ) plt.savefig(out_file) plt.clf() print(f"Saved {out_file}") - + if __name__ == "__main__": + raise NotImplementedError("This script is not up to date") args = get_args() if args.test: @@ -172,5 +248,13 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out for tp in tps: for prompt, gen in prompt_gen_pairs: for sla_token_gen in SLA_GEN_TOKENS_PER_SEC: - display_results(model_size, tp, RAGGED_BATCH_SIZE, sla_token_gen, prompt, gen, args.log_dir, args.out_dir) - + display_results( + model_size, + tp, + RAGGED_BATCH_SIZE, + sla_token_gen, + prompt, + gen, + args.log_dir, + args.out_dir, + ) diff --git a/benchmarks/inference/mii/plot_latency_percentile.py b/benchmarks/inference/mii/src/plot_latency_percentile.py similarity index 72% rename from benchmarks/inference/mii/plot_latency_percentile.py rename to benchmarks/inference/mii/src/plot_latency_percentile.py index c91c78bf1..9b08f12da 100644 --- a/benchmarks/inference/mii/plot_latency_percentile.py +++ b/benchmarks/inference/mii/src/plot_latency_percentile.py @@ -1,3 +1,8 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import argparse import glob from pathlib import Path @@ -5,12 +10,12 @@ import numpy as np import itertools -from postprocess_results import read_json, get_token_latency +from .postprocess_results import read_json, get_token_latency bs = 768 SKIP_HEAD_TOKEN_NUM = 2 SKIP_REQUEST_NUM = 100 - + tp_sizes = { "70b": [4], } @@ -23,14 +28,16 @@ def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--log_dir", type=Path, default=".") - parser.add_argument("--out_dir", type=Path, default="charts/percentile_token_latency") + parser.add_argument( + "--out_dir", type=Path, default="charts/percentile_token_latency" + ) args = parser.parse_args() return args def extract_values(file_pattern): files = glob.glob(file_pattern) - + latencies = {} for f in files: prof_args, response_details = read_json(f) @@ -38,18 +45,20 @@ def extract_values(file_pattern): response_details.sort(key=lambda r: r.start_time) response_details = response_details[SKIP_REQUEST_NUM:-SKIP_REQUEST_NUM] - token_latencies = [r.token_gen_time[SKIP_HEAD_TOKEN_NUM:-1] for r in response_details] + token_latencies = [ + r.token_gen_time[SKIP_HEAD_TOKEN_NUM:-1] for r in response_details + ] flat_latency_list = list(itertools.chain(*token_latencies)) latencies[client_num] = flat_latency_list return latencies -def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): +def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): if not log_dir.exists(): print(f"Log directory {log_dir} does not exist") return - + if not out_dir.exists(): out_dir.mkdir(parents=True, exist_ok=True) @@ -79,7 +88,10 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): # print(f"P95_vllm_val={P95_vllm_val}") # print(f"P95_mii_val={P95_mii_val}") - out_file = out_dir / f"p{percentile}_token_latency_llama{model_size}_c{client_num}_tp{tp}_p{prompt}g{gen}.png" + out_file = ( + out_dir + / f"p{percentile}_token_latency_llama{model_size}_c{client_num}_tp{tp}_p{prompt}g{gen}.png" + ) x1 = [1, 2, 3] y1 = [P50_vllm_val, P90_vllm_val, P95_vllm_val] @@ -87,11 +99,13 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): x2 = [1.3, 2.3, 3.3] y2 = [P50_mii_val, P90_mii_val, P95_mii_val] - label_x = ['P50', 'P90', 'P95'] + label_x = ["P50", "P90", "P95"] - plt.bar(x1, y1, width=0.3, label='vLLM', align="center", color="orange") - plt.bar(x2, y2, width=0.3, label="DeepSpeed-FastGen", align="center", color="blue") - plt.ylabel('Latency', fontsize=14) + plt.bar(x1, y1, width=0.3, label="vLLM", align="center", color="orange") + plt.bar( + x2, y2, width=0.3, label="DeepSpeed-FastGen", align="center", color="blue" + ) + plt.ylabel("Latency", fontsize=14) plt.legend(loc=2) plt.xticks([1.15, 2.15, 3.15], label_x) @@ -101,10 +115,12 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): if __name__ == "__main__": + raise NotImplementedError("This script is not up to date") args = get_args() - + for model_size, tps in tp_sizes.items(): for tp in tps: for prompt, gen in prompt_gen_pairs: - output_charts(model_size, tp, bs, prompt, gen, args.log_dir, args.out_dir) - + output_charts( + model_size, tp, bs, prompt, gen, args.log_dir, args.out_dir + ) diff --git a/benchmarks/inference/mii/plot_repl_scale.py b/benchmarks/inference/mii/src/plot_repl_scale.py similarity index 81% rename from benchmarks/inference/mii/plot_repl_scale.py rename to benchmarks/inference/mii/src/plot_repl_scale.py index 394c54588..7791be0ca 100644 --- 
a/benchmarks/inference/mii/plot_repl_scale.py +++ b/benchmarks/inference/mii/src/plot_repl_scale.py @@ -1,10 +1,15 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import glob import matplotlib.pyplot as plt import argparse from pathlib import Path import numpy as np -from postprocess_results import read_json, get_summary +from .postprocess_results import read_json, get_summary bs = 768 @@ -18,6 +23,7 @@ (2600, 60), ] + def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--log_dir", type=Path, default=".") @@ -46,7 +52,7 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): if not log_dir.exists(): print(f"Log directory {log_dir} does not exist") return - + if not out_dir.exists(): out_dir.mkdir(parents=True, exist_ok=True) @@ -67,17 +73,19 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): # Plotting the scatter plot plt.figure(figsize=(6, 4)) - + plt.bar(REPLICA_NUMS, throughputs[c], color="blue", alpha=0.9) fit_x_list = np.arange(min(REPLICA_NUMS), max(REPLICA_NUMS), 0.1) mii_fit_model = np.polyfit(REPLICA_NUMS, throughputs[c], 1) mii_model_fn = np.poly1d(mii_fit_model) plt.plot(fit_x_list, mii_model_fn(fit_x_list), color="blue", linestyle="--") - - plt.title(f'Model Llama 2 {model_size.upper()}, Prompt: {prompt}, Generation: {gen}, TP: {tp}') - plt.xlabel('Number of replicas', fontsize=14) - plt.ylabel('Throughput (queries/s)', fontsize=14) + + plt.title( + f"Model Llama 2 {model_size.upper()}, Prompt: {prompt}, Generation: {gen}, TP: {tp}" + ) + plt.xlabel("Number of replicas", fontsize=14) + plt.ylabel("Throughput (queries/s)", fontsize=14) plt.grid(True) plt.tight_layout() # plt.show() @@ -86,10 +94,12 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): if __name__ == "__main__": + raise NotImplementedError("This script is not up to date") args = get_args() - + for model_size, tps in tp_sizes.items(): for tp in tps: for prompt, gen in prompt_gen_pairs: - output_charts(model_size, tp, bs, prompt, gen, args.log_dir, args.out_dir) - + output_charts( + model_size, tp, bs, prompt, gen, args.log_dir, args.out_dir + ) diff --git a/benchmarks/inference/mii/src/plot_th_lat.py b/benchmarks/inference/mii/src/plot_th_lat.py new file mode 100644 index 000000000..9aa292ca6 --- /dev/null +++ b/benchmarks/inference/mii/src/plot_th_lat.py @@ -0,0 +1,130 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import argparse +import glob +import os +import re +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np + +from postprocess_results import read_json, get_summary + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--log_dir", type=Path, default="./results") + parser.add_argument("--out_dir", type=Path, default="./plots/throughput_latency") + args = parser.parse_args() + return args + + +def extract_values(file_pattern): + files = glob.glob(file_pattern) + + print(f"Found {len(files)}") + print("\n".join(files)) + + clients = [] + throughputs = [] + latencies = [] + for f in files: + prof_args, response_details = read_json(f) + summary = get_summary(prof_args, response_details) + clients.append(prof_args["num_clients"]) + throughputs.append(summary.throughput) + latencies.append(summary.latency) + + return clients, throughputs, latencies + + +def output_charts(model, tp_size, bs, replicas, prompt, gen, log_dir, out_dir): + out_dir.mkdir(parents=True, exist_ok=True) + + result_file_pattern = f"{model}-tp{tp_size}-bs{bs}-replicas{replicas}-prompt{prompt}-gen{gen}-clients*.json" + mii_file_pattern = f"{log_dir}/fastgen/{result_file_pattern}" + vllm_file_pattern = f"{log_dir}/vllm/{result_file_pattern}" + + _, mii_throughputs, mii_latencies = extract_values(mii_file_pattern) + _, vllm_throughputs, vllm_latencies = extract_values(vllm_file_pattern) + + # Plotting the scatter plot + plt.figure(figsize=(6, 4)) + + if len(vllm_throughputs) > 0: + plt.scatter( + vllm_throughputs, vllm_latencies, label=f"vLLM", marker="x", color="orange" + ) + fit_vllm_x_list = np.arange(min(vllm_throughputs), max(vllm_throughputs), 0.01) + vllm_vllm_model = np.polyfit(vllm_throughputs, vllm_latencies, 3) + vllm_model_fn = np.poly1d(vllm_vllm_model) + plt.plot( + fit_vllm_x_list, + vllm_model_fn(fit_vllm_x_list), + color="orange", + alpha=0.5, + linestyle="--", + ) + + plt.scatter( + mii_throughputs, + mii_latencies, + label=f"DeepSpeed FastGen", + marker="o", + color="blue", + ) + fit_mii_x_list = np.arange(min(mii_throughputs), max(mii_throughputs), 0.01) + mii_fit_model = np.polyfit(mii_throughputs, mii_latencies, 3) + mii_model_fn = np.poly1d(mii_fit_model) + plt.plot( + fit_mii_x_list, + mii_model_fn(fit_mii_x_list), + color="blue", + alpha=0.5, + linestyle="--", + ) + + plt.title(f"Model {model}, Prompt: {prompt}, Generation: {gen}, TP: {tp_size}") + plt.xlabel("Throughput (queries/s)", fontsize=14) + plt.ylabel("Latency", fontsize=14) + plt.legend() + plt.grid(True) + plt.tight_layout() + out_file = ( + out_dir + / f"{model}-tp{tp_size}-bs{bs}-replicas{replicas}-prompt{prompt}-gen{gen}.png" + ) + print(f"Saving {out_file}") + plt.savefig(out_file) + + +if __name__ == "__main__": + args = get_args() + + if not args.log_dir.exists(): + raise ValueError(f"Log dir {args.log_dir} does not exist") + + result_params = set() + result_re = re.compile( + r"(.+)-tp(\d+)-bs(\d+)-replicas(\d+)-prompt(\d+)-gen(\d+)-clients.*.json" + ) + for f in os.listdir(os.path.join(args.log_dir, "fastgen")): + match = result_re.match(f) + if match: + result_params.add(match.groups()) + + for model, tp_size, bs, replicas, prompt, gen in result_params: + output_charts( + model=model, + tp_size=tp_size, + bs=bs, + replicas=replicas, + prompt=prompt, + gen=gen, + log_dir=args.log_dir, + out_dir=args.out_dir, + ) diff --git a/benchmarks/inference/mii/plot_tp_sizes.py b/benchmarks/inference/mii/src/plot_tp_sizes.py similarity 
index 73% rename from benchmarks/inference/mii/plot_tp_sizes.py rename to benchmarks/inference/mii/src/plot_tp_sizes.py index 546310258..f02b643f2 100644 --- a/benchmarks/inference/mii/plot_tp_sizes.py +++ b/benchmarks/inference/mii/src/plot_tp_sizes.py @@ -1,13 +1,18 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import glob import matplotlib.pyplot as plt import argparse from pathlib import Path import numpy as np -from postprocess_results import read_json, get_summary +from .postprocess_results import read_json, get_summary bs = 768 - + tp_sizes = { # "7b": [1], "13b": [1, 2, 4], @@ -22,6 +27,7 @@ (2600, 256), ] + def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--log_dir", type=Path, default="logs.release") @@ -34,7 +40,7 @@ def extract_values(file_pattern): files = glob.glob(file_pattern) print(f"Found {len(files)}") - print('\n'.join(files)) + print("\n".join(files)) clients = [] throughputs = [] @@ -53,7 +59,7 @@ def output_charts(model_size, tps, bs, prompt, gen, log_dir, out_dir): if not log_dir.exists(): print(f"Log directory {log_dir} does not exist") return - + if not out_dir.exists(): out_dir.mkdir(parents=True, exist_ok=True) @@ -73,26 +79,39 @@ def output_charts(model_size, tps, bs, prompt, gen, log_dir, out_dir): tflops_per_query = n_params * (prompt + gen) * 2 * 1e-3 mii_tflops = [th * tflops_per_query / tp for th in mii_throughputs] - plt.scatter(mii_tflops, mii_latencies, label=f"TP={tp}", marker="o", color=color) + plt.scatter( + mii_tflops, mii_latencies, label=f"TP={tp}", marker="o", color=color + ) fit_mii_x_list = np.arange(min(mii_tflops), max(mii_tflops), 0.01) mii_fit_model = np.polyfit(mii_tflops, mii_latencies, 3) mii_model_fn = np.poly1d(mii_fit_model) - plt.plot(fit_mii_x_list, mii_model_fn(fit_mii_x_list), color=color, alpha=0.5, linestyle="--") - - plt.title(f'Model Llama 2 {model_size.upper()}, Prompt: {prompt}, Generation: {gen}, TP: {tps}') - plt.xlabel('TFLOPs (per GPU)', fontsize=14) - plt.ylabel('Latency', fontsize=14) + plt.plot( + fit_mii_x_list, + mii_model_fn(fit_mii_x_list), + color=color, + alpha=0.5, + linestyle="--", + ) + + plt.title( + f"Model Llama 2 {model_size.upper()}, Prompt: {prompt}, Generation: {gen}, TP: {tps}" + ) + plt.xlabel("TFLOPs (per GPU)", fontsize=14) + plt.ylabel("Latency", fontsize=14) plt.legend() plt.grid(True) # plt.show() - out_file = out_dir / f"tp_sizes_llama{model_size}_tp{'_'.join([str(tp) for tp in tps])}_p{prompt}g{gen}.png" + out_file = ( + out_dir + / f"tp_sizes_llama{model_size}_tp{'_'.join([str(tp) for tp in tps])}_p{prompt}g{gen}.png" + ) plt.savefig(out_file) if __name__ == "__main__": + raise NotImplementedError("This script is not up to date") args = get_args() - + for model_size, tps in tp_sizes.items(): for prompt, gen in prompt_gen_pairs: output_charts(model_size, tps, bs, prompt, gen, args.log_dir, args.out_dir) - diff --git a/benchmarks/inference/mii/postprocess_results.py b/benchmarks/inference/mii/src/postprocess_results.py similarity index 53% rename from benchmarks/inference/mii/postprocess_results.py rename to benchmarks/inference/mii/src/postprocess_results.py index cb2000d5f..7e25bfddc 100644 --- a/benchmarks/inference/mii/postprocess_results.py +++ b/benchmarks/inference/mii/src/postprocess_results.py @@ -1,12 +1,17 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import argparse -from pathlib import Path import json -import numpy as np -from statistics import mean -from functools import reduce from dataclasses import dataclass +from functools import reduce +from pathlib import Path +from statistics import mean from typing import List +import numpy as np from transformers import AutoTokenizer @@ -31,10 +36,10 @@ class ProfilingSummary: first_token_latency: float tokens_per_sec: float - + def parse_args(): parser = argparse.ArgumentParser(description="Postprocess results") - parser.add_argument('-i', '--input_path', type=Path, default="results.json") + parser.add_argument("-i", "--input_path", type=Path, default="results.json") args = parser.parse_args() return args @@ -44,13 +49,13 @@ def get_tokenizer(): global tokenizer if tokenizer is None: tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") - return tokenizer + return tokenizer def read_json(file_path): - with open(file_path, 'r') as f: + with open(file_path, "r") as f: data = json.load(f) - + args = data["args"] response_details = [] @@ -61,34 +66,56 @@ def read_json(file_path): def get_summary(args, response_details): - client_num = args["client_num"] + num_clients = args["num_clients"] # Calculate latency and throughput using P95 latency latency = mean([r.end_time - r.start_time for r in response_details]) - throughput = client_num / latency - - tokens_per_sec = mean([(len(get_tokenizer().tokenize(r.prompt)) + len(r.generated_tokens)) / (r.end_time - r.start_time) for r in response_details]) + throughput = num_clients / latency + + tokens_per_sec = mean( + [ + (len(get_tokenizer().tokenize(r.prompt)) + len(r.generated_tokens)) + / (r.end_time - r.start_time) + for r in response_details + ] + ) first_token_latency = mean([r.token_gen_time[0] for r in response_details]) - token_gen_latency_flat = reduce(list.__add__, [r.token_gen_time[1:-1] for r in response_details if len(r.token_gen_time) > 2]) + token_gen_latency_flat = reduce( + list.__add__, + [r.token_gen_time[1:-1] for r in response_details if len(r.token_gen_time) > 2], + ) token_gen_latency = mean([t for t in token_gen_latency_flat]) - return ProfilingSummary(throughput, latency, token_gen_latency, first_token_latency, tokens_per_sec) + return ProfilingSummary( + throughput, latency, token_gen_latency, first_token_latency, tokens_per_sec + ) -def get_token_latency(response_details, percentile=None, variance=False, cumulative=False): +def get_token_latency( + response_details, percentile=None, variance=False, cumulative=False +): req_latencies = [r.token_gen_time for r in response_details] if cumulative: - req_latencies = [np.cumsum(np.array(r.token_gen_time)).tolist() for r in response_details] + req_latencies = [ + np.cumsum(np.array(r.token_gen_time)).tolist() for r in response_details + ] max_gen_length = max([len(r.generated_tokens) for r in response_details]) latency = [] for i in range(max_gen_length): if variance: - token_latency_step = np.var([latency[i] for latency in req_latencies if len(latency) > i]) + token_latency_step = np.var( + [latency[i] for latency in req_latencies if len(latency) > i] + ) if percentile is None: - token_latency_step = [latency[i] for latency in req_latencies if len(latency) > i] + token_latency_step = [ + latency[i] for latency in req_latencies if len(latency) > i + ] else: - token_latency_step = np.percentile([latency[i] for latency in req_latencies if len(latency) > i], percentile) + token_latency_step = np.percentile( 
+ [latency[i] for latency in req_latencies if len(latency) > i], + percentile, + ) latency.append(token_latency_step) @@ -104,9 +131,11 @@ def get_token_acc_latency(response_details, percentile=99): prof_args, response_details = read_json(args.input_path) ps = get_summary(prof_args, response_details) - print(f"Deployment: {prof_args['deployment_name']} Clients: {prof_args['client_num']}, " - + f"Query throughput: {ps.throughput:.3f} queries/s, " - + f"Token throughput (total): {ps.tokens_per_sec:.3f} tokens/s, " - + f"Query latency: {ps.latency:.3f} s, " - + f"Token generation latency: {ps.token_gen_latency:.3f} s/token, " - + f"First token received: {ps.first_token_latency:.3f} s") + print( + f"Deployment: {prof_args['deployment_name']} Clients: {prof_args['num_clients']}, " + + f"Query throughput: {ps.throughput:.3f} queries/s, " + + f"Token throughput (total): {ps.tokens_per_sec:.3f} tokens/s, " + + f"Query latency: {ps.latency:.3f} s, " + + f"Token generation latency: {ps.token_gen_latency:.3f} s/token, " + + f"First token received: {ps.first_token_latency:.3f} s" + ) diff --git a/benchmarks/inference/mii/random_query_generator.py b/benchmarks/inference/mii/src/random_query_generator.py similarity index 72% rename from benchmarks/inference/mii/random_query_generator.py rename to benchmarks/inference/mii/src/random_query_generator.py index b8442af4f..eca16d8ff 100644 --- a/benchmarks/inference/mii/random_query_generator.py +++ b/benchmarks/inference/mii/src/random_query_generator.py @@ -1,7 +1,12 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import numpy as np import torch import random -import numpy as np -import time + class RandomQueryGenerator: def __init__(self, input_text, tokenizer, seed): @@ -14,9 +19,9 @@ def __init__(self, input_text, tokenizer, seed): def get_random_request_text(self, length, variance, max_length, batch): request_text = [] - tokenized_input = self.tokenizer.batch_encode_plus([self.input_text], - return_tensors="pt", - padding=False) + tokenized_input = self.tokenizer.batch_encode_plus( + [self.input_text], return_tensors="pt", padding=False + ) offset = list(range(512)) random.shuffle(offset) @@ -25,6 +30,6 @@ def get_random_request_text(self, length, variance, max_length, batch): # Set max_new_tokens following normal distribution with mean=max_new_tokens and std=0.3*max_new_tokens req_prompt_length = min(int(np.random.normal(length, variance)), max_length) - text = self.tokenizer.decode(text_ids[i:req_prompt_length+i]) + text = self.tokenizer.decode(text_ids[i : req_prompt_length + i]) request_text.append(text) return request_text diff --git a/benchmarks/inference/mii/sample_input.py b/benchmarks/inference/mii/src/sample_input.py similarity index 99% rename from benchmarks/inference/mii/sample_input.py rename to benchmarks/inference/mii/src/sample_input.py index 77d02af5f..bae18ce62 100644 --- a/benchmarks/inference/mii/sample_input.py +++ b/benchmarks/inference/mii/src/sample_input.py @@ -1,8 +1,12 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team # This is a sample input consisting of: # Code & Text -all_text = '''Deep learning involves the use of neural networks, which are computational models inspired by the structure and functioning of the human brain. These networks consist of interconnected nodes called neurons. Each neuron takes input, performs a computation, and produces an output. 
+all_text = """Deep learning involves the use of neural networks, which are computational models inspired by the structure and functioning of the human brain. These networks consist of interconnected nodes called neurons. Each neuron takes input, performs a computation, and produces an output. During training, the neural network learns to make accurate predictions by adjusting its internal parameters. This adjustment is done using an optimization algorithm called gradient descent. Gradient descent calculates the gradients of a loss function, which measures the discrepancy between the predicted output of the network and the desired output. These gradients indicate the direction and magnitude of parameter updates that will minimize the loss. The learning rate is an important hyperparameter in gradient descent. It determines the step size taken during parameter updates. A higher learning rate can lead to faster convergence, but it risks overshooting the optimal solution. On the other hand, a lower learning rate may converge more slowly, but it can result in more precise updates. Activation functions are applied to the output of each neuron in a neural network. They introduce non-linearities, enabling the network to learn complex patterns and relationships in the data. Popular activation functions include the rectified linear unit (ReLU), sigmoid, and hyperbolic tangent (tanh). @@ -218,4 +222,4 @@ def top_p_sampling(self, logits, p=0.9): print("Top-k Sampling:", top_k_text) print("Top-p Sampling:", top_p_text) Make sure to adjust the server_url with the appropriate URL of your HTTP server, and ensure that the server is running and accessible before making requests through the API. - ''' \ No newline at end of file + """ diff --git a/benchmarks/inference/mii/src/server.py b/benchmarks/inference/mii/src/server.py new file mode 100644 index 000000000..d0ecabaf3 --- /dev/null +++ b/benchmarks/inference/mii/src/server.py @@ -0,0 +1,122 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import subprocess +import time + +import mii +from deepspeed.inference import RaggedInferenceEngineConfig, DeepSpeedTPConfig +from deepspeed.inference.v2.ragged import DSStateManagerConfig + +from .utils import parse_args, SERVER_PARAMS + + +def start_server(args): + vllm = args.vllm + model = args.model + deployment_name = args.deployment_name + tp_size = args.tp_size + num_replicas = args.num_replicas + max_ragged_batch_size = args.max_ragged_batch_size + + if vllm: + start_vllm_server(model=model, tp_size=tp_size) + else: + start_mii_server( + model=model, + deployment_name=deployment_name, + tp_size=tp_size, + num_replicas=num_replicas, + max_ragged_batch_size=max_ragged_batch_size, + ) + + +def start_vllm_server(model: str, tp_size: int) -> None: + vllm_cmd = ( + "python", + "-m", + "vllm.entrypoints.api_server", + "--host", + "127.0.0.1", + "--port", + "26500", + "--tensor-parallel-size", + str(tp_size), + "--model", + model, + ) + p = subprocess.Popen( + vllm_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, close_fds=True + ) + start_time = time.time() + timeout_after = 60 * 5 # 5 minutes + while True: + line = p.stderr.readline().decode("utf-8") + if "Application startup complete" in line: + break + if "error" in line.lower(): + p.terminate() + stop_vllm_server() + raise RuntimeError(f"Error starting VLLM server: {line}") + if time.time() - start_time > timeout_after: + p.terminate() + stop_vllm_server() + raise TimeoutError("Timed out waiting for VLLM server to start") + time.sleep(0.01) + + +def start_mii_server( + model, deployment_name, tp_size, num_replicas, max_ragged_batch_size +): + tp_config = DeepSpeedTPConfig(tp_size=tp_size) + mgr_config = DSStateManagerConfig( + max_ragged_batch_size=max_ragged_batch_size, + max_ragged_sequence_count=max_ragged_batch_size, + ) + inference_config = RaggedInferenceEngineConfig( + tensor_parallel=tp_config, state_manager=mgr_config + ) + + mii.serve( + model, + deployment_name=deployment_name, + tensor_parallel=tp_size, + inference_engine_config=inference_config, + replica_num=num_replicas, + ) + + +def stop_server(args): + vllm = args.vllm + deployment_name = args.deployment_name + + if vllm: + stop_vllm_server() + else: + stop_mii_server(deployment_name) + + +def stop_vllm_server(): + vllm_cmd = ("pkill", "-f", "vllm.entrypoints.api_server") + p = subprocess.Popen(vllm_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p.wait() + + +def stop_mii_server(deployment_name): + mii.client(deployment_name).terminate_server() + + +if __name__ == "__main__": + args = parse_args(server_args=True) + + if args.cmd == "start": + start_server(args) + elif args.cmd == "stop": + stop_server(args) + elif args.cmd == "restart": + stop_server(args) + start_server(args) + else: + raise ValueError(f"Invalid command {args.cmd}") diff --git a/benchmarks/inference/mii/src/utils.py b/benchmarks/inference/mii/src/utils.py new file mode 100644 index 000000000..6499a54b4 --- /dev/null +++ b/benchmarks/inference/mii/src/utils.py @@ -0,0 +1,235 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import argparse +import copy +import itertools +import json +import os + +from dataclasses import asdict +from datetime import datetime +from pathlib import Path +from typing import Iterator, List + +from .defaults import ARG_DEFAULTS, MODEL_DEFAULTS +from .postprocess_results import get_summary, ResponseDetails + +# For these arguments, users can provide multiple values when running the +# benchmark. The benchmark will iterate over all possible combinations. +SERVER_PARAMS = ["tp_size", "max_ragged_batch_size", "num_replicas"] +CLIENT_PARAMS = ["mean_prompt_length", "mean_max_new_tokens", "num_clients"] + + +def parse_args( + server_args: bool = False, client_args: bool = False +) -> argparse.Namespace: + if not (server_args or client_args): + raise ValueError("Must specify server_args or client_args or both") + + # Server args + server_parser = argparse.ArgumentParser(add_help=False) + server_parser.add_argument( + "--tp_size", type=int, nargs="+", default=None, help="Tensor parallelism size" + ) + server_parser.add_argument( + "--max_ragged_batch_size", + type=int, + nargs="+", + default=None, + help="Max batch size for ragged batching", + ) + server_parser.add_argument( + "--num_replicas", + type=int, + nargs="+", + default=None, + help="Number of MII model replicas", + ) + server_parser.add_argument( + "cmd", + type=str, + nargs="?", + choices=["start", "stop", "restart"], + help="Command for running server.py to manually start/stop/restart a server", + ) + + # Client args + client_parser = argparse.ArgumentParser(add_help=False) + client_parser.add_argument( + "--max_prompt_length", type=int, default=None, help="Max length a prompt can be" + ) + client_parser.add_argument( + "--mean_prompt_length", + type=int, + nargs="+", + default=None, + help="Mean prompt length in tokens", + ) + client_parser.add_argument( + "--mean_max_new_tokens", + type=int, + nargs="+", + default=None, + help="Mean number of new tokens to generate per prompt", + ) + client_parser.add_argument( + "--num_clients", + type=int, + nargs="+", + default=[1, 2, 4, 6, 8, 12, 16, 20, 24, 28, 32], + help="Number of concurrent clients", + ) + client_parser.add_argument( + "--num_requests", + type=int, + default=512, + help="Number of requests to process by clients", + ) + client_parser.add_argument( + "--prompt_length_var", type=float, default=0.3, help="Variance of prompt length" + ) + client_parser.add_argument( + "--max_new_tokens_var", + type=float, + default=0.3, + help="Variance of max new tokens", + ) + client_parser.add_argument( + "--warmup", type=int, default=1, help="Number of warmup requests to process" + ) + client_parser.add_argument( + "--use_thread", action="store_true", help="Use threads instead of processes" + ) + client_parser.add_argument( + "--stream", action="store_true", help="Stream generated tokens" + ) + client_parser.add_argument( + "--out_json_dir", + type=Path, + default="./results/", + help="Directory to save result JSON files", + ) + + # Create the parser, inheriting from the server and/or client parsers + parents = [] + if server_args: + parents.append(server_parser) + if client_args: + parents.append(client_parser) + + # Common args + parser = argparse.ArgumentParser(parents=parents) + parser.add_argument( + "--model", type=str, default="meta-llama/Llama-2-7b-hf", help="Model name" + ) + parser.add_argument( + "--deployment_name", + type=str, + default="mii-benchmark-deployment", + help="Deployment name for MII server", + ) + 
parser.add_argument("--vllm", action="store_true", help="Use VLLM instead of MII") + parser.add_argument( + "--overwrite_results", action="store_true", help="Overwrite existing results" + ) + + # Parse arguments + args = parser.parse_args() + + # Set default values for model-specific parameters + if args.model in MODEL_DEFAULTS: + for k, v in MODEL_DEFAULTS[args.model].items(): + if hasattr(args, k) and getattr(args, k) is None: + setattr(args, k, v) + + # Grab any remaining default values not specified for a model + for k, v in ARG_DEFAULTS.items(): + if hasattr(args, k) and getattr(args, k) is None: + setattr(args, k, v) + + if server_args and not client_args: + # If we are not running the benchmark, we need to make sure to only have one value for the server args + for k in SERVER_PARAMS: + if not isinstance(getattr(args, k), int): + setattr(args, k, getattr(args, k)[0]) + + return args + + +def get_args_product( + args: argparse.Namespace, which: List[str] = None +) -> Iterator[argparse.Namespace]: + if which is None: + return copy.deepcopy(args) + for k in which: + if isinstance(getattr(args, k), int): + setattr(args, k, [getattr(args, k)]) + arg_values_product = itertools.product(*[getattr(args, k) for k in which]) + for arg_values in arg_values_product: + args_copy = copy.deepcopy(args) + for k, v in zip(which, arg_values): + setattr(args_copy, k, v) + yield args_copy + + +def get_results_path(args: argparse.Namespace) -> Path: + if args.vllm: + lib_path = "vllm" + else: + lib_path = "fastgen" + return Path( + args.out_json_dir, + f"{lib_path}/", + "-".join( + ( + args.model.replace("/", "_"), + f"tp{args.tp_size}", + f"bs{args.max_ragged_batch_size}", + f"replicas{args.num_replicas}", + f"prompt{args.mean_prompt_length}", + f"gen{args.mean_max_new_tokens}", + f"clients{args.num_clients}", + ) + ) + + ".json", + ) + + +def print_summary( + args: argparse.Namespace, response_details: List[ResponseDetails] +) -> None: + ps = get_summary(vars(args), response_details) + print( + f"Deployment: {args.deployment_name} Clients: {args.num_clients}, " + + f"Prompt (mean): {args.mean_prompt_length} tokens, " + + f"Generation (mean): {args.mean_max_new_tokens} tokens, " + + f"Query throughput: {ps.throughput:.3f} queries/s, " + + f"Token throughput (total): {ps.tokens_per_sec:.3f} tokens/s, " + + f"Query latency: {ps.latency:.3f} s, " + + f"Token generation latency: {ps.token_gen_latency:.3f} s/token, " + + f"First token received: {ps.first_token_latency:.3f} s" + ) + + +def save_json_results( + args: argparse.Namespace, response_details: List[ResponseDetails] +) -> None: + args_dict = vars(args) + out_json_path = get_results_path(args) + os.makedirs(out_json_path.parent, exist_ok=True) + + with open(out_json_path, "w") as f: + args_dict["out_json_dir"] = str(out_json_path) # Path is not JSON serializable + data = { + "args": args_dict, + "time": str(datetime.now()), + "response_details": [asdict(r) for r in response_details], + } + json.dump(data, f, indent=2) + + +def results_exist(args: argparse.Namespace) -> bool: + return get_results_path(args).exists() From 107681e647b7558c25116b277af7a90f562504d2 Mon Sep 17 00:00:00 2001 From: LastWhisper Date: Fri, 26 Jan 2024 08:01:59 +0800 Subject: [PATCH 19/58] [Example] Refactor and Polish Cifar10-DeepSpeed Code Example. 
(#843) * Polish and Refactor Cifar10 Code Example * fix typos --------- Co-authored-by: Olatunji Ruwase Co-authored-by: Conglong Li --- training/cifar/README.md | 23 +- training/cifar/cifar10_deepspeed.py | 690 +++++++++++++--------------- training/cifar/run_ds_moe.sh | 1 - training/cifar/run_ds_prmoe.sh | 1 - 4 files changed, 331 insertions(+), 384 deletions(-) diff --git a/training/cifar/README.md b/training/cifar/README.md index 7c58f3b98..878b28157 100644 --- a/training/cifar/README.md +++ b/training/cifar/README.md @@ -1,21 +1,22 @@ Thanks Gopi Kumar for contributing this example, demonstrating how to apply DeepSpeed to CIFAR-10 model. -cifar10_tutorial.py +`cifar10_tutorial.py` Baseline CIFAR-10 model. -cifar10_deepspeed.py +`cifar10_deepspeed.py` DeepSpeed applied CIFAR-10 model. -ds_config.json - DeepSpeed configuration file. - -run_ds.sh +`run_ds.sh` Script for running DeepSpeed applied model. -run_ds_moe.sh +`run_ds_moe.sh` Script for running DeepSpeed model with Mixture of Experts (MoE) integration. -* To run baseline CIFAR-10 model - "python cifar10_tutorial.py" -* To run DeepSpeed CIFAR-10 model - "bash run_ds.sh" -* To run DeepSpeed CIFAR-10 model with Mixture of Experts (MoE) - "bash run_ds_moe.sh" -* To run with different data type (default='fp16') and zero stages (default=0) - "bash run_ds.sh --dtype={fp16|bf16} --stage={0|1|2|3}" +`run_ds_prmoe.sh` + Script for running DeepSpeed model with Pyramid Residual MoE (PR-MoE) integration. + +* To run baseline CIFAR-10 model - `python cifar10_tutorial.py` +* To run DeepSpeed CIFAR-10 model - `bash run_ds.sh` +* To run DeepSpeed CIFAR-10 model with Mixture of Experts (MoE) - `bash run_ds_moe.sh` +* To run DeepSpeed CIFAR-10 model with Pyramid Residual MoE (PR-MoE) - `bash run_ds_prmoe.sh` +* To run with different data type (default=`fp16`) and zero stages (default=`0`) - `bash run_ds.sh --dtype={fp16|bf16} --stage={0|1|2|3}` diff --git a/training/cifar/cifar10_deepspeed.py b/training/cifar/cifar10_deepspeed.py index da82e60db..521a75cdf 100755 --- a/training/cifar/cifar10_deepspeed.py +++ b/training/cifar/cifar10_deepspeed.py @@ -1,112 +1,105 @@ +import argparse + +import deepspeed import torch +import torch.nn as nn +import torch.nn.functional as F import torchvision import torchvision.transforms as transforms -import argparse -import deepspeed from deepspeed.accelerator import get_accelerator +from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer def add_argument(): + parser = argparse.ArgumentParser(description="CIFAR") - parser = argparse.ArgumentParser(description='CIFAR') - - #data - # cuda - parser.add_argument('--with_cuda', - default=False, - action='store_true', - help='use CPU in case there\'s no GPU support') - parser.add_argument('--use_ema', - default=False, - action='store_true', - help='whether use exponential moving average') - - # train - parser.add_argument('-b', - '--batch_size', - default=32, - type=int, - help='mini-batch size (default: 32)') - parser.add_argument('-e', - '--epochs', - default=30, - type=int, - help='number of total epochs (default: 30)') - parser.add_argument('--local_rank', - type=int, - default=-1, - help='local rank passed from distributed launcher') - - parser.add_argument('--log-interval', - type=int, - default=2000, - help="output logging information at a given interval") - - parser.add_argument('--moe', - default=False, - action='store_true', - help='use deepspeed mixture of experts (moe)') - - parser.add_argument('--ep-world-size', - default=1, - 
type=int, - help='(moe) expert parallel world size') - parser.add_argument('--num-experts', - type=int, - nargs='+', - default=[ - 1, - ], - help='number of experts list, MoE related.') + # For train. parser.add_argument( - '--mlp-type', - type=str, - default='standard', - help= - 'Only applicable when num-experts > 1, accepts [standard, residual]') - parser.add_argument('--top-k', - default=1, - type=int, - help='(moe) gating top 1 and 2 supported') + "-e", + "--epochs", + default=30, + type=int, + help="number of total epochs (default: 30)", + ) parser.add_argument( - '--min-capacity', - default=0, + "--local_rank", type=int, - help= - '(moe) minimum capacity of an expert regardless of the capacity_factor' + default=-1, + help="local rank passed from distributed launcher", ) parser.add_argument( - '--noisy-gate-policy', - default=None, + "--log-interval", + type=int, + default=2000, + help="output logging information at a given interval", + ) + + # For mixed precision training. + parser.add_argument( + "--dtype", + default="fp16", type=str, - help= - '(moe) noisy gating (only supported with top-1). Valid values are None, RSample, and Jitter' + choices=["bf16", "fp16", "fp32"], + help="Datatype used for training", + ) + + # For ZeRO Optimization. + parser.add_argument( + "--stage", + default=0, + type=int, + choices=[0, 1, 2, 3], + help="Datatype used for training", ) + + # For MoE (Mixture of Experts). parser.add_argument( - '--moe-param-group', + "--moe", default=False, - action='store_true', - help= - '(moe) create separate moe param groups, required when using ZeRO w. MoE' + action="store_true", + help="use deepspeed mixture of experts (moe)", + ) + parser.add_argument( + "--ep-world-size", default=1, type=int, help="(moe) expert parallel world size" + ) + parser.add_argument( + "--num-experts", + type=int, + nargs="+", + default=[ + 1, + ], + help="number of experts list, MoE related.", ) parser.add_argument( - '--dtype', - default='fp16', + "--mlp-type", type=str, - choices=['bf16', 'fp16', 'fp32'], - help= - 'Datatype used for training' + default="standard", + help="Only applicable when num-experts > 1, accepts [standard, residual]", + ) + parser.add_argument( + "--top-k", default=1, type=int, help="(moe) gating top 1 and 2 supported" ) parser.add_argument( - '--stage', + "--min-capacity", default=0, type=int, - choices=[0, 1, 2, 3], - help= - 'Datatype used for training' + help="(moe) minimum capacity of an expert regardless of the capacity_factor", + ) + parser.add_argument( + "--noisy-gate-policy", + default=None, + type=str, + help="(moe) noisy gating (only supported with top-1). Valid values are None, RSample, and Jitter", + ) + parser.add_argument( + "--moe-param-group", + default=False, + action="store_true", + help="(moe) create separate moe param groups, required when using ZeRO w. MoE", ) - # Include DeepSpeed configuration arguments + # Include DeepSpeed configuration arguments. parser = deepspeed.add_config_arguments(parser) args = parser.parse_args() @@ -114,110 +107,87 @@ def add_argument(): return args -deepspeed.init_distributed() - -######################################################################## -# The output of torchvision datasets are PILImage images of range [0, 1]. -# We transform them to Tensors of normalized range [-1, 1]. -# .. note:: -# If running on Windows and you get a BrokenPipeError, try setting -# the num_worker of torch.utils.data.DataLoader() to 0. 
- -transform = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) -]) - -if torch.distributed.get_rank() != 0: - # might be downloading cifar data, let rank 0 download first - torch.distributed.barrier() - -trainset = torchvision.datasets.CIFAR10(root='./data', - train=True, - download=True, - transform=transform) - -if torch.distributed.get_rank() == 0: - # cifar data is downloaded, indicate other ranks can proceed - torch.distributed.barrier() - -trainloader = torch.utils.data.DataLoader(trainset, - batch_size=16, - shuffle=True, - num_workers=2) - -testset = torchvision.datasets.CIFAR10(root='./data', - train=False, - download=True, - transform=transform) -testloader = torch.utils.data.DataLoader(testset, - batch_size=4, - shuffle=False, - num_workers=2) - -classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', - 'ship', 'truck') - -######################################################################## -# Let us show some of the training images, for fun. - -import matplotlib.pyplot as plt -import numpy as np - -# functions to show an image - - -def imshow(img): - img = img / 2 + 0.5 # unnormalize - npimg = img.numpy() - plt.imshow(np.transpose(npimg, (1, 2, 0))) - plt.show() - - -# get some random training images -dataiter = iter(trainloader) -images, labels = next(dataiter) - -# show images -imshow(torchvision.utils.make_grid(images)) -# print labels -print(' '.join('%5s' % classes[labels[j]] for j in range(4))) - -######################################################################## -# 2. Define a Convolutional Neural Network -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# Copy the neural network from the Neural Networks section before and modify it to -# take 3-channel images (instead of 1-channel images as it was defined). 
+def create_moe_param_groups(model): + """Create separate parameter groups for each expert.""" + parameters = {"params": [p for p in model.parameters()], "name": "parameters"} + return split_params_into_different_moe_groups_for_optimizer(parameters) -import torch.nn as nn -import torch.nn.functional as F -args = add_argument() +def get_ds_config(args): + """Get the DeepSpeed configuration dictionary.""" + ds_config = { + "train_batch_size": 16, + "steps_per_print": 2000, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001, + "betas": [0.8, 0.999], + "eps": 1e-8, + "weight_decay": 3e-7, + }, + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 0.001, + "warmup_num_steps": 1000, + }, + }, + "gradient_clipping": 1.0, + "prescale_gradients": False, + "bf16": {"enabled": args.dtype == "bf16"}, + "fp16": { + "enabled": args.dtype == "fp16", + "fp16_master_weights_and_grads": False, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 15, + }, + "wall_clock_breakdown": False, + "zero_optimization": { + "stage": args.stage, + "allgather_partitions": True, + "reduce_scatter": True, + "allgather_bucket_size": 50000000, + "reduce_bucket_size": 50000000, + "overlap_comm": True, + "contiguous_gradients": True, + "cpu_offload": False, + }, + } + return ds_config class Net(nn.Module): - def __init__(self): + def __init__(self, args): super(Net, self).__init__() self.conv1 = nn.Conv2d(3, 6, 5) self.pool = nn.MaxPool2d(2, 2) self.conv2 = nn.Conv2d(6, 16, 5) self.fc1 = nn.Linear(16 * 5 * 5, 120) self.fc2 = nn.Linear(120, 84) - if args.moe: + self.moe = args.moe + if self.moe: fc3 = nn.Linear(84, 84) self.moe_layer_list = [] for n_e in args.num_experts: - # create moe layers based on the number of experts + # Create moe layers based on the number of experts. self.moe_layer_list.append( deepspeed.moe.layer.MoE( hidden_size=84, expert=fc3, num_experts=n_e, ep_size=args.ep_world_size, - use_residual=args.mlp_type == 'residual', + use_residual=args.mlp_type == "residual", k=args.top_k, min_capacity=args.min_capacity, - noisy_gate_policy=args.noisy_gate_policy)) + noisy_gate_policy=args.noisy_gate_policy, + ) + ) self.moe_layer_list = nn.ModuleList(self.moe_layer_list) self.fc4 = nn.Linear(84, 10) else: @@ -229,7 +199,7 @@ def forward(self, x): x = x.view(-1, 16 * 5 * 5) x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) - if args.moe: + if self.moe: for layer in self.moe_layer_list: x, _, _ = layer(x) x = self.fc4(x) @@ -238,214 +208,192 @@ def forward(self, x): return x -net = Net() +def test(model_engine, testset, local_device, target_dtype, test_batch_size=4): + """Test the network on the test data. + + Args: + model_engine (deepspeed.runtime.engine.DeepSpeedEngine): the DeepSpeed engine. + testset (torch.utils.data.Dataset): the test dataset. + local_device (str): the local device name. + target_dtype (torch.dtype): the target datatype for the test data. + test_batch_size (int): the test batch size. + + """ + # The 10 classes for CIFAR10. + classes = ( + "plane", + "car", + "bird", + "cat", + "deer", + "dog", + "frog", + "horse", + "ship", + "truck", + ) + # Define the test dataloader. + testloader = torch.utils.data.DataLoader( + testset, batch_size=test_batch_size, shuffle=False, num_workers=0 + ) -def create_moe_param_groups(model): - from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer + # For total accuracy. + correct, total = 0, 0 + # For accuracy per class. 
+ class_correct = list(0.0 for i in range(10)) + class_total = list(0.0 for i in range(10)) + + # Start testing. + model_engine.eval() + with torch.no_grad(): + for data in testloader: + images, labels = data + if target_dtype != None: + images = images.to(target_dtype) + outputs = model_engine(images.to(local_device)) + _, predicted = torch.max(outputs.data, 1) + # Count the total accuracy. + total += labels.size(0) + correct += (predicted == labels.to(local_device)).sum().item() + + # Count the accuracy per class. + batch_correct = (predicted == labels.to(local_device)).squeeze() + for i in range(test_batch_size): + label = labels[i] + class_correct[label] += batch_correct[i].item() + class_total[label] += 1 + + if model_engine.local_rank == 0: + print( + f"Accuracy of the network on the {total} test images: {100 * correct / total : .0f} %" + ) + + # For all classes, print the accuracy. + for i in range(10): + print( + f"Accuracy of {classes[i] : >5s} : {100 * class_correct[i] / class_total[i] : 2.0f} %" + ) + + +def main(args): + # Initialize DeepSpeed distributed backend. + deepspeed.init_distributed() + + ######################################################################## + # Step1. Data Preparation. + # + # The output of torchvision datasets are PILImage images of range [0, 1]. + # We transform them to Tensors of normalized range [-1, 1]. + # + # Note: + # If running on Windows and you get a BrokenPipeError, try setting + # the num_worker of torch.utils.data.DataLoader() to 0. + ######################################################################## + transform = transforms.Compose( + [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))] + ) - parameters = { - 'params': [p for p in model.parameters()], - 'name': 'parameters' - } + if torch.distributed.get_rank() != 0: + # Might be downloading cifar data, let rank 0 download first. + torch.distributed.barrier() - return split_params_into_different_moe_groups_for_optimizer(parameters) + # Load or download cifar data. + trainset = torchvision.datasets.CIFAR10( + root="./data", train=True, download=True, transform=transform + ) + testset = torchvision.datasets.CIFAR10( + root="./data", train=False, download=True, transform=transform + ) + if torch.distributed.get_rank() == 0: + # Cifar data is downloaded, indicate other ranks can proceed. + torch.distributed.barrier() + + ######################################################################## + # Step 2. Define the network with DeepSpeed. + # + # First, we define a Convolution Neural Network. + # Then, we define the DeepSpeed configuration dictionary and use it to + # initialize the DeepSpeed engine. + ######################################################################## + net = Net(args) + + # Get list of parameters that require gradients. + parameters = filter(lambda p: p.requires_grad, net.parameters()) + + # If using MoE, create separate param groups for each expert. + if args.moe_param_group: + parameters = create_moe_param_groups(net) + + # Initialize DeepSpeed to use the following features. + # 1) Distributed model. + # 2) Distributed data loader. + # 3) DeepSpeed optimizer. 
+ ds_config = get_ds_config(args) + model_engine, optimizer, trainloader, __ = deepspeed.initialize( + args=args, + model=net, + model_parameters=parameters, + training_data=trainset, + config=ds_config, + ) -parameters = filter(lambda p: p.requires_grad, net.parameters()) -if args.moe_param_group: - parameters = create_moe_param_groups(net) - -# Initialize DeepSpeed to use the following features -# 1) Distributed model -# 2) Distributed data loader -# 3) DeepSpeed optimizer -ds_config = { - "train_batch_size": 16, - "steps_per_print": 2000, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.001, - "betas": [ - 0.8, - 0.999 - ], - "eps": 1e-8, - "weight_decay": 3e-7 - } - }, - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": 0, - "warmup_max_lr": 0.001, - "warmup_num_steps": 1000 - } - }, - "gradient_clipping": 1.0, - "prescale_gradients": False, - "bf16": { - "enabled": args.dtype == "bf16" - }, - "fp16": { - "enabled": args.dtype == "fp16", - "fp16_master_weights_and_grads": False, - "loss_scale": 0, - "loss_scale_window": 500, - "hysteresis": 2, - "min_loss_scale": 1, - "initial_scale_power": 15 - }, - "wall_clock_breakdown": False, - "zero_optimization": { - "stage": args.stage, - "allgather_partitions": True, - "reduce_scatter": True, - "allgather_bucket_size": 50000000, - "reduce_bucket_size": 50000000, - "overlap_comm": True, - "contiguous_gradients": True, - "cpu_offload": False - } -} - -model_engine, optimizer, trainloader, __ = deepspeed.initialize( - args=args, model=net, model_parameters=parameters, training_data=trainset, config=ds_config) - -local_device = get_accelerator().device_name(model_engine.local_rank) -local_rank = model_engine.local_rank - -# For float32, target_dtype will be None so no datatype conversion needed -target_dtype = None -if model_engine.bfloat16_enabled(): - target_dtype=torch.bfloat16 -elif model_engine.fp16_enabled(): - target_dtype=torch.half - -#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -#net.to(device) -######################################################################## -# 3. Define a Loss function and optimizer -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# Let's use a Classification Cross-Entropy loss and SGD with momentum. - -import torch.optim as optim - -criterion = nn.CrossEntropyLoss() -#optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) - -######################################################################## -# 4. Train the network -# ^^^^^^^^^^^^^^^^^^^^ -# -# This is when things start to get interesting. -# We simply have to loop over our data iterator, and feed the inputs to the -# network and optimize. - -for epoch in range(args.epochs): # loop over the dataset multiple times - - running_loss = 0.0 - for i, data in enumerate(trainloader): - # get the inputs; data is a list of [inputs, labels] - inputs, labels = data[0].to(local_device), data[1].to(local_device) - if target_dtype != None: - inputs = inputs.to(target_dtype) - outputs = model_engine(inputs) - loss = criterion(outputs, labels) - - model_engine.backward(loss) - model_engine.step() - - # print statistics - running_loss += loss.item() - if local_rank == 0 and i % args.log_interval == ( - args.log_interval - - 1): # print every log_interval mini-batches - print('[%d, %5d] loss: %.3f' % - (epoch + 1, i + 1, running_loss / args.log_interval)) - running_loss = 0.0 - -print('Finished Training') - -######################################################################## -# 5. 
Test the network on the test data -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# We have trained the network for 2 passes over the training dataset. -# But we need to check if the network has learnt anything at all. -# -# We will check this by predicting the class label that the neural network -# outputs, and checking it against the ground-truth. If the prediction is -# correct, we add the sample to the list of correct predictions. -# -# Okay, first step. Let us display an image from the test set to get familiar. - -dataiter = iter(testloader) -images, labels = next(dataiter) - -# print images -imshow(torchvision.utils.make_grid(images)) -print('GroundTruth: ', ' '.join('%5s' % classes[labels[j]] for j in range(4))) - -######################################################################## -# Okay, now let us see what the neural network thinks these examples above are: -if target_dtype != None: - images = images.to(target_dtype) -outputs = net(images.to(local_device)) - -######################################################################## -# The outputs are energies for the 10 classes. -# The higher the energy for a class, the more the network -# thinks that the image is of the particular class. -# So, let's get the index of the highest energy: -_, predicted = torch.max(outputs, 1) - -print('Predicted: ', ' '.join('%5s' % classes[predicted[j]] for j in range(4))) - -######################################################################## -# The results seem pretty good. -# -# Let us look at how the network performs on the whole dataset. - -correct = 0 -total = 0 -with torch.no_grad(): - for data in testloader: - images, labels = data - if target_dtype != None: - images = images.to(target_dtype) - outputs = net(images.to(local_device)) - _, predicted = torch.max(outputs.data, 1) - total += labels.size(0) - correct += (predicted == labels.to(local_device)).sum().item() - -print('Accuracy of the network on the 10000 test images: %d %%' % - (100 * correct / total)) - -######################################################################## -# That looks way better than chance, which is 10% accuracy (randomly picking -# a class out of 10 classes). -# Seems like the network learnt something. -# -# Hmmm, what are the classes that performed well, and the classes that did -# not perform well: - -class_correct = list(0. for i in range(10)) -class_total = list(0. for i in range(10)) -with torch.no_grad(): - for data in testloader: - images, labels = data - if target_dtype != None: - images = images.to(target_dtype) - outputs = net(images.to(local_device)) - _, predicted = torch.max(outputs, 1) - c = (predicted == labels.to(local_device)).squeeze() - for i in range(4): - label = labels[i] - class_correct[label] += c[i].item() - class_total[label] += 1 - -for i in range(10): - print('Accuracy of %5s : %2d %%' % - (classes[i], 100 * class_correct[i] / class_total[i])) + # Get the local device name (str) and local rank (int). + local_device = get_accelerator().device_name(model_engine.local_rank) + local_rank = model_engine.local_rank + + # For float32, target_dtype will be None so no datatype conversion needed. + target_dtype = None + if model_engine.bfloat16_enabled(): + target_dtype = torch.bfloat16 + elif model_engine.fp16_enabled(): + target_dtype = torch.half + + # Define the Classification Cross-Entropy loss function. + criterion = nn.CrossEntropyLoss() + + ######################################################################## + # Step 3. Train the network. 
+ # + # This is when things start to get interesting. + # We simply have to loop over our data iterator, and feed the inputs to the + # network and optimize. (DeepSpeed handles the distributed details for us!) + ######################################################################## + + for epoch in range(args.epochs): # loop over the dataset multiple times + running_loss = 0.0 + for i, data in enumerate(trainloader): + # Get the inputs. ``data`` is a list of [inputs, labels]. + inputs, labels = data[0].to(local_device), data[1].to(local_device) + + # Try to convert to target_dtype if needed. + if target_dtype != None: + inputs = inputs.to(target_dtype) + + outputs = model_engine(inputs) + loss = criterion(outputs, labels) + + model_engine.backward(loss) + model_engine.step() + + # Print statistics + running_loss += loss.item() + if local_rank == 0 and i % args.log_interval == ( + args.log_interval - 1 + ): # Print every log_interval mini-batches. + print( + f"[{epoch + 1 : d}, {i + 1 : 5d}] loss: {running_loss / args.log_interval : .3f}" + ) + running_loss = 0.0 + print("Finished Training") + + ######################################################################## + # Step 4. Test the network on the test data. + ######################################################################## + test(model_engine, testset, local_device, target_dtype) + + +if __name__ == "__main__": + args = add_argument() + main(args) diff --git a/training/cifar/run_ds_moe.sh b/training/cifar/run_ds_moe.sh index b7dcb7fa7..f87a29628 100755 --- a/training/cifar/run_ds_moe.sh +++ b/training/cifar/run_ds_moe.sh @@ -15,7 +15,6 @@ deepspeed --num_nodes=${NUM_NODES}\ cifar10_deepspeed.py \ --log-interval 100 \ --deepspeed \ - --deepspeed_config ds_config.json \ --moe \ --ep-world-size ${EP_SIZE} \ --num-experts ${EXPERTS} \ diff --git a/training/cifar/run_ds_prmoe.sh b/training/cifar/run_ds_prmoe.sh index 72731b0d5..d9755a331 100644 --- a/training/cifar/run_ds_prmoe.sh +++ b/training/cifar/run_ds_prmoe.sh @@ -12,7 +12,6 @@ EXPERTS='2 4' deepspeed --num_nodes=${NUM_NODES} --num_gpus=${NUM_GPUS} cifar10_deepspeed.py \ --log-interval 100 \ --deepspeed \ - --deepspeed_config ds_config.json \ --moe \ --ep-world-size ${EP_SIZE} \ --num-experts ${EXPERTS} \ From 6863634fc2fb974956272e33ecc0ac311bf6f33b Mon Sep 17 00:00:00 2001 From: stceum <50257864+stceum@users.noreply.github.com> Date: Fri, 2 Feb 2024 01:04:18 +0800 Subject: [PATCH 20/58] Not a bug, just missing a space in README.md (#857) --- .../training/step2_reward_model_finetuning/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/README.md b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/README.md index ede072a79..3c62b9f82 100644 --- a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/README.md +++ b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/README.md @@ -6,7 +6,7 @@ Finetuning the Reward Model (RM) is more or less similar to Step-1 Supervised F For SFT finetuning, the data is the concatenation of a query and an answer. However, for RM finetuning, each batch of data consists of two query-answer pairs, i.e., the same query with a high-score answer and a low-score answer. This also leads to the second difference as describe below. 
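For intuition, here is a minimal, self-contained sketch of what such a paired batch and the associated ranking objective can look like. It is illustrative only and does not reproduce the actual DeepSpeed-Chat data collator or reward model: the tensor names, the toy mean-pooled scoring, and the logsigmoid ranking loss below are assumptions made just for this example.

```python
# Illustrative sketch of a pairwise reward-model batch (NOT the DeepSpeed-Chat implementation).
import torch
import torch.nn.functional as F

batch_size, seq_len, hidden, vocab = 4, 16, 8, 1000

# Each prompt contributes two sequences: a high-score ("chosen") answer and a
# low-score ("rejected") answer, stacked into one batch of 2 * batch_size rows.
chosen_input_ids = torch.randint(0, vocab, (batch_size, seq_len))
rejected_input_ids = torch.randint(0, vocab, (batch_size, seq_len))
input_ids = torch.cat([chosen_input_ids, rejected_input_ids], dim=0)

# A reward model maps every sequence to a single scalar score; here a random
# embedding plus mean pooling and a linear head stand in for the real model.
embedding = torch.nn.Embedding(vocab, hidden)
value_head = torch.nn.Linear(hidden, 1)
scores = value_head(embedding(input_ids).mean(dim=1)).squeeze(-1)  # shape: [2 * batch_size]

chosen_scores, rejected_scores = scores[:batch_size], scores[batch_size:]

# Pairwise ranking objective: push the chosen score above the rejected score.
loss = -F.logsigmoid(chosen_scores - rejected_scores).mean()
print(loss.item())
```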
-👉**The training objective difference** +👉 **The training objective difference** For RW, the training objective is the pairwise ranking score, i.e., for the two query-answer pairs, RM is supposed to give a higher score to the better answer. There are multiple ways to achieve this. In our implementation, we use either the end token of the sequence or the first padding token as the aggregated score and compare them. Others may also use the average score for the entire answer as an alternative. From 19e0efb78134c19c67fe4275214548f364bf6dab Mon Sep 17 00:00:00 2001 From: Ming Liu <95666491+mlzoo@users.noreply.github.com> Date: Fri, 2 Feb 2024 01:09:25 +0800 Subject: [PATCH 21/58] Fix errors of AttributeError: 'str' object has no attribute 'stdout' (#826) * Update train_bert.py Fix errors of stdout * Update train_bert.py fix errors of stdout --------- Co-authored-by: Michael Wyatt --- training/HelloDeepSpeed/train_bert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/training/HelloDeepSpeed/train_bert.py b/training/HelloDeepSpeed/train_bert.py index a55215dbe..05e360d9c 100644 --- a/training/HelloDeepSpeed/train_bert.py +++ b/training/HelloDeepSpeed/train_bert.py @@ -465,7 +465,7 @@ def create_experiment_dir(checkpoint_dir: pathlib.Path, try: gitlog = sh.git.log("-1", format="%H", _tty_out=False, _fg=False) with (exp_dir / "githash.log").open("w") as handle: - handle.write(gitlog.stdout.decode("utf-8")) + handle.write(gitlog) except sh.ErrorReturnCode_128: logger.info("Seems like the code is not running from" " within a git repo, so hash will" @@ -476,7 +476,7 @@ def create_experiment_dir(checkpoint_dir: pathlib.Path, try: gitdiff = sh.git.diff(_fg=False, _tty_out=False) with (exp_dir / "gitdiff.log").open("w") as handle: - handle.write(gitdiff.stdout.decode("utf-8")) + handle.write(gitdiff) except sh.ErrorReturnCode_129: logger.info("Seems like the code is not running from" " within a git repo, so diff will" From b338d1e52f142254d57a5d4ebb109ee05132fc9b Mon Sep 17 00:00:00 2001 From: Zixu Wang <61218792+foin6@users.noreply.github.com> Date: Tue, 6 Feb 2024 02:43:20 +0800 Subject: [PATCH 22/58] Control the kernel injection with new argument. And compare the outputs only on rank 0 (#853) * disable kernel injection when using CPUs * add an argument to control kernel injection * test on rank 0 only * keep using tp for ds model inference * print match rate only on rank 0 * calculate cross perplexity between ds & baseline model * delete ds-hf-compare-fidelity.py * --use_kernel acts as an enabling argument if provided --------- Co-authored-by: Olatunji Ruwase --- .../huggingface/text-generation/README.md | 3 +- .../text-generation/ds-hf-compare.py | 42 +++++++++++-------- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/inference/huggingface/text-generation/README.md b/inference/huggingface/text-generation/README.md index 318e37416..65e82bfe7 100644 --- a/inference/huggingface/text-generation/README.md +++ b/inference/huggingface/text-generation/README.md @@ -91,8 +91,9 @@ The DSPipeline class helps to load the model and run inference on it, given thes # DeepSpeed HuggingFace Compare The ds-hf-compare script can be used to compare the text generated outputs of DeepSpeed with kernel injection and HuggingFace inference of a model with the same parameters on a single GPU. +(p.s. kernel injection will not be used by default and is only enabled when the "--use_kernel" argument is provided.) ## Usage Examples can be run as follows: -
deepspeed --num_gpus 1 ds-hf-compare.py --model [model name/path] --dtype [data type] --num_inputs [number of test inputs] --print_outputs
+
deepspeed --num_gpus 1 ds-hf-compare.py --model [model name/path] --dtype [data type] --num_inputs [number of test inputs] --print_outputs --use_kernel [enable kernel injection]
 
\ No newline at end of file diff --git a/inference/huggingface/text-generation/ds-hf-compare.py b/inference/huggingface/text-generation/ds-hf-compare.py index 27f307a32..bad82e9d8 100644 --- a/inference/huggingface/text-generation/ds-hf-compare.py +++ b/inference/huggingface/text-generation/ds-hf-compare.py @@ -14,8 +14,13 @@ parser.add_argument("--max_length", default=300, type=int, help="maximum tokens generated") parser.add_argument("--print_outputs", action='store_true', help="print generated text outputs") parser.add_argument("--local_rank", type=int, default=0, help="local rank") +parser.add_argument("--use_kernel", action='store_true', help="enable kernel-injection") args = parser.parse_args() +def print_0(output): + if args.local_rank == 0: + print(output) + def string_similarity(str1, str2): matcher = SequenceMatcher(None, str1, str2) similarity_ratio = matcher.ratio() @@ -70,7 +75,7 @@ def string_similarity(str1, str2): if args.num_inputs < len(test_inputs): inputs = test_inputs[:args.num_inputs] else: - print(f"Warning: num_inputs ({args.num_inputs}) is greater than the number of test inputs ({len(test_inputs)}). Using all test inputs.") + print_0(f"Warning: num_inputs ({args.num_inputs}) is greater than the number of test inputs ({len(test_inputs)}). Using all test inputs.") inputs = test_inputs data_type = getattr(torch, args.dtype) @@ -81,25 +86,28 @@ def string_similarity(str1, str2): mismatch_count=0 # Run the baseline model -for prompt in inputs: - base_out_list += pipe(prompt, do_sample=False, min_length=args.min_length, max_length=args.max_length) +if args.local_rank == 0: + for prompt in inputs: + base_out_list += pipe(prompt, do_sample=False, min_length=args.min_length, max_length=args.max_length) # Initialize the model with DeepSpeed -pipe.model = deepspeed.init_inference(pipe.model, dtype=data_type, replace_with_kernel_inject=True) +pipe.model = deepspeed.init_inference(pipe.model, dtype=data_type, replace_with_kernel_inject=args.use_kernel) # Run the DeepSpeed model and compare outputs for prompt, base_out in zip(inputs, base_out_list): ds_out = pipe(prompt, do_sample=False, min_length=args.min_length, max_length=args.max_length) - if args.print_outputs: - print(f"baseline output: {base_out}") - print(f"deepspeed output: {ds_out}") - print(f"{'-'*60}") - if base_out == ds_out[0]: - if args.print_outputs: print("outputs match") - match_count += 1 - else: - if args.print_outputs: print("outputs do not match") - mismatch_count += 1 - similarity = string_similarity(base_out['generated_text'], ds_out[0]['generated_text']) - if args.print_outputs: print(f"The similarity ratio is: {similarity*100}%") -print(f"Matches: {match_count}\nMismatches: {mismatch_count}") + if args.local_rank == 0: + if args.print_outputs: + print(f"baseline output: {base_out}") + print(f"deepspeed output: {ds_out}") + print(f"{'-'*60}") + if base_out == ds_out[0]: + if args.print_outputs: print("outputs match") + match_count += 1 + else: + if args.print_outputs: print("outputs do not match") + mismatch_count += 1 + similarity = string_similarity(base_out['generated_text'], ds_out[0]['generated_text']) + if args.print_outputs: print(f"The similarity ratio is: {similarity*100}%") + +print_0(f"Matches: {match_count}\nMismatches: {mismatch_count}") From 48177db77fdd33bada90d7a02fd3fcc8bd080507 Mon Sep 17 00:00:00 2001 From: Zixu Wang <61218792+foin6@users.noreply.github.com> Date: Sat, 10 Feb 2024 06:53:31 +0800 Subject: [PATCH 23/58] Different accelerators can be called according to specific device 
conditions (#859) * test xpu in local * use get_accelerator() to get device * delete load_from_disk --------- Co-authored-by: Olatunji Ruwase Co-authored-by: Michael Wyatt --- .../automatic-speech-recognition/test-wav2vec2.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/inference/huggingface/automatic-speech-recognition/test-wav2vec2.py b/inference/huggingface/automatic-speech-recognition/test-wav2vec2.py index f319928f2..18b5406bc 100644 --- a/inference/huggingface/automatic-speech-recognition/test-wav2vec2.py +++ b/inference/huggingface/automatic-speech-recognition/test-wav2vec2.py @@ -7,12 +7,14 @@ import deepspeed from deepspeed import module_inject from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2EncoderLayer +from deepspeed.accelerator import get_accelerator librispeech_eval = load_dataset("librispeech_asr", "clean", split="test") # Get local gpu rank from torch.distributed/deepspeed launcher local_rank = int(os.getenv('LOCAL_RANK', '0')) world_size = int(os.getenv('WORLD_SIZE', '1')) +device = torch.device(get_accelerator().device_name(local_rank)) print( "***************** Creating model in RANK ({0}) with WORLD_SIZE = {1} *****************" @@ -27,7 +29,7 @@ dtype=torch.float, injection_policy={Wav2Vec2EncoderLayer: ('attention.out_proj','feed_forward.output_dense')}, replace_with_kernel_inject=False) -model.to(f'cuda:{local_rank}') +model.to(device) def map_to_array(batch): speech, _ = sf.read(batch["file"]) batch["speech"] = speech @@ -38,7 +40,7 @@ def map_to_array(batch): def map_to_pred(batch): input_values = processor(batch["speech"], return_tensors="pt", padding="longest").input_values with torch.no_grad(): - logits = model(input_values.to(f'cuda:{local_rank}')).logits + logits = model(input_values.to(device)).logits predicted_ids = torch.argmax(logits, dim=-1) transcription = processor.batch_decode(predicted_ids) From 0b1ea40332e8793dd645213b8ce8c38fd66181f3 Mon Sep 17 00:00:00 2001 From: Lev Kurilenko <113481193+lekurile@users.noreply.github.com> Date: Thu, 22 Feb 2024 12:36:17 -0800 Subject: [PATCH 24/58] Add Human Eval Example (#856) This PR adds a HumanEval example and an associated README. The example will run through the human-eval problem set using a standard HuggingFace Pipeline and DeepSpeed-MII's FastGen, performing a simple result comparison at the end. A new evaluation folder is added to DeepSpeedExamples more generally to house evaluation scripts. --- evaluation/inference/human_eval/README.md | 45 ++++++++++++ .../inference/human_eval/run_human_eval.py | 69 +++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 evaluation/inference/human_eval/README.md create mode 100644 evaluation/inference/human_eval/run_human_eval.py diff --git a/evaluation/inference/human_eval/README.md b/evaluation/inference/human_eval/README.md new file mode 100644 index 000000000..018667267 --- /dev/null +++ b/evaluation/inference/human_eval/README.md @@ -0,0 +1,45 @@ +# HumanEval Evaluation Script for DeepSpeed-FastGen + +## DISCLAIMER + +This human-eval evaluation will execute untrusted model-generated code. As per the OpenAI warning, we +strongly recommend you sandbox your environment as described in the [human-eval paper](https://arxiv.org/pdf/2107.03374.pdf). + +## Setup + +Running the human-eval evaluation requires installation of `human_eval` with the execution code enabled, +which requires local changes to `execution.py`. 
The following steps will setup `human-eval` for execution: + +```bash +git clone https://github.com/openai/human-eval.git +sed -i '/exec(check_program, exec_globals)/ s/^# //' he_test/human_eval/execution.py +cd human-eval +python -m pip install -e . +``` + +This evaluation also requires the installation of DeepSpeed-MII: + +```bash +python -m pip install deepspeed-mii +``` + +Additional DeepSpeed-MII installation details can be found [here](https://github.com/microsoft/DeepSpeed-MII#installation). + +## Run the Evaluation + +The following command shows how to run a benchmark using the `codellama/CodeLlama-7b-Python-hf` model: + +```bash +python run_human_eval.py --model codellama/CodeLlama-7b-Python-hf --max-tokens 512 --num-samples-per-task 20 +``` + +## Run Evaluation on Samples + +Once samples have been generated, they can be evaluated independently using the `evaluate_functional_correctness` command. +For example, the following command will evaluate `mii_samples.jsonl`: + +```bash +evaluate_functional_correctness mii_samples.jsonl +``` + +The evaluation results will be saved to `mii_samples.jsonl_results.jsonl`. diff --git a/evaluation/inference/human_eval/run_human_eval.py b/evaluation/inference/human_eval/run_human_eval.py new file mode 100644 index 000000000..3acad8ece --- /dev/null +++ b/evaluation/inference/human_eval/run_human_eval.py @@ -0,0 +1,69 @@ +import os +import torch +import mii +import numpy +import argparse +from deepspeed.accelerator import get_accelerator +from transformers import pipeline +from human_eval.data import write_jsonl, read_problems +from human_eval.evaluation import evaluate_functional_correctness + +parser = argparse.ArgumentParser() +parser.add_argument("--model", "-m", type=str, default="codellama/CodeLlama-7b-Python-hf", help="evaluation model name") +parser.add_argument("--max-tokens", type=int, default=512, help="max new tokens") +parser.add_argument("--num-samples-per-task", type=int, default=20, help="number of samples to gen/eval per task") +parser.add_argument("--local_rank", type=int, default=int(os.getenv("LOCAL_RANK", "0")), help="local rank") +args = parser.parse_args() + +def generate_base_completion(pipe, problem_prompt: str) -> str: + return pipe(problem_prompt, do_sample=True)[0]["generated_text"] + +def generate_mii_completion(pipe, problem_prompt: str) -> str: + return pipe(problem_prompt, max_new_tokens=args.max_tokens)[0].generated_text + +def generate_samples(pipe, generation_function): + samples = [ + dict(task_id=task_id, completion=generation_function(pipe, problems[task_id]["prompt"])) for task_id in problems + for _ in range(args.num_samples_per_task) + ] + return samples + +print("Loading Problems") +problems = read_problems("human-eval/data/HumanEval.jsonl.gz") + +print("Initializing HuggingFace Pipeline") +device = torch.device(get_accelerator().device_name(args.local_rank)) +base_pipe = pipeline(model=args.model, + device=torch.device(get_accelerator().device_name(args.local_rank)), + max_length=args.max_tokens, + return_full_text=False) + +print("Generating Base Samples") +base_samples = generate_samples(base_pipe, generate_base_completion) + +print("Base Pipeline Teardown") +del base_pipe +torch.cuda.empty_cache() + +print("Initializing DeepSpeed-MII Pipeline") +mii_pipe = mii.pipeline(args.model) + +print("Generating MII Samples") +mii_samples = generate_samples(mii_pipe, generate_mii_completion) + +print("MII Pipeline Teardown") +mii_pipe.destroy() + +print("Writing Samples") +write_jsonl("base_samples.jsonl", 
base_samples) +write_jsonl("mii_samples.jsonl", mii_samples) + +print("Evaluating Samples") +base_results = evaluate_functional_correctness("base_samples.jsonl") +mii_results = evaluate_functional_correctness("mii_samples.jsonl") + +print(f"Base Results = {base_results}") +print(f"MII Results = {mii_results}") + +for key in base_results.keys(): + print(f"{key} - Base Result: {base_results[key]}, MII result: {mii_results[key]}") From 0ac02da7531df4823d55519d62f30fdac3820c26 Mon Sep 17 00:00:00 2001 From: Lev Kurilenko <113481193+lekurile@users.noreply.github.com> Date: Thu, 22 Feb 2024 13:26:52 -0800 Subject: [PATCH 25/58] Fix path in human-eval example README (#862) Fix sed path in human-eval example README. --- evaluation/inference/human_eval/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluation/inference/human_eval/README.md b/evaluation/inference/human_eval/README.md index 018667267..d3b254ea2 100644 --- a/evaluation/inference/human_eval/README.md +++ b/evaluation/inference/human_eval/README.md @@ -12,7 +12,7 @@ which requires local changes to `execution.py`. The following steps will setup ` ```bash git clone https://github.com/openai/human-eval.git -sed -i '/exec(check_program, exec_globals)/ s/^# //' he_test/human_eval/execution.py +sed -i '/exec(check_program, exec_globals)/ s/^# //' human-eval/human_eval/execution.py cd human-eval python -m pip install -e . ``` From 6540db6c23c87cc3dd7a9238f1a056771d96a06e Mon Sep 17 00:00:00 2001 From: Zixu Wang <61218792+foin6@users.noreply.github.com> Date: Mon, 26 Feb 2024 22:54:43 +0800 Subject: [PATCH 26/58] Modify codes so that different accelerators can be called according to specific device conditions (#863) * use get_accelerator to get device * bfloat16 --> bf16 --------- Co-authored-by: Olatunji Ruwase --- inference/huggingface/fill-mask/test-bert.py | 9 +++++---- inference/huggingface/fill-mask/test-electra.py | 3 ++- inference/huggingface/fill-mask/test-roberta.py | 3 ++- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/inference/huggingface/fill-mask/test-bert.py b/inference/huggingface/fill-mask/test-bert.py index d317710a2..fb2af691a 100644 --- a/inference/huggingface/fill-mask/test-bert.py +++ b/inference/huggingface/fill-mask/test-bert.py @@ -4,13 +4,14 @@ import torch import os import argparse +from deepspeed.accelerator import get_accelerator parser = argparse.ArgumentParser() parser.add_argument("--model", "-m", type=str, help="hf model name") -parser.add_argument("--dtype", type=str, default="fp16", help="fp16 or fp32") +parser.add_argument("--dtype", type=str, default="fp16", help="fp16 or fp32 or bf16") parser.add_argument("--local_rank", type=int, default=0, help="local rank") parser.add_argument("--trials", type=int, default=8, help="number of trials") -parser.add_argument("--kernel-inject", action="store_true", help="inject kernels on") +parser.add_argument("--kernel_inject", action="store_true", help="inject kernels on") parser.add_argument("--graphs", action="store_true", help="CUDA Graphs on") parser.add_argument("--triton", action="store_true", help="triton kernels on") parser.add_argument("--deepspeed", action="store_true", help="use deepspeed inference") @@ -26,11 +27,11 @@ pipe.model, mp_size=world_size, dtype=torch.float16 if args.triton else torch.float, - replace_with_kernel_inject=True, + replace_with_kernel_inject=args.kernel_inject, use_triton=args.triton, ) -pipe.device = torch.device(f'cuda:{local_rank}') +pipe.device = 
torch.device(get_accelerator().device_name(local_rank)) output = pipe("In Autumn the [MASK] fall from the trees.") if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: diff --git a/inference/huggingface/fill-mask/test-electra.py b/inference/huggingface/fill-mask/test-electra.py index 5c5448ace..28760f9f6 100644 --- a/inference/huggingface/fill-mask/test-electra.py +++ b/inference/huggingface/fill-mask/test-electra.py @@ -4,6 +4,7 @@ import torch import os from transformers.models.electra.modeling_electra import ElectraLayer +from deepspeed.accelerator import get_accelerator local_rank = int(os.getenv('LOCAL_RANK', '0')) world_size = int(os.getenv('WORLD_SIZE', '4')) @@ -21,7 +22,7 @@ dtype=torch.float, injection_policy={ElectraLayer: ('output.dense')} ) -pipe.device = torch.device(f'cuda:{local_rank}') +pipe.device = torch.device(get_accelerator().device_name(local_rank)) output = pipe(f"HuggingFace is creating a {pipe.tokenizer.mask_token} that the community uses to solve NLP tasks.") if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: diff --git a/inference/huggingface/fill-mask/test-roberta.py b/inference/huggingface/fill-mask/test-roberta.py index 16bcec041..c625e6cf4 100644 --- a/inference/huggingface/fill-mask/test-roberta.py +++ b/inference/huggingface/fill-mask/test-roberta.py @@ -4,6 +4,7 @@ import torch import os from transformers.models.roberta.modeling_roberta import RobertaLayer +from deepspeed.accelerator import get_accelerator local_rank = int(os.getenv('LOCAL_RANK', '0')) world_size = int(os.getenv('WORLD_SIZE', '4')) @@ -22,7 +23,7 @@ injection_policy={RobertaLayer: ('output.dense')} ) -pipe.device = torch.device(f'cuda:{local_rank}') +pipe.device = torch.device(get_accelerator().device_name(local_rank)) output = pipe("The invention of the revolutionized the way we communicate with each other.") if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: From 8182a8b4e9df5b8c41e1778f818df80b6ff11b4a Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Thu, 29 Feb 2024 12:29:03 -0800 Subject: [PATCH 27/58] Extend FastGen benchmark to use AML endpoints (#865) Add AML backend to MII benchmarking suite. Co-authored-by: Lev Kurilenko --- benchmarks/inference/mii/README.md | 16 +- benchmarks/inference/mii/run_all.sh | 8 +- benchmarks/inference/mii/run_benchmark.py | 6 +- benchmarks/inference/mii/run_example.sh | 3 +- benchmarks/inference/mii/src/client.py | 191 ++++++++++-------- benchmarks/inference/mii/src/defaults.py | 2 + .../inference/mii/src/postprocess_results.py | 20 +- benchmarks/inference/mii/src/server.py | 97 ++++----- benchmarks/inference/mii/src/utils.py | 53 +++-- 9 files changed, 242 insertions(+), 154 deletions(-) diff --git a/benchmarks/inference/mii/README.md b/benchmarks/inference/mii/README.md index 092ac4867..6d20be752 100644 --- a/benchmarks/inference/mii/README.md +++ b/benchmarks/inference/mii/README.md @@ -24,10 +24,22 @@ python run_benchmark.py --tp_size 1 2 ``` By default the benchmark runs with DeepSpeed-MII as the backend inference -server. To change the backend to vLLM, provide the `--vllm` flag: +server. The benchmark also supports vLLM and Azure endpoints. 
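The backend is selected with the `--backend` flag, which accepts `fastgen`, `vllm`, or `aml` and defaults to `fastgen`; for example, the default DeepSpeed-MII/FastGen sweep shown above could equivalently be written with the backend made explicit (a minimal sketch):

```bash
python run_benchmark.py --tp_size 1 2 --backend fastgen
```
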
To change the +backend to vLLM, provide the `--backend vllm` arg: ```bash -python run_benchmark.py --vllm +python run_benchmark.py --backend vllm +``` + +To benchmark against an Azure endpoint, provide the `--backend aml` as well as +the following values: +- `--aml_api_url`: API URL that points to an AML endpoint +- `--aml_api_key`: API key for the given AML endpoint +- `--deployment_name`: The name of the AML endpoint deployment you want to test against +- `--model`: The name of the HuggingFace-hosted model deployed on the AML endpoint. This is used to load a tokenizer and correctly calculate the number of tokens in the prompts and responses. + +```bash +python run_benchmark.py --backend aml --model mistralai/Mixtral-8x7B-v0.1 --deployment_name mistralai-mixtral-8x7b-v01-4 --aml_api_url --aml_api_key ``` The run_all.sh script performs benchmarks across various models, client numbers, diff --git a/benchmarks/inference/mii/run_all.sh b/benchmarks/inference/mii/run_all.sh index 095b3ae12..7c9311aea 100644 --- a/benchmarks/inference/mii/run_all.sh +++ b/benchmarks/inference/mii/run_all.sh @@ -6,10 +6,10 @@ MODELS=(meta-llama/Llama-2-7b-hf meta-llama/Llama-2-13b-hf meta-llama/Llama-2-70b-hf tiiuae/falcon-40B tiiuae/falcon-180B microsoft/phi-2 mistralai/Mixtral-8x7B-v0.1) for MODEL in ${MODELS[@]}; do - python ./run_benchmark.py --model ${MODEL} --stream - python ./run_benchmark.py --model ${MODEL} --stream --vllm + python ./run_benchmark.py --model ${MODEL} --stream --backend fastgen + python ./run_benchmark.py --model ${MODEL} --stream --backend vllm done # Extra runs for Mixtral with non-default settings -python ./run_benchmark.py --model mistralai/Mixtral-8x7B-v0.1 --stream --tp_size 4 --mean_prompt_length 500 --mean_max_new_tokens 150 500 1024 -python ./run_benchmark.py --model mistralai/Mixtral-8x7B-v0.1 --stream --tp_size 4 --mean_prompt_length 500 --mean_max_new_tokens 150 500 1024 --vllm \ No newline at end of file +python ./run_benchmark.py --model mistralai/Mixtral-8x7B-v0.1 --stream --tp_size 4 --mean_prompt_length 500 --mean_max_new_tokens 150 500 1024 --backend fastgen +python ./run_benchmark.py --model mistralai/Mixtral-8x7B-v0.1 --stream --tp_size 4 --mean_prompt_length 500 --mean_max_new_tokens 150 500 1024 --backend vllm \ No newline at end of file diff --git a/benchmarks/inference/mii/run_benchmark.py b/benchmarks/inference/mii/run_benchmark.py index 96e88155f..801d45b85 100644 --- a/benchmarks/inference/mii/run_benchmark.py +++ b/benchmarks/inference/mii/run_benchmark.py @@ -20,7 +20,8 @@ def run_benchmark() -> None: args = parse_args(server_args=True, client_args=True) for server_args in get_args_product(args, which=SERVER_PARAMS): - start_server(server_args) + if server_args.backend != "aml": + start_server(server_args) for client_args in get_args_product(server_args, which=CLIENT_PARAMS): if results_exist(client_args) and not args.overwrite_results: @@ -33,7 +34,8 @@ def run_benchmark() -> None: print_summary(client_args, response_details) save_json_results(client_args, response_details) - stop_server(server_args) + if server_args.backend != "aml": + stop_server(server_args) if __name__ == "__main__": diff --git a/benchmarks/inference/mii/run_example.sh b/benchmarks/inference/mii/run_example.sh index e80253828..07af03260 100644 --- a/benchmarks/inference/mii/run_example.sh +++ b/benchmarks/inference/mii/run_example.sh @@ -11,7 +11,8 @@ python ./run_benchmark.py \ --max_ragged_batch_size 768 \ --mean_prompt_length 2600 \ --mean_max_new_tokens 60 \ - --stream + --stream \ + 
--backend fastgen \ ### Gernerate the plots python ./src/plot_th_lat.py diff --git a/benchmarks/inference/mii/src/client.py b/benchmarks/inference/mii/src/client.py index c440d0b63..916fe4f23 100644 --- a/benchmarks/inference/mii/src/client.py +++ b/benchmarks/inference/mii/src/client.py @@ -3,6 +3,7 @@ # DeepSpeed Team +import argparse import asyncio import json import multiprocessing @@ -12,18 +13,30 @@ import requests import threading import time -from typing import List, Iterable +from typing import List, Iterable, Union import numpy as np from transformers import AutoTokenizer -from .postprocess_results import ResponseDetails -from .random_query_generator import RandomQueryGenerator -from .sample_input import all_text -from .utils import parse_args, print_summary, get_args_product, CLIENT_PARAMS +try: + from .postprocess_results import ResponseDetails + from .random_query_generator import RandomQueryGenerator + from .sample_input import all_text + from .utils import parse_args, print_summary, get_args_product, CLIENT_PARAMS +except ImportError: + from postprocess_results import ResponseDetails + from random_query_generator import RandomQueryGenerator + from sample_input import all_text + from utils import parse_args, print_summary, get_args_product, CLIENT_PARAMS -def call_mii(client, input_tokens, max_new_tokens, stream): +def call_fastgen( + input_tokens: str, max_new_tokens: int, args: argparse.Namespace +) -> ResponseDetails: + import mii + + client = mii.client(args.deployment_name) + output_tokens = [] token_gen_time = [] time_last_token = 0 @@ -38,7 +51,7 @@ def callback(response): time_last_token = start_time = time.time() token_gen_time = [] - if stream: + if args.stream: output_tokens = [] client.generate( input_tokens, max_new_tokens=max_new_tokens, streaming_fn=callback @@ -57,7 +70,12 @@ def callback(response): ) -def call_vllm(input_tokens, max_new_tokens, stream=True): +def call_vllm( + input_tokens: str, max_new_tokens: int, args: argparse.Namespace +) -> ResponseDetails: + if not args.stream: + raise NotImplementedError("Not implemented for non-streaming") + api_url = "http://localhost:26500/generate" headers = {"User-Agent": "Benchmark Client"} pload = { @@ -68,7 +86,7 @@ def call_vllm(input_tokens, max_new_tokens, stream=True): "top_p": 0.9, "max_tokens": max_new_tokens, "ignore_eos": False, - "stream": stream, + "stream": args.stream, } def clear_line(n: int = 1) -> None: @@ -90,76 +108,104 @@ def get_streaming_response( yield output, time_now - time_last_token time_last_token = time_now + # For non-streaming, but currently non-streaming is not fully implemented def get_response(response: requests.Response) -> List[str]: data = json.loads(response.content) output = data["text"] return output + token_gen_time = [] start_time = time.time() - response = requests.post(api_url, headers=headers, json=pload, stream=stream) - if stream: - token_gen_time = [] - for h, t in get_streaming_response(response, start_time): - output = h - token_gen_time.append(t) - - return ResponseDetails( - generated_tokens=output, - prompt=input_tokens, - start_time=start_time, - end_time=time.time(), - model_time=0, - token_gen_time=token_gen_time, - ) - else: - output = get_response(response) - raise NotImplementedError("Not implemented for non-streaming") + response = requests.post(api_url, headers=headers, json=pload, stream=args.stream) + for h, t in get_streaming_response(response, start_time): + output = h + token_gen_time.append(t) + + return ResponseDetails( + generated_tokens=output, 
+ prompt=input_tokens, + start_time=start_time, + end_time=time.time(), + model_time=0, + token_gen_time=token_gen_time, + ) + + +def call_aml( + input_tokens: str, max_new_tokens: int, args: argparse.Namespace +) -> ResponseDetails: + if args.stream: + raise NotImplementedError("Not implemented for streaming") + + headers = { + "Content-Type": "application/json", + "Authorization": ("Bearer " + args.aml_api_key), + "azureml-model-deployment": args.deployment_name, + } + pload = { + "input_data": { + "input_string": [ + input_tokens, + ], + "parameters": { + "max_new_tokens": max_new_tokens, + "do_sample": True, + "return_full_text": False, + }, + } + } + + def get_response(response: requests.Response) -> List[str]: + data = json.loads(response.content) + output = data[0]["0"] + return output + + token_gen_time = [] + start_time = time.time() + response = requests.post(args.aml_api_url, headers=headers, json=pload) + output = get_response(response) + + return ResponseDetails( + generated_tokens=output, + prompt=input_tokens, + start_time=start_time, + end_time=time.time(), + model_time=0, + token_gen_time=token_gen_time, + ) def _run_parallel( - deployment_name, - warmup, - barrier, - query_queue, - result_queue, - num_clients, - stream, - vllm, + barrier: Union[threading.Barrier, multiprocessing.Barrier], + query_queue: Union[queue.Queue, multiprocessing.Queue], + result_queue: Union[queue.Queue, multiprocessing.Queue], + args: argparse.Namespace, ): pid = os.getpid() session_id = f"test_session_p{pid}_t{threading.get_ident()}" event_loop = asyncio.new_event_loop() asyncio.set_event_loop(event_loop) - if not vllm: - import mii - client = mii.client(deployment_name) + backend_call_fns = {"fastgen": call_fastgen, "vllm": call_vllm, "aml": call_aml} + call_fn = backend_call_fns[args.backend] barrier.wait() - for _ in range(warmup): + for _ in range(args.warmup): print(f"warmup queue size: {query_queue.qsize()} ({pid})", flush=True) input_tokens, req_max_new_tokens = query_queue.get(timeout=1.0) - - if vllm: - call_vllm(input_tokens, req_max_new_tokens, stream) - else: - call_mii(client, input_tokens, req_max_new_tokens, stream) + _ = call_fn(input_tokens, req_max_new_tokens, args) barrier.wait() - time.sleep(random.uniform(0, num_clients) * 0.01) + time.sleep(random.uniform(0, args.num_clients) * 0.01) try: while not query_queue.empty(): print(f"queue size: {query_queue.qsize()} ({pid})", flush=True) input_tokens, req_max_new_tokens = query_queue.get(timeout=1.0) - # Set max_new_tokens following normal distribution - if vllm: - r = call_vllm(input_tokens, req_max_new_tokens) - else: - r = call_mii(client, input_tokens, req_max_new_tokens, stream) + r = call_fn(input_tokens, req_max_new_tokens, args) result_queue.put(r) except queue.Empty: @@ -180,22 +226,7 @@ def run_client(args): 6. 
The main process marks the end time after receiving `num_requests' results """ - # Unpack arguments - model = args.model - deployment_name = args.deployment_name - mean_prompt_length = args.mean_prompt_length - mean_max_new_tokens = args.mean_max_new_tokens - num_clients = args.num_clients - num_requests = args.num_requests - warmup = args.warmup - max_prompt_length = args.max_prompt_length - prompt_length_var = args.prompt_length_var - max_new_tokens_var = args.max_new_tokens_var - stream = args.stream - vllm = args.vllm - use_thread = args.use_thread - - if use_thread: + if args.use_thread: runnable_cls = threading.Thread barrier_cls = threading.Barrier queue_cls = queue.Queue @@ -204,7 +235,7 @@ def run_client(args): barrier_cls = multiprocessing.Barrier queue_cls = multiprocessing.Queue - barrier = barrier_cls(num_clients + 1) + barrier = barrier_cls(args.num_clients + 1) query_queue = queue_cls() result_queue = queue_cls() @@ -212,34 +243,32 @@ def run_client(args): runnable_cls( target=_run_parallel, args=( - deployment_name, - warmup, barrier, query_queue, result_queue, - num_clients, - stream, - vllm, + args, ), ) - for i in range(num_clients) + for i in range(args.num_clients) ] for p in processes: p.start() - tokenizer = AutoTokenizer.from_pretrained(model) + tokenizer = AutoTokenizer.from_pretrained(args.model) query_generator = RandomQueryGenerator(all_text, tokenizer, seed=42) request_text = query_generator.get_random_request_text( - mean_prompt_length, - mean_prompt_length * prompt_length_var, - max_prompt_length, - num_requests + warmup * num_clients, + args.mean_prompt_length, + args.mean_prompt_length * args.prompt_length_var, + args.max_prompt_length, + args.num_requests + args.warmup * args.num_clients, ) for t in request_text: + # Set max_new_tokens following normal distribution req_max_new_tokens = int( np.random.normal( - mean_max_new_tokens, max_new_tokens_var * mean_max_new_tokens + args.mean_max_new_tokens, + args.max_new_tokens_var * args.mean_max_new_tokens, ) ) query_queue.put((t, req_max_new_tokens)) @@ -252,10 +281,10 @@ def run_client(args): barrier.wait() response_details = [] - while len(response_details) < num_requests: + while len(response_details) < args.num_requests: res = result_queue.get() # vLLM returns concatinated tokens - if vllm: + if args.backend == "vllm": all_tokens = tokenizer.tokenize(res.generated_tokens) res.generated_tokens = all_tokens[len(tokenizer.tokenize(res.prompt)) :] response_details.append(res) diff --git a/benchmarks/inference/mii/src/defaults.py b/benchmarks/inference/mii/src/defaults.py index 79ce91c97..89255dfa6 100644 --- a/benchmarks/inference/mii/src/defaults.py +++ b/benchmarks/inference/mii/src/defaults.py @@ -4,6 +4,8 @@ # DeepSpeed Team ARG_DEFAULTS = { + "model": "meta-llama/Llama-2-7b-hf", + "deployment_name": "benchmark-deployment", "tp_size": 1, "max_ragged_batch_size": 768, "num_replicas": 1, diff --git a/benchmarks/inference/mii/src/postprocess_results.py b/benchmarks/inference/mii/src/postprocess_results.py index 7e25bfddc..4260f1341 100644 --- a/benchmarks/inference/mii/src/postprocess_results.py +++ b/benchmarks/inference/mii/src/postprocess_results.py @@ -79,13 +79,21 @@ def get_summary(args, response_details): for r in response_details ] ) - first_token_latency = mean([r.token_gen_time[0] for r in response_details]) - token_gen_latency_flat = reduce( - list.__add__, - [r.token_gen_time[1:-1] for r in response_details if len(r.token_gen_time) > 2], - ) - token_gen_latency = mean([t for t in 
token_gen_latency_flat]) + # For non-streaming results, we don't have any token_gen_time information + first_token_latency = 0.0 + token_gen_latency = 0.0 + if response_details[0].token_gen_time: + first_token_latency = mean([r.token_gen_time[0] for r in response_details]) + token_gen_latency_flat = reduce( + list.__add__, + [ + r.token_gen_time[1:-1] + for r in response_details + if len(r.token_gen_time) > 2 + ], + ) + token_gen_latency = mean([t for t in token_gen_latency_flat]) return ProfilingSummary( throughput, latency, token_gen_latency, first_token_latency, tokens_per_sec diff --git a/benchmarks/inference/mii/src/server.py b/benchmarks/inference/mii/src/server.py index d0ecabaf3..ec04338b5 100644 --- a/benchmarks/inference/mii/src/server.py +++ b/benchmarks/inference/mii/src/server.py @@ -3,37 +3,28 @@ # DeepSpeed Team +import argparse import subprocess import time -import mii -from deepspeed.inference import RaggedInferenceEngineConfig, DeepSpeedTPConfig -from deepspeed.inference.v2.ragged import DSStateManagerConfig -from .utils import parse_args, SERVER_PARAMS +try: + from .utils import parse_args, SERVER_PARAMS +except ImportError: + from utils import parse_args, SERVER_PARAMS -def start_server(args): - vllm = args.vllm - model = args.model - deployment_name = args.deployment_name - tp_size = args.tp_size - num_replicas = args.num_replicas - max_ragged_batch_size = args.max_ragged_batch_size - - if vllm: - start_vllm_server(model=model, tp_size=tp_size) - else: - start_mii_server( - model=model, - deployment_name=deployment_name, - tp_size=tp_size, - num_replicas=num_replicas, - max_ragged_batch_size=max_ragged_batch_size, - ) +def start_server(args: argparse.Namespace) -> None: + start_server_fns = { + "fastgen": start_fastgen_server, + "vllm": start_vllm_server, + "aml": start_aml_server, + } + start_fn = start_server_fns[args.backend] + start_fn(args) -def start_vllm_server(model: str, tp_size: int) -> None: +def start_vllm_server(args: argparse.Namespace) -> None: vllm_cmd = ( "python", "-m", @@ -43,9 +34,9 @@ def start_vllm_server(model: str, tp_size: int) -> None: "--port", "26500", "--tensor-parallel-size", - str(tp_size), + str(args.tp_size), "--model", - model, + args.model, ) p = subprocess.Popen( vllm_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, close_fds=True @@ -67,45 +58,61 @@ def start_vllm_server(model: str, tp_size: int) -> None: time.sleep(0.01) -def start_mii_server( - model, deployment_name, tp_size, num_replicas, max_ragged_batch_size -): - tp_config = DeepSpeedTPConfig(tp_size=tp_size) +def start_fastgen_server(args: argparse.Namespace) -> None: + import mii + from deepspeed.inference import RaggedInferenceEngineConfig, DeepSpeedTPConfig + from deepspeed.inference.v2.ragged import DSStateManagerConfig + + tp_config = DeepSpeedTPConfig(tp_size=args.tp_size) mgr_config = DSStateManagerConfig( - max_ragged_batch_size=max_ragged_batch_size, - max_ragged_sequence_count=max_ragged_batch_size, + max_ragged_batch_size=args.max_ragged_batch_size, + max_ragged_sequence_count=args.max_ragged_batch_size, ) inference_config = RaggedInferenceEngineConfig( tensor_parallel=tp_config, state_manager=mgr_config ) mii.serve( - model, - deployment_name=deployment_name, - tensor_parallel=tp_size, + args.model, + deployment_name=args.deployment_name, + tensor_parallel=args.tp_size, inference_engine_config=inference_config, - replica_num=num_replicas, + replica_num=args.num_replicas, ) -def stop_server(args): - vllm = args.vllm - deployment_name = args.deployment_name 
+def start_aml_server(args: argparse.Namespace) -> None: + raise NotImplementedError( + "AML server start not implemented. Please use Azure Portal to start the server." + ) - if vllm: - stop_vllm_server() - else: - stop_mii_server(deployment_name) +def stop_server(args: argparse.Namespace) -> None: + stop_server_fns = { + "fastgen": stop_fastgen_server, + "vllm": stop_vllm_server, + "aml": stop_aml_server, + } + stop_fn = stop_server_fns[args.backend] + stop_fn(args) -def stop_vllm_server(): + +def stop_vllm_server(args: argparse.Namespace) -> None: vllm_cmd = ("pkill", "-f", "vllm.entrypoints.api_server") p = subprocess.Popen(vllm_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) p.wait() -def stop_mii_server(deployment_name): - mii.client(deployment_name).terminate_server() +def stop_fastgen_server(args: argparse.Namespace) -> None: + import mii + + mii.client(args.deployment_name).terminate_server() + + +def stop_aml_server(args: argparse.Namespace) -> None: + raise NotImplementedError( + "AML server stop not implemented. Please use Azure Portal to stop the server." + ) if __name__ == "__main__": diff --git a/benchmarks/inference/mii/src/utils.py b/benchmarks/inference/mii/src/utils.py index 6499a54b4..ca28cb818 100644 --- a/benchmarks/inference/mii/src/utils.py +++ b/benchmarks/inference/mii/src/utils.py @@ -14,14 +14,20 @@ from pathlib import Path from typing import Iterator, List -from .defaults import ARG_DEFAULTS, MODEL_DEFAULTS -from .postprocess_results import get_summary, ResponseDetails +try: + from .defaults import ARG_DEFAULTS, MODEL_DEFAULTS + from .postprocess_results import get_summary, ResponseDetails +except ImportError: + from defaults import ARG_DEFAULTS, MODEL_DEFAULTS + from postprocess_results import get_summary, ResponseDetails # For these arguments, users can provide multiple values when running the # benchmark. The benchmark will iterate over all possible combinations. SERVER_PARAMS = ["tp_size", "max_ragged_batch_size", "num_replicas"] CLIENT_PARAMS = ["mean_prompt_length", "mean_max_new_tokens", "num_clients"] +AML_REQUIRED_PARAMS = ["aml_api_url", "aml_api_key", "deployment_name", "model"] + def parse_args( server_args: bool = False, client_args: bool = False @@ -46,7 +52,7 @@ def parse_args( type=int, nargs="+", default=None, - help="Number of MII model replicas", + help="Number of FastGen model replicas", ) server_parser.add_argument( "cmd", @@ -112,6 +118,18 @@ def parse_args( default="./results/", help="Directory to save result JSON files", ) + client_parser.add_argument( + "--aml_api_url", + type=str, + default=None, + help="When using the AML backend, this is the API URL that points to an AML endpoint", + ) + client_parser.add_argument( + "--aml_api_key", + type=str, + default=None, + help="When using the AML backend, this is the API key for a given aml_api_url", + ) # Create the parser, inheriting from the server and/or client parsers parents = [] @@ -123,15 +141,21 @@ def parse_args( # Common args parser = argparse.ArgumentParser(parents=parents) parser.add_argument( - "--model", type=str, default="meta-llama/Llama-2-7b-hf", help="Model name" + "--model", type=str, default=None, help="HuggingFace.co model name" ) parser.add_argument( "--deployment_name", type=str, - default="mii-benchmark-deployment", - help="Deployment name for MII server", + default=None, + help="When using FastGen backend, specifies which model deployment to use. 
When using AML backend, specifies the name of the deployment", + ) + parser.add_argument( + "--backend", + type=str, + choices=["aml", "fastgen", "vllm"], + default="fastgen", + help="Which backend to benchmark", ) - parser.add_argument("--vllm", action="store_true", help="Use VLLM instead of MII") parser.add_argument( "--overwrite_results", action="store_true", help="Overwrite existing results" ) @@ -139,6 +163,12 @@ def parse_args( # Parse arguments args = parser.parse_args() + # Verify that AML required parameters are defined before filling in defaults + if args.backend == "aml": + for k in AML_REQUIRED_PARAMS: + if getattr(args, k) is None: + raise ValueError(f"AML backend requires {k} to be specified") + # Set default values for model-specific parameters if args.model in MODEL_DEFAULTS: for k, v in MODEL_DEFAULTS[args.model].items(): @@ -150,8 +180,9 @@ def parse_args( if hasattr(args, k) and getattr(args, k) is None: setattr(args, k, v) + # If we are not running the benchmark, we need to make sure to only have one + # value for the server args if server_args and not client_args: - # If we are not running the benchmark, we need to make sure to only have one value for the server args for k in SERVER_PARAMS: if not isinstance(getattr(args, k), int): setattr(args, k, getattr(args, k)[0]) @@ -176,13 +207,9 @@ def get_args_product( def get_results_path(args: argparse.Namespace) -> Path: - if args.vllm: - lib_path = "vllm" - else: - lib_path = "fastgen" return Path( args.out_json_dir, - f"{lib_path}/", + f"{args.backend}/", "-".join( ( args.model.replace("/", "_"), From ffb8a4b8b090195a530c28b6a0881c8dc669eb85 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Fri, 1 Mar 2024 14:49:22 -0800 Subject: [PATCH 28/58] catch AML error response, add aml script (#869) --- benchmarks/inference/mii/run_aml.sh | 20 ++++++++++++++++++++ benchmarks/inference/mii/src/client.py | 6 +++++- 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 benchmarks/inference/mii/run_aml.sh diff --git a/benchmarks/inference/mii/run_aml.sh b/benchmarks/inference/mii/run_aml.sh new file mode 100644 index 000000000..90ad50e2c --- /dev/null +++ b/benchmarks/inference/mii/run_aml.sh @@ -0,0 +1,20 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# Run benchmark against AML endpoint +python ./run_benchmark.py \ + --model \ + --deployment_name \ + --aml_api_url \ + --aml_api_key \ + --mean_prompt_length 2600 \ + --mean_max_new_tokens 60 \ + --num_requests 256 \ + --backend aml + +### Gernerate the plots +python ./src/plot_th_lat.py + +echo "Find figures in ./plots/ and log outputs in ./results/" \ No newline at end of file diff --git a/benchmarks/inference/mii/src/client.py b/benchmarks/inference/mii/src/client.py index 916fe4f23..c0fd6a767 100644 --- a/benchmarks/inference/mii/src/client.py +++ b/benchmarks/inference/mii/src/client.py @@ -163,7 +163,11 @@ def get_response(response: requests.Response) -> List[str]: token_gen_time = [] start_time = time.time() response = requests.post(args.aml_api_url, headers=headers, json=pload) - output = get_response(response) + # Sometimes the AML endpoint will return an error, so we send the request again + try: + output = get_response(response) + except Exception as e: + return call_aml(input_tokens, max_new_tokens, args) return ResponseDetails( generated_tokens=output, From b7ec5c3268c0f21f79055938a3efc8b1d35f47b8 Mon Sep 17 00:00:00 2001 From: Lev Kurilenko <113481193+lekurile@users.noreply.github.com> Date: Wed, 6 Mar 2024 10:46:16 -0800 Subject: [PATCH 29/58] Remove AML key from args dict when saving results (#870) This PR removes the aml_api_key from the output results files generated by MII inference benchmarks. --- benchmarks/inference/mii/src/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/inference/mii/src/utils.py b/benchmarks/inference/mii/src/utils.py index ca28cb818..21d152fae 100644 --- a/benchmarks/inference/mii/src/utils.py +++ b/benchmarks/inference/mii/src/utils.py @@ -245,6 +245,9 @@ def save_json_results( args: argparse.Namespace, response_details: List[ResponseDetails] ) -> None: args_dict = vars(args) + # Remove AML key from args dictionary + if "aml_api_key" in args_dict: + args_dict["aml_api_key"] = None out_json_path = get_results_path(args) os.makedirs(out_json_path.parent, exist_ok=True) From 6e9ada6b53ed782823f19b448713edea584486b9 Mon Sep 17 00:00:00 2001 From: Lev Kurilenko <113481193+lekurile@users.noreply.github.com> Date: Wed, 6 Mar 2024 11:57:21 -0800 Subject: [PATCH 30/58] Update Inference Benchmarking Scripts - Support AML (#868) This PR fixes/updates the inference benchmarking analysis scripts to support [fastgen, vllm, aml] backends. The scripts are generalized to support models beyond just Llama, which was hardcoded in the scripts previously. A number of bugs and formatting issues are also resolved. 
The scripts that were fixed/updated are: plot_effective_throughput.py plot_latency_percentile.py plot_repl_scale.py plot_th_lat.py plot_tp_sizes.py --- benchmarks/inference/mii/README.md | 11 +- benchmarks/inference/mii/requirements.txt | 3 +- .../mii/src/plot_effective_throughput.py | 164 +++++++----------- .../mii/src/plot_latency_percentile.py | 131 +++++++------- .../inference/mii/src/plot_repl_scale.py | 108 +++++++----- benchmarks/inference/mii/src/plot_th_lat.py | 114 +++++++----- benchmarks/inference/mii/src/plot_tp_sizes.py | 98 ++++++----- .../inference/mii/src/postprocess_results.py | 36 ++++ benchmarks/inference/mii/src/server.py | 4 +- 9 files changed, 357 insertions(+), 312 deletions(-) diff --git a/benchmarks/inference/mii/README.md b/benchmarks/inference/mii/README.md index 6d20be752..e5b43f1c3 100644 --- a/benchmarks/inference/mii/README.md +++ b/benchmarks/inference/mii/README.md @@ -52,12 +52,19 @@ Results are collected in `./results/`. ## Analyze the Benchmark Results The scripts mentioned below were used for generating the plots featured in our -blog. Specify the root directory for log files using `--log_dir`. The generated +blog. Specify the root directory for log files using `--log_dir` and the backends you wish to run for, e.g. `--backend vllm fastgen aml`. The generated figures will be saved to `./plots/` - `src/plot_th_lat.py`: This script generates charts for throughput and latency across different model sizes and client counts. - `src/plot_effective_throughput.py`: Use this to chart effective throughput. - `src/plot_latency_percentile.py`: This script will plot the 50th, 90th, and 95th percentile latencies. +- `src/plot_repl_scale.py`: This script will plot the throughput and number of replicas for a fixed clients/replica per plot. +- `src/plot_tp_sizes.py`: This script will plot latency and TFLOPs per GPU across different tensor parallelism sizes. + +The following command shows an example of `plot_th_lat.py` execution using the `vllm`, `fastgen`, and `aml` backends. +```bash +DeepSpeedExamples/benchmarks/inference/mii$ python3 src/plot_th_lat.py --backend vllm fastgen aml --log_dir results/ +``` ## Running an End-to-End Example @@ -76,4 +83,4 @@ bash run_example.sh
*Figure 1: Throughput-latency curve and effective throughput of Llama 2 7b using A6000. Runs the client with 60 generation steps and input prompt length of 2600.*
- \ No newline at end of file + diff --git a/benchmarks/inference/mii/requirements.txt b/benchmarks/inference/mii/requirements.txt index 7ac014ef8..9f338ace5 100644 --- a/benchmarks/inference/mii/requirements.txt +++ b/benchmarks/inference/mii/requirements.txt @@ -2,4 +2,5 @@ transformers matplotlib deepspeed-mii>=0.2.0 vllm>=0.2.7 -numpy \ No newline at end of file +numpy +tabulate diff --git a/benchmarks/inference/mii/src/plot_effective_throughput.py b/benchmarks/inference/mii/src/plot_effective_throughput.py index efa471c76..196f70211 100644 --- a/benchmarks/inference/mii/src/plot_effective_throughput.py +++ b/benchmarks/inference/mii/src/plot_effective_throughput.py @@ -10,33 +10,18 @@ import numpy as np import pandas as pd -from .postprocess_results import read_json, get_tokenizer - -RAGGED_BATCH_SIZE = 768 -SLA_PROMPT_TOKENS_PER_SEC = 512 -SLA_GEN_TOKENS_PER_SEC = [1, 2, 3, 4, 6, 8] -EMA_SPAN = 16 - -tp_sizes_all = {"7b": [1], "70b": [4, 8]} - -tp_sizes_test = {"7b": [1]} - -prompt_gen_pairs_all = [ - (1200, 60), - (1200, 128), - (2600, 60), - (2600, 128), -] - -prompt_gen_pairs_test = [(2600, 60)] +from postprocess_results import read_json, get_tokenizer, get_result_sets def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--test", action="store_true") - parser.add_argument("--no_vllm", action="store_true") - parser.add_argument("--log_dir", type=Path, default=".") - parser.add_argument("--out_dir", type=Path, default="charts/goodtput") + parser.add_argument("--backend", type=str, choices=["fastgen", "vllm"], default=["fastgen", "vllm"], \ + nargs="+", help="Specify the backends to generate plots for") + parser.add_argument("--log_dir", type=Path, default="./results") + parser.add_argument("--out_dir", type=Path, default="./plots/goodtput") + parser.add_argument("--sla_prompt_tokens_per_sec", type=int, default=512, help="SLA prompt tokens per second") + parser.add_argument("--sla_gen_tokens_per_sec", type=int, default=[1, 2, 3, 4, 6, 8], nargs="+", help="SLA generation tokens/s targets") + parser.add_argument("--ema_span", type=int, default=16, help="EMA span") args = parser.parse_args() return args @@ -90,10 +75,10 @@ def validate_token_ema_latency_SLA(response_detail, sla_token_gen, ema_span): return all([t < 1.0 / sla_token_gen for t in ema_latency]) -def validate_prompt_latency_SLA(response_detail, sla_token_gen, f): +def validate_prompt_latency_SLA(response_detail, sla_token_gen, f, sla_prompt_tokens_per_sec ): tokenizer = get_tokenizer() prompt_length = len(tokenizer.tokenize(response_detail.prompt)) - prompt_latency_SLA = prompt_length / SLA_PROMPT_TOKENS_PER_SEC + prompt_latency_SLA = prompt_length / sla_prompt_tokens_per_sec if prompt_latency_SLA < response_detail.token_gen_time[0]: return False @@ -109,19 +94,19 @@ def calc_throughput(response_details): return len(response_details) / (end_time - start_time) -def extract_values(file_pattern, sla_token_gen, validate_func): +def extract_values(file_pattern, sla_token_gen, validate_func, sla_prompt_tokens_per_sec): files = glob.glob(file_pattern) print(f"Found {len(files)} files") goodputs = {} good_ratios = {} for f in files: prof_args, response_details = read_json(f) - client_num = prof_args["client_num"] + client_num = prof_args["num_clients"] num_req_ok = len( [ r for r in response_details - if validate_prompt_latency_SLA(r, sla_token_gen, validate_func) + if validate_prompt_latency_SLA(r, sla_token_gen, validate_func, sla_prompt_tokens_per_sec) ] ) goodputs[client_num] = 
calc_throughput(response_details) * ( @@ -132,7 +117,7 @@ def extract_values(file_pattern, sla_token_gen, validate_func): return goodputs, good_ratios -def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out_dir): +def output_charts(args, model, tp_size, bs, replicas, sla_token_gen, prompt, gen, log_dir, out_dir): if not log_dir.exists(): print(f"Log directory {log_dir} does not exist") return @@ -141,92 +126,63 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out out_dir.mkdir(parents=True, exist_ok=True) print( - f"model: {model_size} Prompt: {prompt}, Generation: {gen}, TP: {tp} sla_token_gen: {sla_token_gen}" + f"Model: {model} Prompt: {prompt}, Generation: {gen}, TP: {tp_size} sla_token_gen: {sla_token_gen}" ) - mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}/llama2-{model_size}-tp{tp}-b{bs}_c*_p{prompt}_g{gen}.json" - if not args.no_vllm: - vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json" + result_file_pattern = f"{model}-tp{tp_size}-bs{bs}-replicas{replicas}-prompt{prompt}-gen{gen}-clients*.json" validate_funcs = [ (validate_token_cum_latency_SLA, (), "cum"), - (validate_token_ema_latency_SLA, (EMA_SPAN,), f"ema{EMA_SPAN}"), + (validate_token_ema_latency_SLA, (args.ema_span,), f"ema{args.ema_span}"), ] - for f in validate_funcs: + plt_cfg = {'vllm': {'label': 'vLLM', 'marker': 'x', 'color': 'orange'},\ + 'fastgen': {'label': 'DeepSpeed-FastGen', 'marker': 'o', 'color': 'blue'}} - mii_goodputs, mii_good_ratios = extract_values( - mii_file_pattern, sla_token_gen, f - ) - client_num_list = sorted(list(mii_goodputs.keys())) - mii_goodputs_list = [mii_goodputs[client_num] for client_num in client_num_list] + for f in validate_funcs: + plt.figure() - if not args.no_vllm: - vllm_goodputs, vllm_good_ratios = extract_values( - vllm_file_pattern, sla_token_gen, f + for backend in args.backend: + file_pattern = f"{log_dir}/{backend}/{result_file_pattern}" + goodputs, good_ratios = extract_values( + file_pattern, sla_token_gen, f, args.sla_prompt_tokens_per_sec ) - vllm_goodputs_list = [ - vllm_goodputs[client_num] for client_num in client_num_list - ] + client_num_list = sorted(list(goodputs.keys())) + goodputs_list = [goodputs[client_num] for client_num in client_num_list] - # print(f"MII {mii_goodputs_list} ratio={mii_good_ratios}") - # print(f"vLLM {vllm_goodputs_list} ratio={vllm_good_ratios}") - - # Plotting the scatter plot - plt.figure(figsize=(7, 4)) - plt.scatter( - client_num_list, - mii_goodputs_list, - label=f"DeepSpeed-FastGen", - marker="o", - color="blue", - ) - if not args.no_vllm: + # Plotting the scatter plot plt.scatter( client_num_list, - vllm_goodputs_list, - label=f"vLLM", - marker="x", - color="orange", + goodputs_list, + label=plt_cfg[backend]['label'], + marker=plt_cfg[backend]['marker'], + color=plt_cfg[backend]['color'], ) - fit_x_list = np.arange(min(client_num_list), max(client_num_list), 0.1) - mii_fit_model = np.polyfit(client_num_list, mii_goodputs_list, 4) - mii_model_fn = np.poly1d(mii_fit_model) - plt.plot( - fit_x_list, - mii_model_fn(fit_x_list), - color="blue", - alpha=0.5, - linestyle="--", - ) - - if not args.no_vllm: - vllm_fit_model = np.polyfit(client_num_list, vllm_goodputs_list, 4) - vllm_model_fn = np.poly1d(vllm_fit_model) + fit_x_list = np.arange(min(client_num_list), max(client_num_list), 0.1) + fit_model = np.polyfit(client_num_list, goodputs_list, 4) + model_fn = np.poly1d(fit_model) 
plt.plot( fit_x_list, - vllm_model_fn(fit_x_list), - color="orange", + model_fn(fit_x_list), alpha=0.5, linestyle="--", + color=plt_cfg[backend]['color'], ) title = ( - f"Effective throughput (SLA prompt: {SLA_PROMPT_TOKENS_PER_SEC} tokens/s, generation: {sla_token_gen} tokens/s)\n" - + f"Llama 2 {model_size.upper()} Prompt: {prompt}, Generation: {gen}, TP: {tp}" + f"Effective throughput (SLA prompt: {args.sla_prompt_tokens_per_sec} tokens/s, generation: {sla_token_gen} tokens/s)\n" + + f"Model: {model} Prompt: {prompt}, Generation: {gen}, TP: {tp_size}" ) plt.title(title, fontsize=10) plt.xlabel("Number of clients", fontsize=10) plt.ylabel("Effective throughput (queries/s)", fontsize=10) - # plt.rcParams['figure.subplot.bottom'] = 0.30 plt.ylim(bottom=-0.05) plt.legend() plt.grid(True) - # plt.show() out_file = ( out_dir - / f"goodput_llama{model_size}_SLAp{SLA_PROMPT_TOKENS_PER_SEC}g{sla_token_gen}_tp{tp}_b{bs}_p{prompt}g{gen}_{f[2]}.png" + / f"{model}_SLAp{args.sla_prompt_tokens_per_sec}g{sla_token_gen}_tp{tp_size}_b{bs}_p{prompt}g{gen}_{f[2]}.png" ) plt.savefig(out_file) plt.clf() @@ -234,27 +190,23 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out if __name__ == "__main__": - raise NotImplementedError("This script is not up to date") args = get_args() - if args.test: - tp_sizes = tp_sizes_test - prompt_gen_pairs = prompt_gen_pairs_test - else: - tp_sizes = tp_sizes_all - prompt_gen_pairs = prompt_gen_pairs_all - - for model_size, tps in tp_sizes.items(): - for tp in tps: - for prompt, gen in prompt_gen_pairs: - for sla_token_gen in SLA_GEN_TOKENS_PER_SEC: - display_results( - model_size, - tp, - RAGGED_BATCH_SIZE, - sla_token_gen, - prompt, - gen, - args.log_dir, - args.out_dir, - ) + assert "aml" not in args.backend, "Effective throughput analysis is not supported for AML." 
+ + result_params = get_result_sets(args) + + for model, tp_size, bs, replicas, prompt, gen in result_params: + for sla_token_gen in args.sla_gen_tokens_per_sec: + output_charts( + args=args, + model=model, + tp_size=tp_size, + bs=bs, + replicas=replicas, + sla_token_gen=sla_token_gen, + prompt=prompt, + gen=gen, + log_dir=args.log_dir, + out_dir=args.out_dir, + ) diff --git a/benchmarks/inference/mii/src/plot_latency_percentile.py b/benchmarks/inference/mii/src/plot_latency_percentile.py index 9b08f12da..daeb8cc5a 100644 --- a/benchmarks/inference/mii/src/plot_latency_percentile.py +++ b/benchmarks/inference/mii/src/plot_latency_percentile.py @@ -5,56 +5,52 @@ import argparse import glob +import re +import os from pathlib import Path import matplotlib.pyplot as plt import numpy as np import itertools -from .postprocess_results import read_json, get_token_latency - -bs = 768 -SKIP_HEAD_TOKEN_NUM = 2 -SKIP_REQUEST_NUM = 100 - -tp_sizes = { - "70b": [4], -} - -prompt_gen_pairs = [ - (2600, 128), -] - +from postprocess_results import read_json, get_token_latency, get_result_sets def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--log_dir", type=Path, default=".") + parser.add_argument("--backend", type=str, choices=["fastgen", "vllm"], default=["fastgen", "vllm"], \ + nargs="+", help="Specify the backends to generate plots for") + parser.add_argument("--log_dir", type=Path, default="./results") parser.add_argument( - "--out_dir", type=Path, default="charts/percentile_token_latency" + "--out_dir", type=Path, default="./plots/percentile_token_latency" ) + parser.add_argument("--skip_head_token_num", type=int, default=1, help="Specify number of head tokens to skip") + parser.add_argument("--skip_request_num", type=int, default=1, help="Specify number of requests to skip") args = parser.parse_args() return args -def extract_values(file_pattern): +def extract_values(args, file_pattern): files = glob.glob(file_pattern) + print(f"Found {len(files)}") + print("\n".join(files)) + latencies = {} for f in files: prof_args, response_details = read_json(f) - client_num = prof_args["client_num"] + client_num = prof_args["num_clients"] response_details.sort(key=lambda r: r.start_time) - response_details = response_details[SKIP_REQUEST_NUM:-SKIP_REQUEST_NUM] + + response_details = response_details[args.skip_request_num:-args.skip_request_num] token_latencies = [ - r.token_gen_time[SKIP_HEAD_TOKEN_NUM:-1] for r in response_details + r.token_gen_time[args.skip_head_token_num:-1] for r in response_details ] - flat_latency_list = list(itertools.chain(*token_latencies)) latencies[client_num] = flat_latency_list return latencies -def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): +def output_charts(args, model, tp_size, bs, replicas, prompt, gen, log_dir, out_dir): if not log_dir.exists(): print(f"Log directory {log_dir} does not exist") return @@ -62,65 +58,70 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): if not out_dir.exists(): out_dir.mkdir(parents=True, exist_ok=True) - mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}/llama2-{model_size}-tp{tp}-b{bs}_c*_p{prompt}_g{gen}.json" - vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json" + result_file_pattern = f"{model}-tp{tp_size}-bs{bs}-replicas{replicas}-prompt{prompt}-gen{gen}-clients*.json" - mii_latencies = extract_values(mii_file_pattern) - vllm_latencies = extract_values(vllm_file_pattern) - 
client_num_list = sorted(list(mii_latencies.keys())) - - for client_num in client_num_list: - plt.figure(figsize=(6, 4)) + plt_cfg = {'vllm': {'bar_x': [1, 2.5, 4], 'label': 'vLLM', 'color': 'orange'},\ + 'fastgen': {'bar_x': [1.3, 2.8, 4.3], 'label': 'DeepSpeed-FastGen', 'color': 'blue'}} + latencies = {} + client_num_dict = {} + for backend in args.backend: + file_pattern = f"{log_dir}/{backend}/{result_file_pattern}" + latencies[backend] = extract_values(args, file_pattern) + client_num_dict[backend] = set(sorted(list(latencies[backend].keys()))) + + # Intersection of clients across all backends + client_num_set = set() + for backend in args.backend: + if not client_num_set: + client_num_set = client_num_dict[backend] + else: + client_num_set = client_num_set.intersection(client_num_dict[backend]) + + for client_num in client_num_set: + plt.figure() percentile = 95 - P50_vllm_val = np.percentile(vllm_latencies[client_num], 50) - P50_mii_val = np.percentile(mii_latencies[client_num], 50) - P90_vllm_val = np.percentile(vllm_latencies[client_num], 90) - P90_mii_val = np.percentile(mii_latencies[client_num], 90) - P95_vllm_val = np.percentile(vllm_latencies[client_num], 95) - P95_mii_val = np.percentile(mii_latencies[client_num], 95) - - # print(f"P50_vllm_val={P50_vllm_val}") - # print(f"P50_mii_val={P50_mii_val}") - # print(f"P90_vllm_val={P90_vllm_val}") - # print(f"P90_mii_val={P90_mii_val}") - # print(f"P95_vllm_val={P95_vllm_val}") - # print(f"P95_mii_val={P95_mii_val}") + for backend in args.backend: + print(f"Generating data for plot, {backend=}") + P50_val = np.percentile(latencies[backend][client_num], 50) + P90_val = np.percentile(latencies[backend][client_num], 90) + P95_val = np.percentile(latencies[backend][client_num], 95) + y = [P50_val, P90_val, P95_val] + plt.bar(plt_cfg[backend]['bar_x'], y, width=0.3, label=plt_cfg[backend]['label'], align="center", color=plt_cfg[backend]['color']) out_file = ( out_dir - / f"p{percentile}_token_latency_llama{model_size}_c{client_num}_tp{tp}_p{prompt}g{gen}.png" + / f"p{percentile}_token_latency_{model}_c{client_num}_tp{tp_size}_p{prompt}g{gen}.png" ) - x1 = [1, 2, 3] - y1 = [P50_vllm_val, P90_vllm_val, P95_vllm_val] - - x2 = [1.3, 2.3, 3.3] - y2 = [P50_mii_val, P90_mii_val, P95_mii_val] - - label_x = ["P50", "P90", "P95"] - - plt.bar(x1, y1, width=0.3, label="vLLM", align="center", color="orange") - plt.bar( - x2, y2, width=0.3, label="DeepSpeed-FastGen", align="center", color="blue" - ) - plt.ylabel("Latency", fontsize=14) + plt.ylabel("Latency (s)", fontsize=14) plt.legend(loc=2) - plt.xticks([1.15, 2.15, 3.15], label_x) + label_x = ["P50", "P90", "P95"] + plt.xticks([1, 2.5, 4], label_x) + plt.title(f"Model: {model}, Clients: {client_num}, Prompt: {prompt}, Gen: {gen}, TP: {tp_size}") plt.savefig(out_file) print(f"Saved {out_file}") if __name__ == "__main__": - raise NotImplementedError("This script is not up to date") args = get_args() - for model_size, tps in tp_sizes.items(): - for tp in tps: - for prompt, gen in prompt_gen_pairs: - output_charts( - model_size, tp, bs, prompt, gen, args.log_dir, args.out_dir - ) + assert "aml" not in args.backend, "Percentile latency analysis is not supported for AML." 
+ + result_params = get_result_sets(args) + + for model, tp_size, bs, replicas, prompt, gen in result_params: + output_charts( + args=args, + model=model, + tp_size=tp_size, + bs=bs, + replicas=replicas, + prompt=prompt, + gen=gen, + log_dir=args.log_dir, + out_dir=args.out_dir, + ) diff --git a/benchmarks/inference/mii/src/plot_repl_scale.py b/benchmarks/inference/mii/src/plot_repl_scale.py index 7791be0ca..074bfb81a 100644 --- a/benchmarks/inference/mii/src/plot_repl_scale.py +++ b/benchmarks/inference/mii/src/plot_repl_scale.py @@ -8,26 +8,18 @@ import argparse from pathlib import Path import numpy as np +from collections import defaultdict -from .postprocess_results import read_json, get_summary - -bs = 768 - -REPLICA_NUMS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] - -tp_sizes = { - "70b": [4], -} - -prompt_gen_pairs = [ - (2600, 60), -] - +from postprocess_results import read_json, get_summary, get_result_sets def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--log_dir", type=Path, default=".") - parser.add_argument("--out_dir", type=Path, default="charts/repl_scale") + parser.add_argument("--backend", type=str, choices=["fastgen"], default=["fastgen"], \ + nargs=1, help="Specify the single backend to generate plots for") + parser.add_argument("--clients_per_replica", type=int, required=False, default=None, help="Optional \ + argument to specify explicit clients/replica to generate plot for") + parser.add_argument("--log_dir", type=Path, default="./results") + parser.add_argument("--out_dir", type=Path, default="./plots/repl_scale") args = parser.parse_args() return args @@ -41,14 +33,14 @@ def extract_values(file_pattern): for f in files: prof_args, response_details = read_json(f) summary = get_summary(prof_args, response_details) - clients.append(prof_args["client_num"]) + clients.append(prof_args["num_clients"]) throughputs.append(summary.throughput) latencies.append(summary.latency) return clients, throughputs, latencies -def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): +def output_charts(args, model, tp_size, bs, replica_nums, prompt, gen, log_dir, out_dir): if not log_dir.exists(): print(f"Log directory {log_dir} does not exist") return @@ -57,8 +49,9 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): out_dir.mkdir(parents=True, exist_ok=True) throughputs = {} - for repl in REPLICA_NUMS: - mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}_repl{repl}/llama2-{model_size}-tp{tp}-b{bs}_repl{repl}_c*_p{prompt}_g{gen}.json" + for repl in replica_nums: + result_file_pattern = f"{model}-tp{tp_size}-bs{bs}-replicas{repl}-prompt{prompt}-gen{gen}-clients*.json" + mii_file_pattern = f"{log_dir}/fastgen/{result_file_pattern}" print(f"Looking for {mii_file_pattern}") clients, mii_throughputs, mii_latencies = extract_values(mii_file_pattern) @@ -70,36 +63,55 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): throughputs[client_per_repl].append(th) for c in throughputs: - - # Plotting the scatter plot - plt.figure(figsize=(6, 4)) - - plt.bar(REPLICA_NUMS, throughputs[c], color="blue", alpha=0.9) - - fit_x_list = np.arange(min(REPLICA_NUMS), max(REPLICA_NUMS), 0.1) - mii_fit_model = np.polyfit(REPLICA_NUMS, throughputs[c], 1) - mii_model_fn = np.poly1d(mii_fit_model) - plt.plot(fit_x_list, mii_model_fn(fit_x_list), color="blue", linestyle="--") - - plt.title( - f"Model Llama 2 {model_size.upper()}, Prompt: {prompt}, Generation: {gen}, TP: {tp}" - ) - plt.xlabel("Number of 
replicas", fontsize=14) - plt.ylabel("Throughput (queries/s)", fontsize=14) - plt.grid(True) - plt.tight_layout() - # plt.show() - out_file = out_dir / f"repl_scale_llama{model_size}_tp{tp}_p{prompt}g{gen}.png" - plt.savefig(out_file) + if args.clients_per_replica != None and args.clients_per_replica != c: + continue + if len(throughputs[c]) == len(replica_nums): + print(f"Generating figure for {c} clients/replica.") + # Plotting the scatter plot + plt.figure() + + plt.bar(replica_nums, throughputs[c], color="blue", alpha=0.9) + + fit_x_list = np.arange(min(replica_nums), max(replica_nums), 0.1) + mii_fit_model = np.polyfit(replica_nums, throughputs[c], 1) + mii_model_fn = np.poly1d(mii_fit_model) + plt.plot(fit_x_list, mii_model_fn(fit_x_list), color="blue", linestyle="--") + + plt.title( + f"Model: {model}, Prompt: {prompt}, Generation: {gen}\n\ + TP: {tp_size}, Clients/Replica: {c}" + ) + plt.xlabel("Number of replicas", fontsize=14) + plt.ylabel("Throughput (queries/s)", fontsize=14) + plt.grid(True) + plt.tight_layout() + out_file = out_dir / f"repl_scale_{model}_tp{tp_size}_p{prompt}g{gen}_c_per_r{c}.png" + plt.savefig(out_file) if __name__ == "__main__": - raise NotImplementedError("This script is not up to date") args = get_args() - for model_size, tps in tp_sizes.items(): - for tp in tps: - for prompt, gen in prompt_gen_pairs: - output_charts( - model_size, tp, bs, prompt, gen, args.log_dir, args.out_dir - ) + replica_sets = defaultdict(lambda: defaultdict(set)) + result_params = get_result_sets(args) + + # Find all replicas across same sets + for model, tp_size, bs, replicas, prompt, gen in result_params: + key = f'{model}_{tp_size}_{bs}_{prompt}_{gen}' + replica_sets[key]['config'].add((model, tp_size, bs, prompt, gen)) + replica_sets[key]['replicas'].add(int(replicas)) + + for replica_set in replica_sets.values(): + for model, tp_size, bs, prompt, gen in replica_set['config']: + replica_nums = sorted(replica_set['replicas']) + output_charts( + args=args, + model=model, + tp_size=tp_size, + bs=bs, + replica_nums=replica_nums, + prompt=prompt, + gen=gen, + log_dir=args.log_dir, + out_dir=args.out_dir, + ) diff --git a/benchmarks/inference/mii/src/plot_th_lat.py b/benchmarks/inference/mii/src/plot_th_lat.py index 9aa292ca6..1191abd8a 100644 --- a/benchmarks/inference/mii/src/plot_th_lat.py +++ b/benchmarks/inference/mii/src/plot_th_lat.py @@ -12,11 +12,13 @@ import matplotlib.pyplot as plt import numpy as np -from postprocess_results import read_json, get_summary +from postprocess_results import read_json, get_summary, get_result_sets def get_args(): parser = argparse.ArgumentParser() + parser.add_argument("--backend", type=str, choices=["aml", "fastgen", "vllm"], default=["aml", "fastgen", "vllm"], \ + nargs="+", help="Specify the backends to generate plots for") parser.add_argument("--log_dir", type=Path, default="./results") parser.add_argument("--out_dir", type=Path, default="./plots/throughput_latency") args = parser.parse_args() @@ -32,6 +34,7 @@ def extract_values(file_pattern): clients = [] throughputs = [] latencies = [] + extra_args = {} for f in files: prof_args, response_details = read_json(f) summary = get_summary(prof_args, response_details) @@ -39,58 +42,90 @@ def extract_values(file_pattern): throughputs.append(summary.throughput) latencies.append(summary.latency) - return clients, throughputs, latencies + if "aml" in args.backend: + extra_args["aml_api_url"] = prof_args["aml_api_url"] + extra_args["deployment_name"] = prof_args["deployment_name"] + + return 
clients, throughputs, latencies, extra_args def output_charts(model, tp_size, bs, replicas, prompt, gen, log_dir, out_dir): out_dir.mkdir(parents=True, exist_ok=True) result_file_pattern = f"{model}-tp{tp_size}-bs{bs}-replicas{replicas}-prompt{prompt}-gen{gen}-clients*.json" - mii_file_pattern = f"{log_dir}/fastgen/{result_file_pattern}" - vllm_file_pattern = f"{log_dir}/vllm/{result_file_pattern}" - _, mii_throughputs, mii_latencies = extract_values(mii_file_pattern) - _, vllm_throughputs, vllm_latencies = extract_values(vllm_file_pattern) + plt.figure() # Plotting the scatter plot - plt.figure(figsize=(6, 4)) - - if len(vllm_throughputs) > 0: + # vLLM plot formatting + if "vllm" in args.backend: + vllm_file_pattern = f"{log_dir}/vllm/{result_file_pattern}" + _, vllm_throughputs, vllm_latencies, _ = extract_values(vllm_file_pattern) + if len(vllm_throughputs) > 0: + plt.scatter( + vllm_throughputs, vllm_latencies, label=f"vLLM", marker="x", color="orange" + ) + fit_vllm_x_list = np.arange(min(vllm_throughputs), max(vllm_throughputs), 0.01) + vllm_vllm_model = np.polyfit(vllm_throughputs, vllm_latencies, 3) + vllm_model_fn = np.poly1d(vllm_vllm_model) + plt.plot( + fit_vllm_x_list, + vllm_model_fn(fit_vllm_x_list), + color="orange", + alpha=0.5, + linestyle="--", + ) + + # FastGen plot formatting + if "fastgen" in args.backend: + mii_file_pattern = f"{log_dir}/fastgen/{result_file_pattern}" + _, mii_throughputs, mii_latencies, _ = extract_values(mii_file_pattern) plt.scatter( - vllm_throughputs, vllm_latencies, label=f"vLLM", marker="x", color="orange" + mii_throughputs, + mii_latencies, + label=f"DeepSpeed FastGen", + marker="o", + color="blue", ) - fit_vllm_x_list = np.arange(min(vllm_throughputs), max(vllm_throughputs), 0.01) - vllm_vllm_model = np.polyfit(vllm_throughputs, vllm_latencies, 3) - vllm_model_fn = np.poly1d(vllm_vllm_model) + fit_mii_x_list = np.arange(min(mii_throughputs), max(mii_throughputs), 0.01) + mii_fit_model = np.polyfit(mii_throughputs, mii_latencies, 3) + mii_model_fn = np.poly1d(mii_fit_model) plt.plot( - fit_vllm_x_list, - vllm_model_fn(fit_vllm_x_list), - color="orange", + fit_mii_x_list, + mii_model_fn(fit_mii_x_list), + color="blue", alpha=0.5, linestyle="--", ) - plt.scatter( - mii_throughputs, - mii_latencies, - label=f"DeepSpeed FastGen", - marker="o", - color="blue", - ) - fit_mii_x_list = np.arange(min(mii_throughputs), max(mii_throughputs), 0.01) - mii_fit_model = np.polyfit(mii_throughputs, mii_latencies, 3) - mii_model_fn = np.poly1d(mii_fit_model) - plt.plot( - fit_mii_x_list, - mii_model_fn(fit_mii_x_list), - color="blue", - alpha=0.5, - linestyle="--", - ) + # AML plot formatting + if "aml" in args.backend: + aml_file_pattern = f"{log_dir}/aml/{result_file_pattern}" + _, aml_throughputs, aml_latencies, aml_args = extract_values(aml_file_pattern) + aml_endpoint_name = re.match('^https://(.+?)\.', aml_args["aml_api_url"]).groups()[0] + aml_deployment_name = aml_args["deployment_name"] + plt.scatter( + aml_throughputs, + aml_latencies, + label=f"AML {aml_endpoint_name.capitalize()}", + marker="o", + color="purple", + ) + fit_aml_x_list = np.arange(min(aml_throughputs), max(aml_throughputs), 0.01) + aml_fit_model = np.polyfit(aml_throughputs, aml_latencies, 3) + aml_model_fn = np.poly1d(aml_fit_model) + plt.plot( + fit_aml_x_list, + aml_model_fn(fit_aml_x_list), + color="purple", + alpha=0.5, + linestyle="--", + ) - plt.title(f"Model {model}, Prompt: {prompt}, Generation: {gen}, TP: {tp_size}") + # Generic plot formatting + plt.title(f"Model: {model}, 
Prompt: {prompt}, Generation: {gen}, TP: {tp_size}") plt.xlabel("Throughput (queries/s)", fontsize=14) - plt.ylabel("Latency", fontsize=14) + plt.ylabel("Latency (s)", fontsize=14) plt.legend() plt.grid(True) plt.tight_layout() @@ -108,14 +143,7 @@ def output_charts(model, tp_size, bs, replicas, prompt, gen, log_dir, out_dir): if not args.log_dir.exists(): raise ValueError(f"Log dir {args.log_dir} does not exist") - result_params = set() - result_re = re.compile( - r"(.+)-tp(\d+)-bs(\d+)-replicas(\d+)-prompt(\d+)-gen(\d+)-clients.*.json" - ) - for f in os.listdir(os.path.join(args.log_dir, "fastgen")): - match = result_re.match(f) - if match: - result_params.add(match.groups()) + result_params = get_result_sets(args) for model, tp_size, bs, replicas, prompt, gen in result_params: output_charts( diff --git a/benchmarks/inference/mii/src/plot_tp_sizes.py b/benchmarks/inference/mii/src/plot_tp_sizes.py index f02b643f2..596a40de2 100644 --- a/benchmarks/inference/mii/src/plot_tp_sizes.py +++ b/benchmarks/inference/mii/src/plot_tp_sizes.py @@ -8,30 +8,17 @@ import argparse from pathlib import Path import numpy as np +import re +from collections import defaultdict -from .postprocess_results import read_json, get_summary - -bs = 768 - -tp_sizes = { - # "7b": [1], - "13b": [1, 2, 4], - # "70b": [4, 8], -} - -prompt_gen_pairs = [ - (1200, 60), - (1200, 128), - (2600, 60), - (2600, 128), - (2600, 256), -] - +from postprocess_results import read_json, get_summary, get_result_sets def get_args(): parser = argparse.ArgumentParser() + parser.add_argument("--backend", type=str, choices=["aml", "fastgen", "vllm"], default=["aml", "fastgen", "vllm"], \ + nargs=1, help="Specify the single backend to generate plots for") parser.add_argument("--log_dir", type=Path, default="logs.release") - parser.add_argument("--out_dir", type=Path, default="charts/tp_sizes") + parser.add_argument("--out_dir", type=Path, default="./plots/tp_sizes") args = parser.parse_args() return args @@ -48,14 +35,14 @@ def extract_values(file_pattern): for f in files: prof_args, response_details = read_json(f) summary = get_summary(prof_args, response_details) - clients.append(prof_args["client_num"]) + clients.append(prof_args["num_clients"]) throughputs.append(summary.throughput) latencies.append(summary.latency) return clients, throughputs, latencies -def output_charts(model_size, tps, bs, prompt, gen, log_dir, out_dir): +def output_charts(args, model, tp_list, bs, replicas, prompt, gen, log_dir, out_dir): if not log_dir.exists(): print(f"Log directory {log_dir} does not exist") return @@ -64,54 +51,75 @@ def output_charts(model_size, tps, bs, prompt, gen, log_dir, out_dir): out_dir.mkdir(parents=True, exist_ok=True) # Plotting the scatter plot - plt.figure(figsize=(6, 4)) - - colors = ["orange", "green", "brown"] + plt.figure() - for tp, color in zip(tps, colors): - mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}/llama2-{model_size}-tp{tp}-b{bs}_c*_p{prompt}_g{gen}.json" - _, mii_throughputs, mii_latencies = extract_values(mii_file_pattern) + for tp in tp_list: + result_file_pattern = f"{model}-tp{tp}-bs{bs}-replicas{replicas}-prompt{prompt}-gen{gen}-clients*.json" + file_pattern = f"{log_dir}/{args.backend[0]}/{result_file_pattern}" + _, throughputs, latencies = extract_values(file_pattern) - if len(mii_throughputs) == 0: + if len(throughputs) == 0: continue + model_size = re.match('.*?(\d+[b|B|m|M])', model).groups()[0] n_params = int(model_size[:-1]) - tflops_per_query = n_params * (prompt + gen) * 2 * 1e-3 - 
mii_tflops = [th * tflops_per_query / tp for th in mii_throughputs] + if model_size[-1].lower() == 'm': + # Scale n_params approriately for millions + n_params = n_params / 1000 + tflops_per_query = n_params * (int(prompt) + int(gen)) * 2 * 1e-3 + tflops = [th * tflops_per_query / tp for th in throughputs] plt.scatter( - mii_tflops, mii_latencies, label=f"TP={tp}", marker="o", color=color + tflops, latencies, label=f"TP={tp}", marker="o" ) - fit_mii_x_list = np.arange(min(mii_tflops), max(mii_tflops), 0.01) - mii_fit_model = np.polyfit(mii_tflops, mii_latencies, 3) - mii_model_fn = np.poly1d(mii_fit_model) + fit_x_list = np.arange(min(tflops), max(tflops), 0.01) + fit_model = np.polyfit(tflops, latencies, 3) + model_fn = np.poly1d(fit_model) plt.plot( - fit_mii_x_list, - mii_model_fn(fit_mii_x_list), - color=color, + fit_x_list, + model_fn(fit_x_list), alpha=0.5, linestyle="--", ) plt.title( - f"Model Llama 2 {model_size.upper()}, Prompt: {prompt}, Generation: {gen}, TP: {tps}" + f"Model: {model}, Prompt: {prompt}, Generation: {gen}, TP: {tp_list}\n\ + Replicas: {replicas}, Backend: {args.backend[0]}" ) plt.xlabel("TFLOPs (per GPU)", fontsize=14) - plt.ylabel("Latency", fontsize=14) + plt.ylabel("Latency (s)", fontsize=14) plt.legend() plt.grid(True) - # plt.show() out_file = ( out_dir - / f"tp_sizes_llama{model_size}_tp{'_'.join([str(tp) for tp in tps])}_p{prompt}g{gen}.png" + / f"tp_sizes_{model}_tp{'_'.join([str(tp) for tp in tp_list])}_p{prompt}g{gen}r{replicas}.png" ) plt.savefig(out_file) if __name__ == "__main__": - raise NotImplementedError("This script is not up to date") args = get_args() - for model_size, tps in tp_sizes.items(): - for prompt, gen in prompt_gen_pairs: - output_charts(model_size, tps, bs, prompt, gen, args.log_dir, args.out_dir) + tp_sets = defaultdict(lambda: defaultdict(set)) + result_params = get_result_sets(args) + + # Find all tp_sizes across same sets + for model, tp_size, bs, replicas, prompt, gen in result_params: + key = f'{model}_{bs}_{replicas}_{prompt}_{gen}' + tp_sets[key]['config'].add((model, bs, replicas, prompt, gen)) + tp_sets[key]['tp_list'].add(int(tp_size)) + + for tp_set in tp_sets.values(): + for model, bs, replicas, prompt, gen in tp_set['config']: + tp_list = sorted(tp_set['tp_list']) + output_charts( + args=args, + model=model, + tp_list=tp_list, + bs=bs, + replicas=replicas, + prompt=prompt, + gen=gen, + log_dir=args.log_dir, + out_dir=args.out_dir, + ) diff --git a/benchmarks/inference/mii/src/postprocess_results.py b/benchmarks/inference/mii/src/postprocess_results.py index 4260f1341..5941242d9 100644 --- a/benchmarks/inference/mii/src/postprocess_results.py +++ b/benchmarks/inference/mii/src/postprocess_results.py @@ -5,11 +5,15 @@ import argparse import json +import re +import os +from tabulate import tabulate from dataclasses import dataclass from functools import reduce from pathlib import Path from statistics import mean from typing import List +from collections import defaultdict import numpy as np from transformers import AutoTokenizer @@ -147,3 +151,35 @@ def get_token_acc_latency(response_details, percentile=99): + f"Token generation latency: {ps.token_gen_latency:.3f} s/token, " + f"First token received: {ps.first_token_latency:.3f} s" ) + +def get_result_sets(args: argparse.Namespace) -> set(): + result_params = None + result_re = re.compile( + r"(.+)-tp(\d+)-bs(\d+)-replicas(\d+)-prompt(\d+)-gen(\d+)-clients.*.json" + ) + + backend_sets = defaultdict(set) + + # Generate backend sets + for backend in args.backend: + for f 
in os.listdir(os.path.join(args.log_dir, backend)): + match = result_re.match(f) + if match: + backend_sets[backend].add(match.groups()) + + # Intersection between all sets + for backend_set in backend_sets.values(): + if result_params == None: + result_params = backend_set + else: + result_params = result_params.intersection(backend_set) + + # Warning messages about skipped sets + for key, backend_set in backend_sets.items(): + difference = backend_set.difference(result_params) + if difference: + print(f"WARNING: backend {key} has result combinations that are not present in all backends:") + print(tabulate(difference, headers=["model", "tp_size", "bs", "replicas", "prompt", "gen"])) + print("") + + return result_params diff --git a/benchmarks/inference/mii/src/server.py b/benchmarks/inference/mii/src/server.py index ec04338b5..1fee4311f 100644 --- a/benchmarks/inference/mii/src/server.py +++ b/benchmarks/inference/mii/src/server.py @@ -49,11 +49,11 @@ def start_vllm_server(args: argparse.Namespace) -> None: break if "error" in line.lower(): p.terminate() - stop_vllm_server() + stop_vllm_server(args) raise RuntimeError(f"Error starting VLLM server: {line}") if time.time() - start_time > timeout_after: p.terminate() - stop_vllm_server() + stop_vllm_server(args) raise TimeoutError("Timed out waiting for VLLM server to start") time.sleep(0.01) From f415ec82cb2256e061be7676f139fc787aa94569 Mon Sep 17 00:00:00 2001 From: "Xiaoxia (Shirley) Wu" <94406484+xiaoxiawu-microsoft@users.noreply.github.com> Date: Thu, 7 Mar 2024 20:00:04 -0800 Subject: [PATCH 31/58] Xiaoxia/fp v1 (#871) * add FP6-benchmark * update * updatefile * update tpsize --------- Co-authored-by: Ubuntu --- benchmarks/inference/mii/run_fp6.sh | 10 ++++++++++ benchmarks/inference/mii/src/server.py | 6 +++++- benchmarks/inference/mii/src/utils.py | 3 +++ 3 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 benchmarks/inference/mii/run_fp6.sh diff --git a/benchmarks/inference/mii/run_fp6.sh b/benchmarks/inference/mii/run_fp6.sh new file mode 100644 index 000000000..42c4fdbf8 --- /dev/null +++ b/benchmarks/inference/mii/run_fp6.sh @@ -0,0 +1,10 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +MODELS=(NousResearch/Llama-2-70b-hf) + +for MODEL in ${MODELS[@]}; do + python ./run_benchmark.py --model ${MODEL} --num_requests 128 --stream --backend fastgen --fp6 --tp_size 1 +done \ No newline at end of file diff --git a/benchmarks/inference/mii/src/server.py b/benchmarks/inference/mii/src/server.py index 1fee4311f..56fd7930e 100644 --- a/benchmarks/inference/mii/src/server.py +++ b/benchmarks/inference/mii/src/server.py @@ -71,13 +71,17 @@ def start_fastgen_server(args: argparse.Namespace) -> None: inference_config = RaggedInferenceEngineConfig( tensor_parallel=tp_config, state_manager=mgr_config ) - + if args.fp6: + quantization_mode = 'wf6af16' + else: + quantization_mode = None mii.serve( args.model, deployment_name=args.deployment_name, tensor_parallel=args.tp_size, inference_engine_config=inference_config, replica_num=args.num_replicas, + quantization_mode=quantization_mode ) diff --git a/benchmarks/inference/mii/src/utils.py b/benchmarks/inference/mii/src/utils.py index 21d152fae..a3b156fd2 100644 --- a/benchmarks/inference/mii/src/utils.py +++ b/benchmarks/inference/mii/src/utils.py @@ -159,6 +159,9 @@ def parse_args( parser.add_argument( "--overwrite_results", action="store_true", help="Overwrite existing results" ) + parser.add_argument( + "--fp6", action="store_true", help="Enable FP6" + ) # Parse arguments args = parser.parse_args() From b0a5533329acf5bfbc13c6d8a865d2e4314672c0 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Thu, 14 Mar 2024 14:45:08 -0700 Subject: [PATCH 32/58] Fix AML benchmark E2E measurment (#874) In the case where a request sent to an AML endpoint fails, we were incorrectly resetting the start_time, causing bad measurements. Fixed in this PR. @lekurile --- benchmarks/inference/mii/src/client.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/benchmarks/inference/mii/src/client.py b/benchmarks/inference/mii/src/client.py index c0fd6a767..0779d51eb 100644 --- a/benchmarks/inference/mii/src/client.py +++ b/benchmarks/inference/mii/src/client.py @@ -132,7 +132,10 @@ def get_response(response: requests.Response) -> List[str]: def call_aml( - input_tokens: str, max_new_tokens: int, args: argparse.Namespace + input_tokens: str, + max_new_tokens: int, + args: argparse.Namespace, + start_time: Union[None, float] = None, ) -> ResponseDetails: if args.stream: raise NotImplementedError("Not implemented for streaming") @@ -161,13 +164,15 @@ def get_response(response: requests.Response) -> List[str]: return output token_gen_time = [] - start_time = time.time() - response = requests.post(args.aml_api_url, headers=headers, json=pload) + if start_time is None: + start_time = time.time() + response = requests.post(args.aml_api_url, headers=headers, json=pload, timeout=180) # Sometimes the AML endpoint will return an error, so we send the request again try: output = get_response(response) - except Exception as e: - return call_aml(input_tokens, max_new_tokens, args) + except (Exception, requests.exceptions.SSLError) as e: + print("Request failed... 
re-submitting") + return call_aml(input_tokens, max_new_tokens, args, start_time) return ResponseDetails( generated_tokens=output, From c3ffec25bbf5cdcfb2a5521f1f0f191adc4e1c9a Mon Sep 17 00:00:00 2001 From: Ammar Ahmad Awan Date: Mon, 18 Mar 2024 16:12:35 -0500 Subject: [PATCH 33/58] Update README.md --- benchmarks/inference/mii/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/inference/mii/README.md b/benchmarks/inference/mii/README.md index e5b43f1c3..7196b0445 100644 --- a/benchmarks/inference/mii/README.md +++ b/benchmarks/inference/mii/README.md @@ -1,4 +1,4 @@ -# Benchmarking Scripts for DeepSpeed-FastGen +# Inference Benchmarking Scripts for vLLM, DeepSpeed-FastGen, and Azure ML endpoints ## Run the Benchmark From 18200d54107cbe944ce79adf0e4850278894859d Mon Sep 17 00:00:00 2001 From: Heyang Qin Date: Tue, 19 Mar 2024 14:10:57 -0700 Subject: [PATCH 34/58] Improve robustness of infernece AML benchmark (#875) --- benchmarks/inference/mii/src/client.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/benchmarks/inference/mii/src/client.py b/benchmarks/inference/mii/src/client.py index 0779d51eb..b06e617ef 100644 --- a/benchmarks/inference/mii/src/client.py +++ b/benchmarks/inference/mii/src/client.py @@ -160,19 +160,26 @@ def call_aml( def get_response(response: requests.Response) -> List[str]: data = json.loads(response.content) - output = data[0]["0"] + try: + output = data[0]["0"] + except (KeyError, TypeError): + try: + output = data[0] + except (KeyError, TypeError): + output = data return output token_gen_time = [] if start_time is None: start_time = time.time() - response = requests.post(args.aml_api_url, headers=headers, json=pload, timeout=180) - # Sometimes the AML endpoint will return an error, so we send the request again - try: - output = get_response(response) - except (Exception, requests.exceptions.SSLError) as e: - print("Request failed... re-submitting") - return call_aml(input_tokens, max_new_tokens, args, start_time) + while True: + try: # Sometimes the AML endpoint will return an error, so we send the request again + response = requests.post(args.aml_api_url, headers=headers, json=pload, timeout=180) + output = get_response(response) + break + except Exception as e: + print(f"Connection failed with {e}. 
Retrying AML request") + print(f"{response.status_code}:{response.content}") return ResponseDetails( generated_tokens=output, From 279a8fe45aa0f16ad30dabef455da9a9ca118637 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Tue, 19 Mar 2024 14:28:21 -0700 Subject: [PATCH 35/58] change kwargs for AML call to match vllm kwargs (#876) --- benchmarks/inference/mii/src/client.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/inference/mii/src/client.py b/benchmarks/inference/mii/src/client.py index b06e617ef..7a7333294 100644 --- a/benchmarks/inference/mii/src/client.py +++ b/benchmarks/inference/mii/src/client.py @@ -151,8 +151,7 @@ def call_aml( input_tokens, ], "parameters": { - "max_new_tokens": max_new_tokens, - "do_sample": True, + "max_tokens": max_new_tokens, "return_full_text": False, }, } From 02fc5781c817a20ea61ef1ba5090bc0d3c11eabb Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Wed, 27 Mar 2024 15:44:37 -0700 Subject: [PATCH 36/58] dynamic setting of requst num and formatting (#880) --- benchmarks/inference/mii/run_benchmark.py | 2 ++ benchmarks/inference/mii/src/utils.py | 6 ++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/inference/mii/run_benchmark.py b/benchmarks/inference/mii/run_benchmark.py index 801d45b85..858498884 100644 --- a/benchmarks/inference/mii/run_benchmark.py +++ b/benchmarks/inference/mii/run_benchmark.py @@ -30,6 +30,8 @@ def run_benchmark() -> None: ) continue + if client_args.num_requests is None: + client_args.num_requests = client_args.num_clients * 4 + 32 response_details = run_client(client_args) print_summary(client_args, response_details) save_json_results(client_args, response_details) diff --git a/benchmarks/inference/mii/src/utils.py b/benchmarks/inference/mii/src/utils.py index a3b156fd2..23e108f8d 100644 --- a/benchmarks/inference/mii/src/utils.py +++ b/benchmarks/inference/mii/src/utils.py @@ -91,7 +91,7 @@ def parse_args( client_parser.add_argument( "--num_requests", type=int, - default=512, + default=None, help="Number of requests to process by clients", ) client_parser.add_argument( @@ -159,9 +159,7 @@ def parse_args( parser.add_argument( "--overwrite_results", action="store_true", help="Overwrite existing results" ) - parser.add_argument( - "--fp6", action="store_true", help="Enable FP6" - ) + parser.add_argument("--fp6", action="store_true", help="Enable FP6") # Parse arguments args = parser.parse_args() From df7119ed264bfac747969f9bb5bed8a61aed5e5d Mon Sep 17 00:00:00 2001 From: Heyang Qin Date: Thu, 28 Mar 2024 19:40:44 -0700 Subject: [PATCH 37/58] Fix response check in call_aml function (#882) --- benchmarks/inference/mii/src/client.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/benchmarks/inference/mii/src/client.py b/benchmarks/inference/mii/src/client.py index 7a7333294..47a11ee5a 100644 --- a/benchmarks/inference/mii/src/client.py +++ b/benchmarks/inference/mii/src/client.py @@ -169,6 +169,7 @@ def get_response(response: requests.Response) -> List[str]: return output token_gen_time = [] + response = None if start_time is None: start_time = time.time() while True: @@ -178,7 +179,9 @@ def get_response(response: requests.Response) -> List[str]: break except Exception as e: print(f"Connection failed with {e}. 
Retrying AML request") - print(f"{response.status_code}:{response.content}") + # make sure response exist before we call it + if response: + print(f"{response.status_code}:{response.content}") return ResponseDetails( generated_tokens=output, From fab5d06606a7e808dd8bd1961a1a9a56adc744c0 Mon Sep 17 00:00:00 2001 From: Lev Kurilenko <113481193+lekurile@users.noreply.github.com> Date: Tue, 9 Apr 2024 10:52:25 -0700 Subject: [PATCH 38/58] Update throughput-latency plot script (#881) This PR updates the plot_th_lat.py throughput-latency plot generation script to remove the concept of a backend (aml, fastgen, vllm) and generalize for any result output directory, irrespective of where it was run. The PR also introduces the concept of an optional plot_config.yaml that resides within each result directory and allows for overrides in the plot formatting. --- benchmarks/inference/mii/README.md | 29 ++- benchmarks/inference/mii/plot_config.yaml | 7 + benchmarks/inference/mii/src/plot_th_lat.py | 174 ++++++++++-------- .../inference/mii/src/postprocess_results.py | 32 ++-- benchmarks/inference/mii/src/utils.py | 3 +- 5 files changed, 155 insertions(+), 90 deletions(-) create mode 100644 benchmarks/inference/mii/plot_config.yaml diff --git a/benchmarks/inference/mii/README.md b/benchmarks/inference/mii/README.md index 7196b0445..726cad462 100644 --- a/benchmarks/inference/mii/README.md +++ b/benchmarks/inference/mii/README.md @@ -61,9 +61,34 @@ figures will be saved to `./plots/` - `src/plot_repl_scale.py`: This script will plot the throughput and number of replicas for a fixed clients/replica per plot. - `src/plot_tp_sizes.py`: This script will plot latency and TFLOPs per GPU across different tensor parallelism sizes. -The following command shows an example of `plot_th_lat.py` execution using the `vllm`, `fastgen`, and `aml` backends. +## Throughput Latency Plot Generation Script +The `plot_th_lat.py` throughput-latency plot generation script is generalized for any result output directory, irrespective of where it was run. + +The script uses an **_optional_** `plot_config.yaml` that resides within each result directory and allows for overrides in the plot formatting. An example config file may look like this: +```yaml +label: "vLLM" +color: "purple" +marker: "o" +linestyle: "--" +polyfit_degree: 0 +x_max : 30 +y_max : 10 +``` + +Each of the config parameters is optional, allowing for overriding of only the specific plot aspects required, however, all parameters may also be provided. + +A few nuances for the `polyfit_degree` and `x/y_max` parameters: +- `polyfit_degree`: Specifies the polynomial degree for the 'best fit line'. Specifying `0` removes the best fit line and simply connects the scatter plot points. +- `x/y_max`: Clips the x or y axis data using the specified value as the upper bound. 
+ +An example command executing the script may look something like this: +```bash +DeepSpeedExamples/benchmarks/inference/mii$ python3 src/plot_th_lat.py --data_dirs ./results/results-* --model_name +``` + +Or each result directory can be enumerated explicitly: ```bash -DeepSpeedExamples/benchmarks/inference/mii$ python3 src/plot_th_lat.py --backend vllm fastgen aml --log_dir results/ +DeepSpeedExamples/benchmarks/inference/mii$ python3 src/plot_th_lat.py --data_dirs ./results/results-1 ./results/results-2 ./results/results-3 --model_name ``` ## Running an End-to-End Example diff --git a/benchmarks/inference/mii/plot_config.yaml b/benchmarks/inference/mii/plot_config.yaml new file mode 100644 index 000000000..48a5a3171 --- /dev/null +++ b/benchmarks/inference/mii/plot_config.yaml @@ -0,0 +1,7 @@ +label: "vLLM" +color: "purple" +marker: "o" +linestyle: "--" +polyfit_degree: 0 +x_max : 30 +y_max : 10 diff --git a/benchmarks/inference/mii/src/plot_th_lat.py b/benchmarks/inference/mii/src/plot_th_lat.py index 1191abd8a..18f115206 100644 --- a/benchmarks/inference/mii/src/plot_th_lat.py +++ b/benchmarks/inference/mii/src/plot_th_lat.py @@ -7,6 +7,7 @@ import glob import os import re +import yaml from pathlib import Path import matplotlib.pyplot as plt @@ -17,10 +18,10 @@ def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--backend", type=str, choices=["aml", "fastgen", "vllm"], default=["aml", "fastgen", "vllm"], \ - nargs="+", help="Specify the backends to generate plots for") - parser.add_argument("--log_dir", type=Path, default="./results") + parser.add_argument("--data_dirs", type=str, nargs="+", \ + help="Specify the data directories to generate plots for") parser.add_argument("--out_dir", type=Path, default="./plots/throughput_latency") + parser.add_argument("--model_name", type=str, default="", help="Optional model name override") args = parser.parse_args() return args @@ -42,88 +43,115 @@ def extract_values(file_pattern): throughputs.append(summary.throughput) latencies.append(summary.latency) - if "aml" in args.backend: - extra_args["aml_api_url"] = prof_args["aml_api_url"] - extra_args["deployment_name"] = prof_args["deployment_name"] + return clients, throughputs, latencies, prof_args - return clients, throughputs, latencies, extra_args - -def output_charts(model, tp_size, bs, replicas, prompt, gen, log_dir, out_dir): +def output_charts(model, tp_size, bs, replicas, prompt, gen, out_dir): out_dir.mkdir(parents=True, exist_ok=True) result_file_pattern = f"{model}-tp{tp_size}-bs{bs}-replicas{replicas}-prompt{prompt}-gen{gen}-clients*.json" plt.figure() - # Plotting the scatter plot - # vLLM plot formatting - if "vllm" in args.backend: - vllm_file_pattern = f"{log_dir}/vllm/{result_file_pattern}" - _, vllm_throughputs, vllm_latencies, _ = extract_values(vllm_file_pattern) - if len(vllm_throughputs) > 0: - plt.scatter( - vllm_throughputs, vllm_latencies, label=f"vLLM", marker="x", color="orange" + for data_dir in args.data_dirs: + file_pattern = f"{data_dir}/{result_file_pattern}" + _, throughputs, latencies, prof_args = extract_values(file_pattern) + + kwargs = {} + kwargs["label"] = str(data_dir) + kwargs["marker"] = "o" + kwargs["linestyle"] = "--" + + fit_kwargs = {} + fit_kwargs["linestyle"] = "--" + plot_fit_line = True + + polyfit_degree = 3 + plot_fn = plt.scatter + + plot_config = glob.glob(f"{data_dir}/plot_config.yaml") + + latencies = sorted(latencies) + throughputs = sorted(throughputs) + + if plot_config: + plot_config = plot_config[0] + plot_config = 
yaml.safe_load(Path(plot_config).read_text()) + plot_keys = plot_config.keys() + + # If x_max specified, clip data + if "x_max" in plot_keys: + for i, throughput in enumerate(throughputs): + if throughput > plot_config["x_max"]: + latencies = latencies[:i] + throughputs = throughputs[:i] + break + + # If y_max specified, clip data + if "y_max" in plot_keys: + for i, latency in enumerate(latencies): + if latency > plot_config["y_max"]: + latencies = latencies[:i] + throughputs = throughputs[:i] + break + + # Set polyfit degree + polyfit_degree = plot_config.get("polyfit_degree", polyfit_degree) + + # Select plot type + if polyfit_degree == 0: + plot_fit_line = False + + # Main plot kwargs + if "label" in plot_keys: + kwargs["label"] = plot_config["label"] + if "marker" in plot_keys: + kwargs["marker"] = plot_config["marker"] + if "color" in plot_keys: + kwargs["color"] = plot_config["color"] + if "linestyle" in plot_keys: + kwargs["linestyle"] = plot_config["linestyle"] + + # Fit line kwargs + if "color" in plot_keys: + fit_kwargs["color"] = plot_config["color"] + if "linestyle" in plot_keys: + fit_kwargs["linestyle"] = plot_config["linestyle"] + + if len(throughputs) > 0: + plot = plot_fn( + throughputs, + latencies, + **kwargs, ) - fit_vllm_x_list = np.arange(min(vllm_throughputs), max(vllm_throughputs), 0.01) - vllm_vllm_model = np.polyfit(vllm_throughputs, vllm_latencies, 3) - vllm_model_fn = np.poly1d(vllm_vllm_model) + + if plot_fn == plt.plot: + plot_color = plot[0].get_color() + else: + plot_color = plot.get_facecolor()[0] + + if not "color" in fit_kwargs.keys(): + fit_kwargs["color"] = plot_color + + fit_x_list = np.arange(min(throughputs), max(throughputs), 0.01) + data_model = np.polyfit(throughputs, latencies, polyfit_degree) + model_fn = np.poly1d(data_model) + x = fit_x_list if plot_fit_line else throughputs + y = model_fn(fit_x_list) if plot_fit_line else latencies plt.plot( - fit_vllm_x_list, - vllm_model_fn(fit_vllm_x_list), - color="orange", + x, + y, alpha=0.5, - linestyle="--", + **fit_kwargs, ) - # FastGen plot formatting - if "fastgen" in args.backend: - mii_file_pattern = f"{log_dir}/fastgen/{result_file_pattern}" - _, mii_throughputs, mii_latencies, _ = extract_values(mii_file_pattern) - plt.scatter( - mii_throughputs, - mii_latencies, - label=f"DeepSpeed FastGen", - marker="o", - color="blue", - ) - fit_mii_x_list = np.arange(min(mii_throughputs), max(mii_throughputs), 0.01) - mii_fit_model = np.polyfit(mii_throughputs, mii_latencies, 3) - mii_model_fn = np.poly1d(mii_fit_model) - plt.plot( - fit_mii_x_list, - mii_model_fn(fit_mii_x_list), - color="blue", - alpha=0.5, - linestyle="--", - ) - - # AML plot formatting - if "aml" in args.backend: - aml_file_pattern = f"{log_dir}/aml/{result_file_pattern}" - _, aml_throughputs, aml_latencies, aml_args = extract_values(aml_file_pattern) - aml_endpoint_name = re.match('^https://(.+?)\.', aml_args["aml_api_url"]).groups()[0] - aml_deployment_name = aml_args["deployment_name"] - plt.scatter( - aml_throughputs, - aml_latencies, - label=f"AML {aml_endpoint_name.capitalize()}", - marker="o", - color="purple", - ) - fit_aml_x_list = np.arange(min(aml_throughputs), max(aml_throughputs), 0.01) - aml_fit_model = np.polyfit(aml_throughputs, aml_latencies, 3) - aml_model_fn = np.poly1d(aml_fit_model) - plt.plot( - fit_aml_x_list, - aml_model_fn(fit_aml_x_list), - color="purple", - alpha=0.5, - linestyle="--", - ) - # Generic plot formatting - plt.title(f"Model: {model}, Prompt: {prompt}, Generation: {gen}, TP: {tp_size}") + if 
args.model_name: + model_label = args.model_name + else: + model_label = model + + plt.title(f"Model: {model_label}, Prompt: {prompt}, Generation: {gen}, TP: {tp_size}") plt.xlabel("Throughput (queries/s)", fontsize=14) plt.ylabel("Latency (s)", fontsize=14) plt.legend() @@ -140,9 +168,6 @@ def output_charts(model, tp_size, bs, replicas, prompt, gen, log_dir, out_dir): if __name__ == "__main__": args = get_args() - if not args.log_dir.exists(): - raise ValueError(f"Log dir {args.log_dir} does not exist") - result_params = get_result_sets(args) for model, tp_size, bs, replicas, prompt, gen in result_params: @@ -153,6 +178,5 @@ def output_charts(model, tp_size, bs, replicas, prompt, gen, log_dir, out_dir): replicas=replicas, prompt=prompt, gen=gen, - log_dir=args.log_dir, out_dir=args.out_dir, ) diff --git a/benchmarks/inference/mii/src/postprocess_results.py b/benchmarks/inference/mii/src/postprocess_results.py index 5941242d9..593c62984 100644 --- a/benchmarks/inference/mii/src/postprocess_results.py +++ b/benchmarks/inference/mii/src/postprocess_results.py @@ -158,27 +158,37 @@ def get_result_sets(args: argparse.Namespace) -> set(): r"(.+)-tp(\d+)-bs(\d+)-replicas(\d+)-prompt(\d+)-gen(\d+)-clients.*.json" ) - backend_sets = defaultdict(set) + data_sets = defaultdict(set) - # Generate backend sets - for backend in args.backend: - for f in os.listdir(os.path.join(args.log_dir, backend)): + if hasattr(args, "data_dirs"): + data_set_dirs = args.data_dirs + elif hasattr(args, "backend"): + data_set_dirs = args.backend + + # Generate data sets + for data in data_set_dirs: + if hasattr(args, "log_dir"): + os_path = os.path.join(args.log_dir, data) + else: + os_path = os.path.join(data) + + for f in os.listdir(os_path): match = result_re.match(f) if match: - backend_sets[backend].add(match.groups()) + data_sets[data].add(match.groups()) # Intersection between all sets - for backend_set in backend_sets.values(): + for data_set in data_sets.values(): if result_params == None: - result_params = backend_set + result_params = data_set else: - result_params = result_params.intersection(backend_set) + result_params = result_params.intersection(data_set) # Warning messages about skipped sets - for key, backend_set in backend_sets.items(): - difference = backend_set.difference(result_params) + for key, data_set in data_sets.items(): + difference = data_set.difference(result_params) if difference: - print(f"WARNING: backend {key} has result combinations that are not present in all backends:") + print(f"WARNING: data {key} has result combinations that are not present in all data sets:") print(tabulate(difference, headers=["model", "tp_size", "bs", "replicas", "prompt", "gen"])) print("") diff --git a/benchmarks/inference/mii/src/utils.py b/benchmarks/inference/mii/src/utils.py index 23e108f8d..cd74c521f 100644 --- a/benchmarks/inference/mii/src/utils.py +++ b/benchmarks/inference/mii/src/utils.py @@ -209,8 +209,7 @@ def get_args_product( def get_results_path(args: argparse.Namespace) -> Path: return Path( - args.out_json_dir, - f"{args.backend}/", + f"{args.out_json_dir}_{args.backend}/", "-".join( ( args.model.replace("/", "_"), From 1be0fc77a62ef965e2dea920789f7df95a843820 Mon Sep 17 00:00:00 2001 From: Chandra Sekhar Gupta <38103118+guptha23@users.noreply.github.com> Date: Mon, 29 Apr 2024 22:42:57 +0530 Subject: [PATCH 39/58] updating tokens per second to include the token count of generated tokens. 
(#895) Co-authored-by: Chandra Sekhar Gupta Aravapalli --- benchmarks/inference/mii/src/postprocess_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/inference/mii/src/postprocess_results.py b/benchmarks/inference/mii/src/postprocess_results.py index 593c62984..fe3a9fb1f 100644 --- a/benchmarks/inference/mii/src/postprocess_results.py +++ b/benchmarks/inference/mii/src/postprocess_results.py @@ -78,7 +78,7 @@ def get_summary(args, response_details): tokens_per_sec = mean( [ - (len(get_tokenizer().tokenize(r.prompt)) + len(r.generated_tokens)) + (len(get_tokenizer().tokenize(r.prompt)) + len(get_tokenizer().tokenize(r.generated_tokens))) / (r.end_time - r.start_time) for r in response_details ] From fdb8ee2d6a0822c132c9a086861ec2f94198fddb Mon Sep 17 00:00:00 2001 From: Lev Kurilenko <113481193+lekurile@users.noreply.github.com> Date: Tue, 30 Apr 2024 15:43:52 -0700 Subject: [PATCH 40/58] Update tokens_per_sec calculation to work w/ stream and non-stream cases (#897) This PR updates the tokens_per_sec calculation in the MII benchmark to account for both the streaming and non-streaming modes. --- benchmarks/inference/mii/src/postprocess_results.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/inference/mii/src/postprocess_results.py b/benchmarks/inference/mii/src/postprocess_results.py index fe3a9fb1f..4179f44b6 100644 --- a/benchmarks/inference/mii/src/postprocess_results.py +++ b/benchmarks/inference/mii/src/postprocess_results.py @@ -78,7 +78,9 @@ def get_summary(args, response_details): tokens_per_sec = mean( [ - (len(get_tokenizer().tokenize(r.prompt)) + len(get_tokenizer().tokenize(r.generated_tokens))) + (len(get_tokenizer().tokenize(r.prompt)) + + len(get_tokenizer().tokenize(r.generated_tokens)) if type(r.generated_tokens) == str + else len(r.generated_tokens)) / (r.end_time - r.start_time) for r in response_details ] From cce62236a2c8f52d5548f310e64ee09ed2785416 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Wed, 1 May 2024 11:11:56 -0700 Subject: [PATCH 41/58] fix bug with queue.empty not being reliable (#898) See docs on `queue` for python multiprocessing. 
Specifically, `Because of multithreading/multiprocessing semantics, this is not reliable.`: https://docs.python.org/3/library/multiprocessing.html#multiprocessing.Queue.empty --- benchmarks/inference/mii/src/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/inference/mii/src/client.py b/benchmarks/inference/mii/src/client.py index 47a11ee5a..e8c656ab0 100644 --- a/benchmarks/inference/mii/src/client.py +++ b/benchmarks/inference/mii/src/client.py @@ -219,7 +219,7 @@ def _run_parallel( time.sleep(random.uniform(0, args.num_clients) * 0.01) try: - while not query_queue.empty(): + while True: print(f"queue size: {query_queue.qsize()} ({pid})", flush=True) input_tokens, req_max_new_tokens = query_queue.get(timeout=1.0) From 75df1d7250452bcc7c065797a95c982bc8caab0b Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Fri, 7 Jun 2024 07:06:16 +0800 Subject: [PATCH 42/58] add client-only mode to mii benchmark (#900) Co-authored-by: Lev Kurilenko <113481193+lekurile@users.noreply.github.com> --- benchmarks/inference/mii/run_benchmark.py | 4 ++-- benchmarks/inference/mii/src/utils.py | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/benchmarks/inference/mii/run_benchmark.py b/benchmarks/inference/mii/run_benchmark.py index 858498884..0a2e0e457 100644 --- a/benchmarks/inference/mii/run_benchmark.py +++ b/benchmarks/inference/mii/run_benchmark.py @@ -20,7 +20,7 @@ def run_benchmark() -> None: args = parse_args(server_args=True, client_args=True) for server_args in get_args_product(args, which=SERVER_PARAMS): - if server_args.backend != "aml": + if server_args.backend != "aml" and not server_args.client_only: start_server(server_args) for client_args in get_args_product(server_args, which=CLIENT_PARAMS): @@ -36,7 +36,7 @@ def run_benchmark() -> None: print_summary(client_args, response_details) save_json_results(client_args, response_details) - if server_args.backend != "aml": + if server_args.backend != "aml" and not server_args.client_only: stop_server(server_args) diff --git a/benchmarks/inference/mii/src/utils.py b/benchmarks/inference/mii/src/utils.py index cd74c521f..d3c1fee02 100644 --- a/benchmarks/inference/mii/src/utils.py +++ b/benchmarks/inference/mii/src/utils.py @@ -61,6 +61,10 @@ def parse_args( choices=["start", "stop", "restart"], help="Command for running server.py to manually start/stop/restart a server", ) + server_parser.add_argument( + "--client_only", action="store_true", help="Run client only with server started" + ) + # Client args client_parser = argparse.ArgumentParser(add_help=False) From bbab278112dc56d68c0d203f501b53d860788375 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Tue, 25 Jun 2024 17:26:02 -0700 Subject: [PATCH 43/58] Refactored LLM benchmark code (#899) * add refactored LLM benchmark code, initial commit * move prompt processing outside benchmark loop * formatting and improvements * slight refactor of benchmark runner, cleaning up, adding type hints * add tests, small refactors to improve code, make installable package * clean up code, add TODO notes for intended changes * update author list * add early stopping of benchmarks * add support for longer prompts and cleanup * address PR feedback, fix bugs, small updates * small change * fix small bugs around prompt length and max token size * remove debug prints * Update 128k-120.yaml * add min_requests override and print out for result summary * add README, rename benchmark * update unit tests --- .../inference/deepspeedometer/README.md | 85 ++++ 
.../deepspeedometer/configs/128k-120.yaml | 5 + .../deepspeedometer/configs/1300-120.yaml | 4 + .../deepspeedometer/configs/2600-60.yaml | 4 + .../deepspeedometer/configs/500-500.yaml | 4 + .../inference/deepspeedometer/pyproject.toml | 32 ++ .../inference/deepspeedometer/run_example.sh | 1 + .../src/deepspeedometer/__init__.py | 2 + .../src/deepspeedometer/arg_parsing.py | 51 +++ .../src/deepspeedometer/benchmark_runner.py | 390 ++++++++++++++++++ .../src/deepspeedometer/clients/__init__.py | 19 + .../clients/azure_ml_client.py | 79 ++++ .../src/deepspeedometer/clients/base.py | 30 ++ .../deepspeedometer/clients/dummy_client.py | 45 ++ .../deepspeedometer/clients/fastgen_client.py | 91 ++++ .../deepspeedometer/clients/vllm_client.py | 88 ++++ .../src/deepspeedometer/config.py | 13 + .../src/deepspeedometer/prompt.py | 117 ++++++ .../src/deepspeedometer/response.py | 16 + .../src/deepspeedometer/sample_input.py | 225 ++++++++++ .../inference/deepspeedometer/tests/README.md | 3 + .../deepspeedometer/tests/__init__.py | 0 .../deepspeedometer/tests/conftest.py | 95 +++++ .../deepspeedometer/tests/test_benchmark.py | 17 + .../deepspeedometer/tests/test_config.py | 32 ++ .../deepspeedometer/tests/test_early_stop.py | 23 ++ .../deepspeedometer/tests/test_prompt.py | 15 + 27 files changed, 1486 insertions(+) create mode 100644 benchmarks/inference/deepspeedometer/README.md create mode 100644 benchmarks/inference/deepspeedometer/configs/128k-120.yaml create mode 100644 benchmarks/inference/deepspeedometer/configs/1300-120.yaml create mode 100644 benchmarks/inference/deepspeedometer/configs/2600-60.yaml create mode 100644 benchmarks/inference/deepspeedometer/configs/500-500.yaml create mode 100644 benchmarks/inference/deepspeedometer/pyproject.toml create mode 100644 benchmarks/inference/deepspeedometer/run_example.sh create mode 100644 benchmarks/inference/deepspeedometer/src/deepspeedometer/__init__.py create mode 100644 benchmarks/inference/deepspeedometer/src/deepspeedometer/arg_parsing.py create mode 100644 benchmarks/inference/deepspeedometer/src/deepspeedometer/benchmark_runner.py create mode 100644 benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/__init__.py create mode 100644 benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/azure_ml_client.py create mode 100644 benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/base.py create mode 100644 benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/dummy_client.py create mode 100644 benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/fastgen_client.py create mode 100644 benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/vllm_client.py create mode 100644 benchmarks/inference/deepspeedometer/src/deepspeedometer/config.py create mode 100644 benchmarks/inference/deepspeedometer/src/deepspeedometer/prompt.py create mode 100644 benchmarks/inference/deepspeedometer/src/deepspeedometer/response.py create mode 100644 benchmarks/inference/deepspeedometer/src/deepspeedometer/sample_input.py create mode 100644 benchmarks/inference/deepspeedometer/tests/README.md create mode 100644 benchmarks/inference/deepspeedometer/tests/__init__.py create mode 100644 benchmarks/inference/deepspeedometer/tests/conftest.py create mode 100644 benchmarks/inference/deepspeedometer/tests/test_benchmark.py create mode 100644 benchmarks/inference/deepspeedometer/tests/test_config.py create mode 100644 benchmarks/inference/deepspeedometer/tests/test_early_stop.py create mode 100644 
benchmarks/inference/deepspeedometer/tests/test_prompt.py
diff --git a/benchmarks/inference/deepspeedometer/README.md b/benchmarks/inference/deepspeedometer/README.md
new file mode 100644
index 000000000..7c165c57d
--- /dev/null
+++ b/benchmarks/inference/deepspeedometer/README.md
@@ -0,0 +1,85 @@
+# DeepSpeedometer
+
+This benchmark is designed to measure the performance of LLM serving solutions. Using a different number of parallel clients sending requests to an inference server, we gather data to plot throughput-latency curves and find the saturation point of an inference server, which indicates its maximum performance.
+
+## Installation
+
+To install the benchmark, clone this repository and install using `pip`:
+```shell
+git clone https://github.com/Microsoft/DeepSpeedExamples
+cd ./DeepSpeedExamples/benchmarks/inference/deepspeedometer
+pip install .
+```
+
+## Usage
+
+To quickly test the benchmark code without creating an inference server, run the following:
+```
+python3 -m deepspeedometer.benchmark_runner --model facebook/opt-125m --api dummy
+```
+
+### Supported APIs
+
+The benchmark supports different APIs, each with its own client type. Depending on the client, you may need to run the benchmark against a locally hosted inference server or a remote inference server. Adding support for new serving solutions can be achieved by creating a new client class that defines a few basic methods. See the section below on adding new clients for more information.
+
+The clients (i.e., APIs) currently supported (and the configuration options for each) are listed below. You can see more information about the configuration options by looking at the `*ClientConfig` classes located in `clients/*.py`:
+
+1. `fastgen`: Runs a local model inference server with DeepSpeed's FastGen. Config options include:
+   - `model`: Which model to use for serving (required)
+   - `deployment_name`: Name of the deployment server
+   - `tp_size`: Tensor parallel size for each model replica
+   - `num_replicas`: Number of model replicas
+   - `max_ragged_batch_size`: Max number of requests running per model replica
+   - `quantization_mode`: Type of quantization to use
+2. `vllm`: Runs a local model inference server with vLLM.
+   - `model`: Which model to use for serving (required)
+   - `tp_size`: Tensor parallel size for the model
+   - `port`: Which port to use for the REST API
+3. `azureml`: Interfaces with a remote AzureML online endpoint/deployment.
+   - `api_url`: AzureML endpoint API URL (required)
+   - `api_key`: AzureML token key for connecting to the endpoint (required)
+   - `deployment_name`: Name of the deployment hosted in the given endpoint (required)
+
+### Benchmark Configuration
+
+The benchmark has many options for tailoring performance measurements to specific use-cases. For additional information and default values, see the `BenchmarkConfig` class defined in `benchmark_runner.py`.
+
+- `api`: Which API to use
+- `warmup_requests`: Number of warm-up requests to run before measuring performance
+- `result_dir`: Directory where results will be written out (as JSON files)
+- `use_threading`: Whether to use threading for the benchmark clients. Default is to use multi-processing
+- `config_file`: One or more config YAML files that contain values for any of the prompt configuration options (see the section below on prompt configuration)
+- `num_clients`: One or more integer values for the number of parallel clients to run
+- `num_requests_per_client`: Number of requests that will be run by each of the parallel clients
+- `min_requests`: Minimum number of requests to be sent during the benchmark. Useful when there is a low number of clients, to ensure a good measurement.
+- `prompt_text_source`: Text file or string that will be sampled to generate request prompts
+- `early_stop_latency`: When running multiple values for `num_clients`, if the average latency per request exceeds this value (in seconds) the benchmark will not test a larger number of parallel clients
+- `force`: Force the overwrite of result files. By default, if a result file exists, the benchmark is skipped
+
+### Prompt Configuration
+
+These options allow users to modify the prompt input and generation behavior of the served models. Note that you can run multiple prompt configurations in a single command by using the `config_file` input as described in the Benchmark Configuration section.
+
+- `model`: Which model to use for tokenizing prompts (required)
+- `prompt_generator_seed`: Seed value for random number generation
+- `max_prompt_length`: The maximum prompt length allowed
+- `prompt_length`: Target mean prompt length
+- `prompt_length_var`: Variance of generated prompt lengths
+- `max_new_tokens`: Target mean number of generated tokens
+- `max_new_tokens_var`: Variance of generated tokens
+- `streaming`: Whether to enable streaming output for generated tokens
+
+#### About Prompt Generation
+
+To mimic real-world serving scenarios, this benchmark samples prompt length and generated token length values from a normal distribution. This distribution can be manipulated with the `prompt_length*` and `max_new_tokens*` values in the prompt configuration. To get all prompt lengths and generation lengths to match exactly, set the `*_var` values to 0.
+
+## Adding New Client APIs
+
+The DeepSpeedometer benchmark was designed to make it easy to add support for new inference server solutions. To do so:
+
+1. Create a new `*_client.py` file in the `clients/` directory.
+2. Define a `*Client` class that inherits from the `BaseClient` class in `clients/base.py`. This class should define 5 methods: `start_service`, `stop_service`, `prepare_request`, `send_request`, and `process_response`. Take a look at the type hints for these methods in the `BaseClient` class to understand the expected inputs and outputs for each method.
+3. Define a `*ClientConfig` class that inherits from the `BaseConfigModel` class. Place any configuration options (i.e., user-passed command line arguments) necessary for your defined `*Client` class here.
+4. Import the newly added `*Client` and `*ClientConfig` into `clients/__init__.py` and add them to the `client_config_classes` and `client_classes` dictionaries.
+
+For the simplest example of adding a new client, take a look at the `clients/dummy_client.py` file, where we have defined a client that does not stand up a server and only returns a sample of the input prompt after a short sleep cycle. We use this as a light-weight class for unit testing.
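+
+As a rough, illustrative sketch (the file and class names below are invented, the exact method signatures should be taken from `clients/base.py`, and the `Prompt` object is assumed to expose `text` and `max_new_tokens` fields), a minimal client could look something like this:
+
+```python
+# clients/echo_client.py -- hypothetical minimal client, in the spirit of dummy_client.py
+import time
+from typing import Any, Dict
+
+from .base import BaseClient
+from ..config import BaseConfigModel
+from ..prompt import Prompt
+
+
+class EchoClientConfig(BaseConfigModel):
+    model: str = ""           # accepted so --model can be passed through unchanged
+    echo_delay: float = 0.05  # artificial per-request latency in seconds
+
+
+class EchoClient(BaseClient):
+    def __init__(self, config: EchoClientConfig) -> None:
+        self.config = config
+
+    def start_service(self) -> None:
+        pass  # nothing to launch for this toy backend
+
+    def stop_service(self) -> None:
+        pass
+
+    def prepare_request(self, prompt: Prompt) -> Dict[str, Any]:
+        return {"text": prompt.text, "max_new_tokens": prompt.max_new_tokens}
+
+    def send_request(self, request_kwargs: Dict[str, Any]) -> Any:
+        time.sleep(self.config.echo_delay)  # stand-in for a real inference call
+        return request_kwargs["text"]
+
+    def process_response(self, raw_response: Any) -> str:
+        return raw_response
+```
+
+These two classes would then be registered in the `client_config_classes` and `client_classes` dictionaries in `clients/__init__.py`, as described in step 4 above.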
\ No newline at end of file diff --git a/benchmarks/inference/deepspeedometer/configs/128k-120.yaml b/benchmarks/inference/deepspeedometer/configs/128k-120.yaml new file mode 100644 index 000000000..574e8e05e --- /dev/null +++ b/benchmarks/inference/deepspeedometer/configs/128k-120.yaml @@ -0,0 +1,5 @@ +prompt_length: 128000 +prompt_length_var: 0.1 +max_prompt_length: 131072 +max_new_tokens: 120 +max_new_tokens_var: 0.3 diff --git a/benchmarks/inference/deepspeedometer/configs/1300-120.yaml b/benchmarks/inference/deepspeedometer/configs/1300-120.yaml new file mode 100644 index 000000000..874a24c27 --- /dev/null +++ b/benchmarks/inference/deepspeedometer/configs/1300-120.yaml @@ -0,0 +1,4 @@ +prompt_length: 1300 +prompt_lenght_var: 0.3 +max_new_tokens: 120 +max_new_tokens_var: 0.3 diff --git a/benchmarks/inference/deepspeedometer/configs/2600-60.yaml b/benchmarks/inference/deepspeedometer/configs/2600-60.yaml new file mode 100644 index 000000000..f7674f819 --- /dev/null +++ b/benchmarks/inference/deepspeedometer/configs/2600-60.yaml @@ -0,0 +1,4 @@ +prompt_length: 2600 +prompt_lenght_var: 0.3 +max_new_tokens: 60 +max_new_tokens_var: 0.3 diff --git a/benchmarks/inference/deepspeedometer/configs/500-500.yaml b/benchmarks/inference/deepspeedometer/configs/500-500.yaml new file mode 100644 index 000000000..72389b37d --- /dev/null +++ b/benchmarks/inference/deepspeedometer/configs/500-500.yaml @@ -0,0 +1,4 @@ +prompt_length: 500 +prompt_lenght_var: 0.3 +max_new_tokens: 500 +max_new_tokens_var: 0.3 diff --git a/benchmarks/inference/deepspeedometer/pyproject.toml b/benchmarks/inference/deepspeedometer/pyproject.toml new file mode 100644 index 000000000..c15a27035 --- /dev/null +++ b/benchmarks/inference/deepspeedometer/pyproject.toml @@ -0,0 +1,32 @@ +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" +[project] +name = "deepspeedometer" +version = "0.0.1" +authors = [ + { name="Ammar Ahmad Awan", email="ammar.awan@microsoft.com" }, + { name="Arash Bakhitiari", email="abakhtiari@microsoft.com" }, + { name="Connor Holmes"}, + { name="Lev Kurilenko", email="lev.kurilenko@microsoft.com" }, + { name="Heyang Qin", email="heyangqin@microsoft.com" }, + { name="Masahiro Tanaka", email="mtanaka@microsoft.com" }, + { name="Michael Wyatt", email="michaelwyatt@microsoft.com" }, +] +description = "LLM benchmarking tool" +readme = "README.md" +requires-python = ">=3.8" +classifiers = [ + "Programming Language :: Python :: 3", +] +dependencies = [ + "loguru", + "pydantic>=2.0.0", + "torch", + "tqdm", + "transformers", +] + +[project.urls] +Homepage = "https://github.com/Microsoft/DeepSpeedExamples/tree/master/benchmarks/inference/deepspeedometer" +Issues = "https://github.com/Microsoft/DeepSpeedExamples/issues" diff --git a/benchmarks/inference/deepspeedometer/run_example.sh b/benchmarks/inference/deepspeedometer/run_example.sh new file mode 100644 index 000000000..42fef231d --- /dev/null +++ b/benchmarks/inference/deepspeedometer/run_example.sh @@ -0,0 +1 @@ +python -m src.deepspeedometer.benchmark_runner --model "facebook/opt-125m" --api dummy --config_file ./configs/1300-120.yaml diff --git a/benchmarks/inference/deepspeedometer/src/deepspeedometer/__init__.py b/benchmarks/inference/deepspeedometer/src/deepspeedometer/__init__.py new file mode 100644 index 000000000..32cb0a0f9 --- /dev/null +++ b/benchmarks/inference/deepspeedometer/src/deepspeedometer/__init__.py @@ -0,0 +1,2 @@ +from .arg_parsing import parse_args_to_configs +from .benchmark_runner import 
BenchmarkRunner diff --git a/benchmarks/inference/deepspeedometer/src/deepspeedometer/arg_parsing.py b/benchmarks/inference/deepspeedometer/src/deepspeedometer/arg_parsing.py new file mode 100644 index 000000000..8be6d0d42 --- /dev/null +++ b/benchmarks/inference/deepspeedometer/src/deepspeedometer/arg_parsing.py @@ -0,0 +1,51 @@ +import argparse +from typing import List, Tuple + +from .benchmark_runner import BenchmarkConfig +from .clients import client_config_classes +from .config import BaseConfigModel + + +def parse_args_to_configs(args: List[str]) -> Tuple[BenchmarkConfig, BaseConfigModel]: + def add_model(parser: argparse.ArgumentParser, model: BaseConfigModel): + """Adds fields from pydantic model to the parser.""" + for name, field in model.model_fields.items(): + field_type = field.annotation + + # Get information about number of arguments expected + nargs = None + if getattr(field.annotation, "_name", "") == "List": + nargs = "+" + field_type = field.annotation.__args__[0] + + # Add field to parser + parser.add_argument( + f"--{name}", + dest=name, + nargs=nargs, + type=field_type, + required=getattr(field, "required", False), + default=getattr(field, "default", None), + help=getattr(field, "description", ""), + ) + + # Parse benchmark config fields + parser = argparse.ArgumentParser(allow_abbrev=False) + add_model(parser, BenchmarkConfig) + benchmark_args, remaining_args = parser.parse_known_args(args) + benchmark_config = BenchmarkConfig(**vars(benchmark_args)) + unused_args = set(remaining_args) + + # Parse client config fields + client_config_class = client_config_classes[benchmark_config.api] + parser = argparse.ArgumentParser(allow_abbrev=False) + add_model(parser, client_config_class) + client_args, remaining_args = parser.parse_known_args(args) + client_config = client_config_class(**vars(client_args)) + + # Check for unused arguments + unused_args = unused_args.intersection(remaining_args) + if unused_args: + raise ValueError(f"Unused arguments: {unused_args}") + + return benchmark_config, client_config diff --git a/benchmarks/inference/deepspeedometer/src/deepspeedometer/benchmark_runner.py b/benchmarks/inference/deepspeedometer/src/deepspeedometer/benchmark_runner.py new file mode 100644 index 000000000..96dd3a0da --- /dev/null +++ b/benchmarks/inference/deepspeedometer/src/deepspeedometer/benchmark_runner.py @@ -0,0 +1,390 @@ +import itertools +import json +import multiprocessing +import os +import queue +import sys +import threading +import time +import yaml +from pathlib import Path +from typing import List, Iterable, Tuple + +from loguru import logger +from tqdm import tqdm + +from .clients import client_classes, BaseClient +from .config import BaseConfigModel +from .prompt import Prompt, PromptConfig, PromptGenerator +from .response import Response +from .sample_input import sample_input_text + + +class BenchmarkConfig(PromptConfig): + api: str = "azure_ml" + """ Which API to use for benchmarking. New APIs can be added by creating a new client class in the `clients` directory. """ + + warmup_requests: int = 1 + """ Number of requests to run (per client) as a warm-up before starting the benchmark. """ + + result_dir: Path = Path("./results") + """ Top directory where results will be saved. """ + + use_threading: bool = False + """ Whether to use threading or multiprocessing for parallel client requests. Default is multiprocessing. """ + + config_file: List[Path] = [] + """ Path to YAML file(s) containing benchmark configuration settings. 
""" + + num_clients: List[int] = [1, 2, 4, 6, 8, 12, 16, 20, 24, 28, 32] + """ Number of clients to run in parallel. """ + + num_requests_per_client: int = 16 + """ Number of requests to run per client. """ + + min_requests: int = 128 + """ Minimum number of request to create (regardless of num_requests_per_client). """ + + prompt_text_source: str = sample_input_text + """ Text file or string to use for generated prompts. """ + + early_stop_latency: float = 10.0 + """ Maximum mean latency (in seconds) to allow before stopping the benchmark early. """ + + force: bool = False + """ Whether to overwrite existing result files. """ + + +class ClientLauncher: + def __init__( + self, + client_class: BaseClient, + client_config: BaseConfigModel, + warmup_requests: int, + use_threading: bool, + prompt_generator: PromptGenerator, + ): + self.client_class = client_class + self.client_config = client_config + self.client_obj = client_class(client_config) + self.warmup_requests = warmup_requests + self.prompt_generator = prompt_generator + + if use_threading: + self.runnable_cls = threading.Thread + self.barrier_cls = threading.Barrier + self.queue_cls = queue.Queue + else: + self.runnable_cls = multiprocessing.Process + self.barrier_cls = multiprocessing.Barrier + self.queue_cls = multiprocessing.Queue + + def run_parallel_clients(self, num_clients: int) -> None: + logger.info(f"Launching {num_clients} client(s)") + + total_requests = self.request_queue.qsize() + + self.barrier = self.barrier_cls(num_clients + 1) + processes = [ + self.runnable_cls( + target=self._run_client, + args=( + i, + self.barrier, + self.request_queue, + self.response_queue, + self.client_class, + self.client_config, + self.warmup_requests, + ), + ) + for i in range(num_clients) + ] + for p in processes: + p.start() + + self.barrier.wait() # Barrier 1 for master process + + self._progress_bar(total_requests - self.warmup_requests * num_clients) + + self.barrier.wait() # Barrier 2 for master process + + def _progress_bar(self, total_requests: int) -> None: + pbar = tqdm(total=total_requests) + num_responses = 0 + while num_responses != total_requests: + num_responses = self.response_queue.qsize() + pbar.update(num_responses - pbar.n) + time.sleep(1) + pbar.close() + + @staticmethod + def _run_client( + client_id: int, + barrier: multiprocessing.Barrier, + request_queue: multiprocessing.Queue, + response_queue: multiprocessing.Queue, + client_class: BaseClient, + client_config: BaseConfigModel, + warmup_requests: int, + ): + client = client_class(client_config) + + for _ in range(warmup_requests): + prompt = request_queue.get(timeout=1.0) + _ = client.send_request(prompt.request_kwargs) + + barrier.wait() # Barrier 1 for client process + try: + while True: + prompt = request_queue.get(timeout=1.0) + start_time = time.time() + raw_response = client.send_request(prompt.request_kwargs) + end_time = time.time() + request_time = end_time - start_time + response = Response( + prompt_text=prompt.text, + prompt_tokens=prompt.num_tokens, + raw_response=raw_response, + request_time=request_time, + client_id=client_id, + ) + response_queue.put_nowait(response) + except queue.Empty: + pass + + barrier.wait() # Barrier 2 for client process + + def add_request(self, prompt: Prompt) -> None: + request_kwargs = self.client_obj.prepare_request(prompt) + prompt.request_kwargs = request_kwargs + self.request_queue.put(prompt) + + def get_response(self) -> Response: + response = self.response_queue.get(timeout=1.0) + processed_response = 
self.client_obj.process_response(response.raw_response) + response.generated_output = processed_response + response.generated_tokens = self.prompt_generator.count_tokens( + processed_response + ) + return response + + def clear_queues(self) -> None: + self.request_queue = self.queue_cls() + self.response_queue = self.queue_cls() + + def start_service(self) -> None: + self.client_obj.start_service() + + def stop_service(self) -> None: + self.client_obj.stop_service() + + +class BenchmarkRunner: + def __init__( + self, benchmark_config: BaseConfigModel, client_config: BaseConfigModel + ) -> None: + logger.info("Initializing Benchmark Runner") + self.config = benchmark_config + self.client_config = client_config + self.client_class = client_classes[self.config.api] + self.prompt_generator = PromptGenerator( + self.config.model, self.config.prompt_text_source + ) + self.client_launcher = ClientLauncher( + client_class=self.client_class, + client_config=self.client_config, + warmup_requests=self.config.warmup_requests, + use_threading=self.config.use_threading, + prompt_generator=self.prompt_generator, + ) + self.all_responses = [] + + def _benchmark_settings(self) -> Iterable[Tuple[List[int], PromptConfig]]: + prompt_config_keys = list(PromptConfig.model_fields.keys()) + + configs_list = [] + for f in self.config.config_file: + logger.info(f"Generating benchmark run settings from config file: {f}") + with open(f, "r") as fh: + file_config = yaml.safe_load(fh) + + # Get any prompt config values stored in config files + for key in prompt_config_keys + ["num_clients"]: + if key not in file_config: + file_config[key] = getattr(self.config, key) + configs_list.append(file_config) + + if not configs_list: + logger.info("Generating benchmark run settings from command line args") + configs_list.append( + { + key: getattr(self.config, key) + for key in prompt_config_keys + ["num_clients"] + } + ) + + all_config_product = [] + for config in configs_list: + # Ensure all config values are iterable types (i.e., list or tuple) + for k, v in config.items(): + if not isinstance(v, (list, tuple)): + config[k] = [v] + + # We treat num_clients differently to enable early stopping + num_clients = config.pop("num_clients") + + # Generate all possible combinations of prompt config values + for vals in itertools.product(*[config[k] for k in prompt_config_keys]): + config_product = {k: v for k, v in zip(prompt_config_keys, vals)} + config_product["num_clients"] = num_clients + all_config_product.append(config_product) + + logger.info(f"Generated {len(all_config_product)} benchmark run setting(s)") + + for config in all_config_product: + num_clients = config.pop("num_clients") + prompt_config = PromptConfig(**config) + yield num_clients, prompt_config + + def _generate_requests(self, prompt_config: PromptConfig, num_clients: int) -> None: + logger.info("Generating Prompts") + + warmup_prompts = self.config.warmup_requests * num_clients + workload_prompts = max( + self.config.min_requests, self.config.num_requests_per_client * num_clients + ) + for prompt in self.prompt_generator( + config=prompt_config, num_prompts=warmup_prompts + workload_prompts + ): + self.client_launcher.add_request(prompt) + + logger.info( + f"Generated {warmup_prompts} warmup and {workload_prompts} workload prompts." 
+ ) + + def _get_output_dir(self) -> Path: + return self.config.result_dir / self.config.api / self.config.model + + def _get_output_path(self, prompt_config: PromptConfig, num_clients: int) -> Path: + output_dir = self._get_output_dir() + output_file = f"prompt{prompt_config.prompt_length}_gen{prompt_config.max_new_tokens}_clients{num_clients}.json" + return output_dir / output_file + + def _process_responses( + self, prompt_config: PromptConfig, num_clients: int + ) -> List[Response]: + output_path = self._get_output_path( + prompt_config=prompt_config, num_clients=num_clients + ) + + logger.info(f"Saving results to {output_path}") + + all_responses = [] + while True: + try: + all_responses.append(self.client_launcher.get_response()) + except queue.Empty: + break + + os.makedirs(output_path.parent, exist_ok=True) + with open(output_path, "w") as fh: + json.dump([r.to_dict() for r in all_responses], fh, indent=2) + + logger.info(f"Saved {len(all_responses)} responses to {output_path}") + + return all_responses + + def _print_result_summary( + self, all_responses: List[Response], num_clients: int + ) -> None: + num_responses = int(len(all_responses)) + mean_latency = sum([r.request_time for r in all_responses]) / num_responses + query_throughput = num_clients / mean_latency + mean_prompt_length = int( + sum([r.prompt_tokens for r in all_responses]) / num_responses + ) + mean_gen_length = int( + sum([r.generated_tokens for r in all_responses]) / num_responses + ) + logger.info( + f"Result summary - # Requests: {num_responses:d}, Mean Prompt Length: {mean_prompt_length:d} tokens, Mean Generation Length: {mean_gen_length:d} tokens, Mean Latency: {mean_latency:.2f} s, Throughput: {query_throughput:.2f} queries/s," + ) + + def _check_early_stop(self, all_responses: List[Response]) -> bool: + if not all_responses: + return False + mean_latency = sum([r.request_time for r in all_responses]) / len(all_responses) + if mean_latency >= self.config.early_stop_latency: + logger.info( + f"Mean latency of {mean_latency:.2f} exceeds early stopping threshold of {self.config.early_stop_latency}. Stopping early." + ) + return True + return False + + def _skip_existing_result( + self, prompt_config: PromptConfig, num_clients: int + ) -> bool: + output_path = self._get_output_path( + prompt_config=prompt_config, num_clients=num_clients + ) + if output_path.exists(): + if self.config.force: + logger.info( + f"Result already exists, but force flag is set. 
Overwriting benchmark with {num_clients} client(s) and prompt config: {prompt_config}" + ) + return False + else: + logger.info( + f"Result already exists, skipping benchmark with {num_clients} client(s) and prompt config: {prompt_config}" + ) + return True + return False + + def run(self) -> None: + # Start the client service + self.client_launcher.start_service() + + # Generate all benchmark settings from user config(s) + for num_clients_list, prompt_config in self._benchmark_settings(): + all_responses = [] + for num_clients in sorted(num_clients_list): + if self._skip_existing_result( + prompt_config=prompt_config, num_clients=num_clients + ): + continue + + if self._check_early_stop(all_responses): + break + + logger.info( + f"Running benchmark with {num_clients} client(s) and prompt config: {prompt_config}" + ) + # Clear out queues and generate request prompts + self.client_launcher.clear_queues() + self._generate_requests( + prompt_config=prompt_config, num_clients=num_clients + ) + + # Launch the clients and process requests + self.client_launcher.run_parallel_clients(num_clients=num_clients) + + # Process raw responses and save results to file + all_responses = self._process_responses( + prompt_config=prompt_config, num_clients=num_clients + ) + + self._print_result_summary( + all_responses=all_responses, num_clients=num_clients + ) + + # Stop the client service + self.client_launcher.stop_service() + + +if __name__ == "__main__": + from .arg_parsing import parse_args_to_configs + + benchmark_config, client_config = parse_args_to_configs(sys.argv[1:]) + benchmark_runner = BenchmarkRunner(benchmark_config, client_config) + benchmark_runner.run() diff --git a/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/__init__.py b/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/__init__.py new file mode 100644 index 000000000..a52c3618b --- /dev/null +++ b/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/__init__.py @@ -0,0 +1,19 @@ +from .base import BaseClient + +from .azure_ml_client import AzureMLClientConfig, AzureMLClient +from .dummy_client import DummyClientConfig, DummyClient +from .fastgen_client import FastGenClientConfig, FastGenClient +from .vllm_client import vLLMClientConfig, vLLMClient + +client_config_classes = { + "dummy": DummyClientConfig, + "azure_ml": AzureMLClientConfig, + "fastgen": FastGenClientConfig, + "vllm": vLLMClientConfig, +} +client_classes = { + "dummy": DummyClient, + "azure_ml": AzureMLClient, + "fastgen": FastGenClient, + "vllm": vLLMClient, +} diff --git a/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/azure_ml_client.py b/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/azure_ml_client.py new file mode 100644 index 000000000..5bedff692 --- /dev/null +++ b/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/azure_ml_client.py @@ -0,0 +1,79 @@ +import json +import requests +from typing import Any, Dict + +from loguru import logger + +from .base import BaseClient +from ..config import BaseConfigModel +from ..prompt import Prompt + + +class AzureMLClientConfig(BaseConfigModel): + api_url: str = "" + """ URL for the AzureML REST API. """ + + api_key: str = "" + """ REST API key for the AzureML deployment. """ + + deployment_name: str = "" + """ Name of the AzureML deployment. 
""" + + +class AzureMLClient(BaseClient): + def __init__(self, config: AzureMLClientConfig) -> None: + super().__init__(config) + self.api_url = config.api_url + self.api_key = config.api_key + self.deployment_name = config.deployment_name + + def start_service(self) -> None: + # Verify that the server exists, this could be extended to actually + # start an AML deployment. However currently we assume one exists. + test_prompt = Prompt("hello world", num_tokens=5, max_new_tokens=16) + _ = self.process_response(self.send_request(self.prepare_request(test_prompt))) + + def stop_service(self) -> None: + pass + + def prepare_request(self, prompt: Prompt) -> Dict[str, Any]: + # TODO: add support for OpenAI chat completion template + if prompt.streaming: + raise ValueError("AzureMLClient does not support streaming prompts.") + + headers = { + "Content-Type": "application/json", + "Authorization": ("Bearer " + self.api_key), + "azureml-model-deployment": self.deployment_name, + } + pload = { + "input_data": { + "input_string": [ + prompt.text, + ], + "parameters": { + "max_tokens": prompt.max_new_tokens, + "return_full_text": prompt.return_full_text, + }, + } + } + return {"url": self.api_url, "headers": headers, "json": pload, "timeout": 180} + + def send_request(self, request_kwargs: Dict[str, Any]) -> Any: + while True: + try: # Sometimes the AML endpoint will return an error, so we send the request again + response = requests.post(**request_kwargs) + output = json.loads(response.content) + assert ( + response.status_code == 200 + ), f"Status code: {response.status_code}" + assert output[0]["0"], f"Empty response" + break + except Exception as e: + logger.debug(f"Connection failed with {e}. Retrying AML request") + + return output + + def process_response(self, raw_response: Any) -> str: + response_text = raw_response[0]["0"] + return response_text diff --git a/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/base.py b/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/base.py new file mode 100644 index 000000000..40a38e057 --- /dev/null +++ b/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/base.py @@ -0,0 +1,30 @@ +from abc import ABC, abstractmethod +from typing import Any, Dict + +from ..config import BaseConfigModel +from ..prompt import Prompt + + +class BaseClient(ABC): + def __init__(self, config: BaseConfigModel) -> None: + self.config = config + + @abstractmethod + def start_service(self) -> None: + pass + + @abstractmethod + def stop_service(self) -> None: + pass + + @abstractmethod + def prepare_request(self, prompt: Prompt) -> Dict[str, Any]: + pass + + @abstractmethod + def send_request(self, request_kwargs: Dict[str, Any]) -> Any: + pass + + @abstractmethod + def process_response(self, raw_response: Any) -> str: + pass diff --git a/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/dummy_client.py b/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/dummy_client.py new file mode 100644 index 000000000..f10b1e94e --- /dev/null +++ b/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/dummy_client.py @@ -0,0 +1,45 @@ +import time +import random +from typing import Any, Dict + +from transformers import AutoTokenizer + +from .base import BaseClient +from ..config import BaseConfigModel +from ..prompt import Prompt + + +class DummyClientConfig(BaseConfigModel): + model: str + dummy_client_latency_time: float = 0.1 + + +class DummyClient(BaseClient): + def __init__(self, config: DummyClientConfig) -> 
None: + super().__init__(config) + self.tokenizer = AutoTokenizer.from_pretrained(self.config.model) + self.latency_time = config.dummy_client_latency_time + + def start_service(self) -> None: + pass + + def stop_service(self) -> None: + pass + + def prepare_request(self, prompt: Prompt) -> Dict[str, Any]: + return {"input_text": prompt.text, "max_new_tokens": prompt.max_new_tokens} + + def send_request(self, request_kwargs: Dict[str, Any]) -> Any: + time.sleep( + abs(random.uniform(self.latency_time - 0.1, self.latency_time + 0.2)) + ) + output_text = self.tokenizer.decode( + random.choices( + self.tokenizer.encode(request_kwargs["input_text"]), + k=request_kwargs["max_new_tokens"], + ) + ) + return output_text + + def process_response(self, raw_response: Any) -> str: + return raw_response diff --git a/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/fastgen_client.py b/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/fastgen_client.py new file mode 100644 index 000000000..c3f3a086f --- /dev/null +++ b/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/fastgen_client.py @@ -0,0 +1,91 @@ +import time +from typing import Any, Dict, Optional + +from loguru import logger +from pydantic import Field + +from .base import BaseClient +from ..config import BaseConfigModel +from ..prompt import Prompt + + +class FastGenClientConfig(BaseConfigModel): + model: str = Field(..., description="HuggingFace.co model name") + deployment_name: str = "fastgen-benchmark-deployment" + tp_size: int = 1 + num_replicas: int = 1 + max_ragged_batch_size: int = 768 + quantization_mode: Optional[str] = None + + +class FastGenClient(BaseClient): + def __init__(self, config: FastGenClientConfig): + super().__init__(config) + try: + import mii + except ImportError as e: + logger.error( + "Please install the `deepspeed-mii` package to use this client." 
+ ) + raise e + + self.mii_client = mii.client(config.deployment_name) + self.streaming = getattr(config, "streaming", False)  # streaming is not a FastGenClientConfig field; default to non-streaming if unset + + def start_service(self) -> None: + import mii + from deepspeed.inference import RaggedInferenceEngineConfig, DeepSpeedTPConfig + from deepspeed.inference.v2.ragged import DSStateManagerConfig + + tp_config = DeepSpeedTPConfig(tp_size=self.config.tp_size) + mgr_config = DSStateManagerConfig( + max_ragged_batch_size=self.config.max_ragged_batch_size, + max_ragged_sequence_count=self.config.max_ragged_batch_size, + ) + inference_config = RaggedInferenceEngineConfig( + tensor_parallel=tp_config, state_manager=mgr_config + ) + mii.serve( + self.config.model, + deployment_name=self.config.deployment_name, + tensor_parallel=self.config.tp_size, + inference_engine_config=inference_config, + replica_num=self.config.num_replicas, + quantization_mode=self.config.quantization_mode, + ) + + def stop_service(self) -> None: + import mii + + mii.client(self.config.deployment_name).terminate_server() + + def _streaming_callback(self, raw_response) -> None: + self.streaming_response_tokens.append(raw_response[0].generated_text) + time_now = time.time() + self.streaming_token_gen_time.append(time_now - self.streaming_time_last_token) + self.streaming_time_last_token = time_now + + def prepare_request(self, prompt: Prompt) -> Dict[str, Any]: + request_kwargs = { + "prompts": prompt.text, + "max_new_tokens": prompt.max_new_tokens, + } + if self.streaming: + self.streaming_response_tokens = [] + self.streaming_token_gen_time = [] + self.streaming_time_last_token = None + request_kwargs["streaming_fn"] = self._streaming_callback + return request_kwargs + + def send_request(self, request_kwargs: Dict[str, Any]) -> Any: + if self.streaming: + self.streaming_time_last_token = time.time() + response = self.mii_client(**request_kwargs) + if self.streaming: + response = self.streaming_response_tokens + + return response + + def process_response(self, raw_response: Any) -> str: + if not self.streaming: + return raw_response[0].generated_text + # In streaming mode, raw_response is the list of streamed text chunks + return "".join(raw_response) diff --git a/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/vllm_client.py b/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/vllm_client.py new file mode 100644 index 000000000..563c66e9d --- /dev/null +++ b/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/vllm_client.py @@ -0,0 +1,88 @@ +import json +import requests +import subprocess +import time +from typing import Any, Dict + +from loguru import logger +from pydantic import Field + +from .base import BaseClient +from ..config import BaseConfigModel +from ..prompt import Prompt + + +class vLLMClientConfig(BaseConfigModel): + model: str = Field(..., description="HuggingFace.co model name") + tp_size: int = 1 + port: int = 26500 + + +class vLLMClient(BaseClient): + def __init__(self, config: vLLMClientConfig): + super().__init__(config) + try: + import vllm + except ImportError as e: + logger.error("Please install the `vllm` package to use this client.") + raise e + + def start_service(self) -> None: + vllm_cmd = ( + "python", + "-m", + "vllm.entrypoints.api_server", + "--host", + "127.0.0.1", + "--port", + str(self.config.port), + "--tensor-parallel-size", + str(self.config.tp_size), + "--model", + self.config.model, + ) + p = subprocess.Popen( + vllm_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, close_fds=True + ) + start_time = time.time() + timeout_after = 60 * 5 # 5 minutes + while True: + line = p.stderr.readline().decode("utf-8") + if "Application startup complete" in line: + break + 
if "error" in line.lower(): + p.terminate() + # self.stop_service(config) + raise RuntimeError(f"Error starting VLLM server: {line}") + if time.time() - start_time > timeout_after: + p.terminate() + # self.stop_service(config) + raise TimeoutError("Timed out waiting for VLLM server to start") + time.sleep(0.01) + + def stop_service(self) -> None: + vllm_cmd = ("pkill", "-f", "vllm.entrypoints.api_server") + p = subprocess.Popen(vllm_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p.wait() + + def prepare_request(self, prompt: Prompt) -> Dict[str, Any]: + api_url = "http://localhost:26500/generate" + headers = {"User-Agent": "Benchmark Client"} + pload = { + "prompt": prompt.text, + "n": 1, + "use_beam_search": False, + "temperature": 1.0, + "top_p": 0.9, + "max_tokens": prompt.max_new_tokens, + "ignore_eos": False, + } + return {"url": api_url, "headers": headers, "json": pload, "timeout": 180} + + def send_request(self, request_kwargs: Dict[str, Any]) -> Any: + response = requests.post(**request_kwargs) + output = json.loads(response.content) + return output + + def process_response(self, raw_response: Any) -> str: + return raw_response["text"] diff --git a/benchmarks/inference/deepspeedometer/src/deepspeedometer/config.py b/benchmarks/inference/deepspeedometer/src/deepspeedometer/config.py new file mode 100644 index 000000000..d524eb2cf --- /dev/null +++ b/benchmarks/inference/deepspeedometer/src/deepspeedometer/config.py @@ -0,0 +1,13 @@ +from pydantic import BaseModel, ConfigDict + + +class BaseConfigModel(BaseModel): + model_config = ConfigDict( + validate_default=True, + validate_assignment=False, + use_enum_values=True, + populate_by_name=True, + extra="forbid", + arbitrary_types_allowed=True, + protected_namespaces=(), + ) diff --git a/benchmarks/inference/deepspeedometer/src/deepspeedometer/prompt.py b/benchmarks/inference/deepspeedometer/src/deepspeedometer/prompt.py new file mode 100644 index 000000000..58bd82d0a --- /dev/null +++ b/benchmarks/inference/deepspeedometer/src/deepspeedometer/prompt.py @@ -0,0 +1,117 @@ +import os +from dataclasses import dataclass +from typing import Iterable, Optional +from typing_extensions import Self + +import numpy as np +import torch +from loguru import logger +from pydantic import model_validator +from transformers import AutoTokenizer + +from .config import BaseConfigModel + +# Avoids a warning from transformers +os.environ["TOKENIZERS_PARALLELISM"] = "false" + + +@dataclass +class Prompt: + text: str + num_tokens: int + max_new_tokens: int + streaming: bool = False + return_full_text: bool = False + request_kwargs: dict = None + + +class PromptConfig(BaseConfigModel): + model: str + """ Names of the model used to benchmark. Used to load the model/tokenizer from HuggingFace.co. """ + + prompt_generator_seed: Optional[int] = None + """ Seed value for prompt generator. """ + + max_prompt_length: int = 4000 + """ Maximum prompt length for any request. """ + + prompt_length: int = 2600 + """ Mean prompt length for requests. """ + + prompt_length_var: float = 0.3 + """ Variance of prompt length. """ + + max_new_tokens: int = 60 + """ Mean number of new tokens to generate in each request. """ + + max_new_tokens_var: float = 0.3 + """ Variance of new tokens to generate. """ + + streaming: bool = False + """ Whether to enable streaming mode for the client. 
""" + + @model_validator(mode="after") + def set_max_prompt_length(self) -> Self: + if self.prompt_length > self.max_prompt_length: + logger.warning( + f"Prompt length {self.prompt_length} is greater than max prompt length {self.max_prompt_length}. Setting max prompt length to {self.prompt_length}." + ) + self.max_prompt_length = max(self.max_prompt_length, self.prompt_length) + return self + + +class PromptGenerator: + def __init__(self, model: str, prompt_text_source: str) -> None: + self.tokenizer = AutoTokenizer.from_pretrained(model) + if os.path.isfile(prompt_text_source): + with open(prompt_text_source, "r") as f: + prompt_text_source = f.read() + self.input_text = prompt_text_source + self.tokenized_input = self.tokenizer.encode( + self.input_text, return_tensors="pt", padding=False + )[0] + + def count_tokens(self, text: str) -> int: + return len(self.tokenizer.encode(text)) + + def __call__(self, config: PromptConfig, num_prompts: int) -> Iterable[Prompt]: + tokenized_input = self.tokenized_input + if len(tokenized_input) < config.max_prompt_length: + tokenized_input = torch.cat( + [ + tokenized_input + for _ in range(config.max_prompt_length // len(tokenized_input) + 1) + ] + ).flatten() + + if config.prompt_generator_seed is not None: + np.random.seed(config.prompt_generator_seed) + + for _ in range(num_prompts): + # Take the absolute value here because sometimes the normal + # distribution will return a negative value. This is technically + # wrong, but works out OK for most scenarios. + prompt_length = min( + abs( + int( + np.random.normal( + config.prompt_length, + config.prompt_length * config.prompt_length_var, + ) + ) + ), + config.max_prompt_length, + ) + max_new_tokens = abs( + int( + np.random.normal( + config.max_new_tokens, + config.max_new_tokens * config.max_new_tokens_var, + ) + ) + ) + yield Prompt( + text=self.tokenizer.decode(tokenized_input[:prompt_length]), + num_tokens=prompt_length, + max_new_tokens=max_new_tokens, + ) diff --git a/benchmarks/inference/deepspeedometer/src/deepspeedometer/response.py b/benchmarks/inference/deepspeedometer/src/deepspeedometer/response.py new file mode 100644 index 000000000..3842ce5d7 --- /dev/null +++ b/benchmarks/inference/deepspeedometer/src/deepspeedometer/response.py @@ -0,0 +1,16 @@ +from dataclasses import asdict, dataclass +from typing import Any + + +@dataclass +class Response: + prompt_text: str = "" + prompt_tokens: int = 0 + generated_output: str = "" + generated_tokens: int = 0 + request_time: float = 0 + raw_response: Any = None + client_id: int = 0 + + def to_dict(self) -> dict: + return asdict(self) diff --git a/benchmarks/inference/deepspeedometer/src/deepspeedometer/sample_input.py b/benchmarks/inference/deepspeedometer/src/deepspeedometer/sample_input.py new file mode 100644 index 000000000..0754da724 --- /dev/null +++ b/benchmarks/inference/deepspeedometer/src/deepspeedometer/sample_input.py @@ -0,0 +1,225 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This is a sample input consisting of: +# Code & Text + +sample_input_text = """Deep learning involves the use of neural networks, which are computational models inspired by the structure and functioning of the human brain. These networks consist of interconnected nodes called neurons. Each neuron takes input, performs a computation, and produces an output. + During training, the neural network learns to make accurate predictions by adjusting its internal parameters. 
This adjustment is done using an optimization algorithm called gradient descent. Gradient descent calculates the gradients of a loss function, which measures the discrepancy between the predicted output of the network and the desired output. These gradients indicate the direction and magnitude of parameter updates that will minimize the loss. + The learning rate is an important hyperparameter in gradient descent. It determines the step size taken during parameter updates. A higher learning rate can lead to faster convergence, but it risks overshooting the optimal solution. On the other hand, a lower learning rate may converge more slowly, but it can result in more precise updates. + Activation functions are applied to the output of each neuron in a neural network. They introduce non-linearities, enabling the network to learn complex patterns and relationships in the data. Popular activation functions include the rectified linear unit (ReLU), sigmoid, and hyperbolic tangent (tanh). + By adjusting the parameters of the neural network during training, deep learning models learn to represent and generalize from complex data patterns. They have achieved remarkable success in various tasks, including image recognition, speech recognition, and natural language processing. + Here are the key fundamentals of deep learning for training large language models: + Neural Networks: At the heart of deep learning are artificial neural networks, which are inspired by the structure and functioning of biological neurons in the human brain. These networks consist of interconnected layers of artificial neurons called nodes or units. The nodes receive input, perform computations, and pass the results to the next layer. + Representation Learning: Deep learning models excel at learning meaningful representations of data. In the context of language, the models can automatically learn hierarchical representations of text, capturing complex relationships and semantic structures. + Feedforward and Backpropagation: Deep learning models typically use feedforward neural networks, where information flows from the input layer through intermediate hidden layers to the output layer. The network makes predictions based on the input data, and the prediction error is then backpropagated through the network. Backpropagation calculates gradients that indicate how each parameter in the network should be adjusted to minimize the error. + Activation Functions: Activation functions introduce non-linearities to neural networks, enabling them to learn complex patterns. Common activation functions include the rectified linear unit (ReLU), sigmoid, and hyperbolic tangent (tanh). These functions determine the output of each neuron based on its weighted inputs. + Loss Functions: During training, a loss function is used to measure the discrepancy between the predicted output of the neural network and the desired output. In language modeling tasks, common loss functions include cross-entropy loss, which quantifies the difference in probability distributions. + Optimization Algorithms: Optimization algorithms determine how the network's parameters are updated based on the calculated gradients during backpropagation. Stochastic Gradient Descent (SGD) is a widely used algorithm that iteratively updates the parameters in the direction that minimizes the loss. Variants of SGD, such as Adam or RMSprop, adaptively adjust the learning rate to accelerate convergence. 
+ Regularization Techniques: Deep learning models are prone to overfitting, where they memorize the training data but fail to generalize well to unseen examples. Regularization techniques such as dropout and weight decay are commonly used to prevent overfitting and improve generalization by adding constraints to the model's parameters. + Training on Large-Scale Datasets: Deep learning models, including large language models, require substantial amounts of labeled training data to learn effectively. Large-scale datasets are crucial to expose the model to diverse language patterns and ensure it captures a broad understanding of language. + Parallel Computing: Training large language models is computationally demanding. To accelerate the training process, parallel computing techniques, such as using multiple GPUs or distributed computing systems, are employed. These techniques allow for efficient processing of large datasets and speeding up the training iterations. + Transfer Learning and Fine-tuning: Transfer learning is a technique where a pre-trained model, trained on a large-scale dataset, is used as a starting point for a new task or dataset. Fine-tuning involves adjusting the pre-trained model's parameters on the new dataset to adapt it to the specific task at hand. This approach significantly reduces the training time and data requirements for new models. + The training process of a large language model typically involves the following steps: + Data Collection: A diverse and comprehensive dataset is collected, which typically consists of a vast range of text from sources like books, websites, articles, and other textual resources. The quality and variety of the dataset are crucial to ensure the model learns a broad understanding of language. + Preprocessing: The collected text data is preprocessed to clean and normalize it. This step involves removing irrelevant characters or symbols, converting the text to a consistent format, and organizing it into smaller units such as sentences or paragraphs. + Tokenization: The preprocessed text is divided into individual tokens, which can be as small as words or even subword units. Tokenization helps in representing and processing the text efficiently during training. + Architecture Design: The model architecture, often based on the transformer architecture, is defined. Transformers are neural network models that excel in capturing long-range dependencies in sequential data, making them well-suited for language modeling tasks. + Model Initialization: The model parameters are randomly initialized to start the training process. These parameters will be adjusted iteratively during training to optimize the model's performance. + Training Loop: The model is trained using a large-scale computational infrastructure. The training loop typically involves several iterations over the dataset, known as epochs. During each epoch, the model processes the input data, generates predictions, and compares them with the expected output. The discrepancy between the predicted and expected output is used to compute a loss, which quantifies the model's performance. + Backpropagation and Optimization: Backpropagation is employed to calculate the gradients of the model's parameters with respect to the loss. These gradients indicate the direction and magnitude of the parameter updates needed to minimize the loss. 
Optimization algorithms, such as stochastic gradient descent (SGD) or its variants, are then used to update the model's parameters based on the computed gradients. + Iterative Refinement: Steps 6 and 7 are repeated for multiple epochs, gradually refining the model's performance. The model's ability to generate coherent and contextually relevant responses improves as it learns from the dataset. + Evaluation: The trained model is evaluated on a separate dataset to assess its performance and identify areas for improvement. Various metrics, such as perplexity or accuracy, can be used to evaluate the model's language generation capabilities. + Fine-tuning and Iteration: Based on the evaluation results, the model may undergo fine-tuning or further iterations of training to enhance its performance. This process helps in addressing specific limitations or biases and aligning the model's output more closely with desired expectations. + It's important to note that training a large language model from scratch is a computationally intensive process that requires substantial computational resources, including powerful hardware like GPUs or specialized hardware accelerators, and large-scale distributed systems to handle the massive amount of data and model parameters involved. + Here are ten highly recommended books that can help you learn deep learning: + "Deep Learning" by Ian Goodfellow, Yoshua Bengio, and Aaron Courville: + This comprehensive book covers the fundamental concepts of deep learning, including neural networks, optimization algorithms, and regularization techniques. It also explores advanced topics like generative models and deep reinforcement learning. + "Deep Learning with Python" by François Chollet: + Written by the creator of the Keras deep learning library, this book provides a practical introduction to deep learning with Python. It covers essential concepts, tools, and techniques, and includes hands-on examples and case studies. + "Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow" by Aurélien Géron: + This book offers a hands-on approach to learning machine learning and deep learning using popular Python libraries such as Scikit-Learn, Keras, and TensorFlow. It covers various algorithms and provides practical examples and exercises. + "Deep Learning for Computer Vision" by Rajalingappaa Shanmugamani: + Focusing on deep learning techniques for computer vision tasks, this book explores topics such as convolutional neural networks (CNNs), image classification, object detection, and image generation. It includes code examples using Python and popular deep learning frameworks. + "Deep Learning: A Practitioner's Approach" by Josh Patterson and Adam Gibson: + This book offers a practical guide to implementing deep learning solutions using the Deeplearning4j library. It covers key concepts, architectures, and techniques, and includes code examples and case studies. + "Grokking Deep Learning" by Andrew Trask: + Geared towards beginners, this book provides an intuitive and accessible introduction to deep learning concepts. It covers neural networks, backpropagation, gradient descent, and other fundamental topics with clear explanations and visualizations. + "Deep Learning for Natural Language Processing" by Palash Goyal, Sumit Pandey, and Karan Jain: + Focusing on deep learning techniques for natural language processing (NLP), this book explores topics like word embeddings, recurrent neural networks (RNNs), and sequence-to-sequence models. 
It includes code examples using Python and popular NLP libraries. + "Deep Reinforcement Learning" by Pieter Abbeel and John Schulman: + This book provides an in-depth exploration of deep reinforcement learning, a subfield that combines deep learning with reinforcement learning. It covers topics like Q-learning, policy gradients, and deep Q-networks (DQNs) and provides practical examples. + "Deep Learning for Time Series Forecasting" by N.D. Lewis: + Focusing on deep learning techniques for time series data, this book covers topics such as recurrent neural networks (RNNs), long short-term memory (LSTM) networks, and attention models. It includes code examples using Python and popular deep learning frameworks. + "Interpretable Deep Learning" by Christoph Molnar: + This book delves into the challenges and techniques for interpreting and understanding deep learning models. It covers model visualization, feature importance, and other methods for explaining and interpreting deep learning predictions. + These books cover a range of deep learning topics and provide valuable insights and practical guidance for learning and applying deep learning techniques. Choose the ones that align with your interests and learning style to enhance your understanding of deep learning. + Here are 10 popular GitHub projects that can be useful for building large language models (LLMs) or working with natural language processing (NLP) tasks: + TensorFlow: An open-source deep learning framework that provides tools and resources for building and training LLMs. It offers extensive support for various neural network architectures and has a large community. + PyTorch: Another popular deep learning framework that provides a dynamic computational graph and a wide range of tools for building LLMs. It is known for its user-friendly interface and flexibility. + Hugging Face Transformers: A library that provides pre-trained models and a high-level API for natural language understanding (NLU) tasks, including LLMs. It supports popular models like GPT, BERT, and RoBERTa. + Fairseq: A library developed by Facebook AI Research that focuses on sequence modeling tasks, including LLMs. It offers pre-trained models and tools for training and evaluating models using sequence-to-sequence architectures. + AllenNLP: A powerful NLP research library that simplifies the process of building and evaluating deep learning models. It offers pre-built components for common NLP tasks and supports LLMs with various architectures. + OpenAI GPT-3: Although not available on GitHub, OpenAI's GPT-3 language model is widely recognized and can be accessed via the OpenAI API. It offers state-of-the-art language generation capabilities and can be used for various NLP tasks. + BERT: A pre-trained language model developed by Google Research that has achieved exceptional results on various NLP benchmarks. The official implementation is available on GitHub and can be fine-tuned for specific tasks. + spaCy: A popular Python library for NLP tasks that provides efficient and scalable tools for tokenization, named entity recognition, part-of-speech tagging, and more. It integrates well with deep learning frameworks. + FastText: A library developed by Facebook Research that provides efficient tools for text classification and word representation learning. It offers pre-trained word embeddings and supports training LLMs for classification tasks. + NLTK (Natural Language Toolkit): A comprehensive library for NLP tasks in Python. 
It provides various modules for tokenization, stemming, tagging, parsing, and more. Although it doesn't focus explicitly on LLMs, it is widely used for preprocessing text data in NLP pipelines. + These projects offer a range of resources, pre-trained models, and tools that can assist you in building and working with large language models. Make sure to review the documentation and examples provided by each project to understand their capabilities and how they can be integrated into your workflow. + Here are some popular backend libraries that are commonly used for deep learning: + TensorFlow: Developed by Google's Brain Team, TensorFlow is one of the most widely used deep learning frameworks. It provides a flexible and comprehensive ecosystem for building and deploying machine learning models. TensorFlow offers high-level APIs for easy model construction, as well as lower-level APIs for fine-grained control. It supports distributed computing and has extensive community support. + PyTorch: Developed by Facebook's AI Research lab, PyTorch is known for its simplicity and dynamic computational graph. It allows for intuitive model construction and debugging. PyTorch is widely used in both research and industry due to its flexibility, support for dynamic networks, and strong GPU acceleration capabilities. + Keras: Initially developed as a user-friendly deep learning library, Keras is now integrated as the official high-level API in TensorFlow. It provides a user-friendly and modular interface for building neural networks. Keras abstracts away many complexities and allows users to build models with just a few lines of code. It supports multiple backends, including TensorFlow and Theano. + Theano: Although its development has been discontinued, Theano was one of the first widely-used deep learning libraries. It allows for efficient mathematical operations on multi-dimensional arrays and supports GPU acceleration. Theano was influential in shaping the deep learning landscape and served as a precursor to subsequent frameworks. + Caffe: Developed by the Berkeley Vision and Learning Center (BVLC), Caffe is a popular deep learning framework known for its efficiency and simplicity. It is particularly suitable for convolutional neural networks (CNNs) and image-related tasks. Caffe has a clean and expressive architecture description language that makes it easy to define and train deep models. + MXNet: MXNet is an open-source deep learning framework developed by Apache. It offers a flexible and efficient interface for building and deploying neural networks. MXNet provides a hybrid frontend that allows users to seamlessly switch between symbolic and imperative programming. It is known for its scalability and supports multiple programming languages. + Chainer: Chainer is a flexible deep learning framework that focuses on dynamic neural networks. It allows for intuitive model construction using imperative programming, making it easy to define complex architectures and manipulate data within the network. Chainer is known for its "define-by-run" approach, which facilitates dynamic computations. + Microsoft Cognitive Toolkit (CNTK): CNTK is a deep learning framework developed by Microsoft. It provides a highly efficient and scalable implementation of deep neural networks. CNTK supports both declarative and imperative programming models, making it suitable for both research and production-level deployments. 
+ Deeplearning4j: Deeplearning4j is an open-source deep learning library that focuses on scalability and performance. It is designed to integrate with the Java ecosystem and supports distributed computing. Deeplearning4j provides tools for building various types of neural networks and offers integration with other popular libraries like Hadoop and Spark. + PaddlePaddle: PaddlePaddle (PArallel Distributed Deep LEarning) is a deep learning framework developed by Baidu. It emphasizes scalability and supports large-scale distributed training. PaddlePaddle provides a rich set of built-in models and algorithms, making it accessible to both beginners and advanced users. + Each of these backend libraries offers unique features, performance characteristics, and levels of abstraction. The choice of a backend library depends on factors such as your programming language preferences, the complexity of your models, the availability of community support, and the specific requirements of your deep learning project. + Here's an example code snippet that demonstrates how to create a GPT-Neox20B model using the Hugging Face Transformers library and start fine-tuning it with sample data from the '/tmp/wikitext' directory: + + import torch + from transformers import GPTNeoForCausalLM, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments + + # Load the GPT-Neo model and tokenizer + model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-2.7B") + tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B") + + # Set the path to the training data + data_path = "/tmp/wikitext" + + # Define the dataset and data collator + dataset = TextDataset(tokenizer=tokenizer, file_path=data_path, block_size=128) + data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) + + # Define the training arguments + training_args = TrainingArguments( + output_dir="./output_dir", # Directory to save the fine-tuned model and logs + overwrite_output_dir=True, + num_train_epochs=3, # Number of training epochs + per_device_train_batch_size=8, + save_steps=500, # Number of updates steps before saving the model + save_total_limit=2, # Limit the total number of saved models + prediction_loss_only=True, + ) + + # Initialize the Trainer + trainer = Trainer( + model=model, + args=training_args, + data_collator=data_collator, + train_dataset=dataset, + ) + + # Start fine-tuning + trainer.train() + + # Save the fine-tuned model + trainer.save_model("./fine-tuned-model") + + To deploy the GPT-Neo model on an HTTP server, you can use a framework like Flask or FastAPI. 
Here's a basic example using Flask: + + Install Flask: + + pip install flask + + Create a Python file, let's call it app.py, and import the necessary libraries: + + from flask import Flask, request, jsonify + from transformers import GPTNeoForCausalLM, GPT2Tokenizer + + # Initialize Flask app + app = Flask(__name__) + + # Load the pretrained GPT-Neo model and tokenizer + model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-2.7B") + tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B") + Define an endpoint for model inference: + + python + Copy code + @app.route("/generate", methods=["POST"]) + def generate_text(): + data = request.get_json() + input_text = data["input_text"] + + # Tokenize the input text + input_ids = tokenizer.encode(input_text, return_tensors="pt") + + # Generate text with the model + output = model.generate(input_ids, max_length=100, num_return_sequences=1) + generated_text = tokenizer.decode(output[0], skip_special_tokens=True) + + return jsonify({"generated_text": generated_text}) + Run the Flask app: + + python + Copy code + if __name__ == "__main__": + app.run(host="0.0.0.0", port=5000) + The code that includes the text generation sampling functions and provides a commented example: + + import requests + import numpy as np + + class TextGeneratorAPI: + def __init__(self, server_url): + self.server_url = server_url + + def generate_text(self, input_text, sampling_algorithm="greedy", temperature=0.7): + url = f"{self.server_url}/generate" + payload = { + "input_text": input_text, + "sampling_algorithm": sampling_algorithm, + "temperature": temperature + } + response = requests.post(url, json=payload) + generated_text = response.json()["generated_text"] + return generated_text + + def greedy_sampling(self, logits): + return np.argmax(logits) + + def random_sampling(self, logits): + probabilities = np.exp(logits / temperature) + probabilities = probabilities / np.sum(probabilities) + return np.random.choice(len(logits), p=probabilities) + + def top_k_sampling(self, logits, k=10): + indices = np.argsort(logits)[-k:] + probabilities = np.exp(logits[indices] / temperature) + probabilities = probabilities / np.sum(probabilities) + return np.random.choice(indices, p=probabilities) + + def top_p_sampling(self, logits, p=0.9): + sorted_logits = np.sort(logits)[::-1] + cumulative_probs = np.cumsum(np.exp(sorted_logits) / temperature) + indices = np.arange(len(sorted_logits)) + selected_indices = indices[cumulative_probs <= p] + probabilities = np.exp(logits[selected_indices] / temperature) + probabilities = probabilities / np.sum(probabilities) + return np.random.choice(selected_indices, p=probabilities) + In this updated code, the TextGeneratorAPI class includes the additional sampling functions: greedy_sampling, random_sampling, top_k_sampling, and top_p_sampling. These functions take logits (output of the model) as input and return the index of the selected token based on the respective sampling algorithm. + The greedy_sampling function selects the token with the highest probability (argmax) as the next token. The random_sampling function applies a temperature scaling to the logits and then samples from the resulting probability distribution. The top_k_sampling function selects from the top-k tokens with the highest probabilities. The top_p_sampling function selects from the tokens with cumulative probabilities below a certain threshold (top-p). + You can now use the updated TextGeneratorAPI class with the sampling functions. 
Here's an example: + + api = TextGeneratorAPI(server_url="http://localhost:5000") + + input_text = "Once upon a time" + + # Generate text using different sampling algorithms and temperatures + greedy_text = api.generate_text(input_text, sampling_algorithm="greedy") + random_text = api.generate_text(input_text, sampling_algorithm="random") + top_k_text = api.generate_text(input_text, sampling_algorithm="top_k", temperature=0.8) + top_p_text = api.generate_text(input_text, sampling_algorithm="top_p", temperature=0.9) + + print("Greedy Sampling:", greedy_text) + print("Random Sampling:", random_text) + print("Top-k Sampling:", top_k_text) + print("Top-p Sampling:", top_p_text) + Make sure to adjust the server_url with the appropriate URL of your HTTP server, and ensure that the server is running and accessible before making requests through the API. + """ diff --git a/benchmarks/inference/deepspeedometer/tests/README.md b/benchmarks/inference/deepspeedometer/tests/README.md new file mode 100644 index 000000000..15a5f49f9 --- /dev/null +++ b/benchmarks/inference/deepspeedometer/tests/README.md @@ -0,0 +1,3 @@ +To run the unit tests: + +`python3 -m pytest .` \ No newline at end of file diff --git a/benchmarks/inference/deepspeedometer/tests/__init__.py b/benchmarks/inference/deepspeedometer/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/inference/deepspeedometer/tests/conftest.py b/benchmarks/inference/deepspeedometer/tests/conftest.py new file mode 100644 index 000000000..e2f779c44 --- /dev/null +++ b/benchmarks/inference/deepspeedometer/tests/conftest.py @@ -0,0 +1,95 @@ +import pytest + + +@pytest.fixture(scope="function", params=["facebook/opt-125m"]) +def model(request): + return request.param + + +@pytest.fixture(scope="function", params=["dummy"]) +def api(request): + return request.param + + +@pytest.fixture(scope="function", params=[""]) +def result_dir(request, tmpdir): + if request.param: + return str(request.param) + return str(tmpdir) + + +@pytest.fixture(scope="function", params=[5]) +def num_requests_per_client(request): + return str(request.param) + + +@pytest.fixture(scope="function", params=[16]) +def min_requests(request): + return str(request.param) + + +@pytest.fixture(scope="function", params=[(1, 2)]) +def num_clients(request): + if isinstance(request.param, tuple) or isinstance(request.param, list): + return [str(num) for num in request.param] + else: + return [str(request.param)] + + +@pytest.fixture(scope="function", params=[0]) +def num_config_files(request): + return request.param + + +@pytest.fixture(scope="function") +def config_files(num_config_files, tmp_path): + config_files = [] + for i in range(num_config_files): + config_file = tmp_path / f"config_{i}.yaml" + config_file.touch() + config_files.append(str(config_file)) + return config_files + + +@pytest.fixture(scope="function", params=[""]) +def prompt_length_var(request): + return str(request.param) + + +@pytest.fixture(scope="function", params=[""]) +def max_new_tokens_var(request): + return str(request.param) + + +@pytest.fixture(scope="function") +def benchmark_args( + model, + api, + result_dir, + num_requests_per_client, + min_requests, + num_clients, + config_files, + prompt_length_var, + max_new_tokens_var, +): + args = [] + if model: + args.extend(["--model", model]) + if api: + args.extend(["--api", api]) + if result_dir: + args.extend(["--result_dir", result_dir]) + if num_requests_per_client: + args.extend(["--num_requests_per_client", 
num_requests_per_client]) + if min_requests: + args.extend(["--min_requests", min_requests]) + if num_clients: + args.extend(["--num_clients"] + num_clients) + if config_files: + args.extend(["--config_file"] + config_files) + if prompt_length_var: + args.extend(["--prompt_length_var", prompt_length_var]) + if max_new_tokens_var: + args.extend(["--max_new_tokens_var", max_new_tokens_var]) + return args diff --git a/benchmarks/inference/deepspeedometer/tests/test_benchmark.py b/benchmarks/inference/deepspeedometer/tests/test_benchmark.py new file mode 100644 index 000000000..2b067d39e --- /dev/null +++ b/benchmarks/inference/deepspeedometer/tests/test_benchmark.py @@ -0,0 +1,17 @@ +import pytest + +from deepspeedometer import parse_args_to_configs, BenchmarkRunner + + +def test_benchmark_runner(benchmark_args, num_clients): + benchmark_config, client_config = parse_args_to_configs(benchmark_args) + benchmark_runner = BenchmarkRunner(benchmark_config, client_config) + benchmark_runner.run() + + expected_results = sum(1 for _ in benchmark_runner._benchmark_settings()) * len( + num_clients + ) + actual_results = len(list(benchmark_runner._get_output_dir().glob("*.json"))) + assert ( + expected_results == actual_results + ), f"Number of result files ({actual_results}) does not match expected number ({expected_results})." diff --git a/benchmarks/inference/deepspeedometer/tests/test_config.py b/benchmarks/inference/deepspeedometer/tests/test_config.py new file mode 100644 index 000000000..d20e0981a --- /dev/null +++ b/benchmarks/inference/deepspeedometer/tests/test_config.py @@ -0,0 +1,32 @@ +import pytest + +import yaml + +import pydantic + +from deepspeedometer import BenchmarkRunner, parse_args_to_configs + + +def test_config(benchmark_args): + benchmark_config, client_config = parse_args_to_configs(benchmark_args) + + +@pytest.mark.parametrize("model", [""]) +def test_config_required_fail(benchmark_args): + with pytest.raises(pydantic.ValidationError): + benchmark_config, client_config = parse_args_to_configs(benchmark_args) + + +@pytest.mark.parametrize("num_config_files", [1]) +def test_config_file(benchmark_args, config_files, num_clients): + # Create a config that would generate 6 benchmark settings + config = {"max_prompt_length": [500, 1300, 2600], "num_clients": [1, 2]} + with open(config_files[0], "w") as f: + yaml.dump(config, f) + + benchmark_config, client_config = parse_args_to_configs(benchmark_args) + benchmark_runner = BenchmarkRunner(benchmark_config, client_config) + benchmark_settings = sum(1 for _ in benchmark_runner._benchmark_settings()) * len( + num_clients + ) + assert benchmark_settings == 6 diff --git a/benchmarks/inference/deepspeedometer/tests/test_early_stop.py b/benchmarks/inference/deepspeedometer/tests/test_early_stop.py new file mode 100644 index 000000000..2a63ba206 --- /dev/null +++ b/benchmarks/inference/deepspeedometer/tests/test_early_stop.py @@ -0,0 +1,23 @@ +import pytest + +from deepspeedometer import parse_args_to_configs, BenchmarkRunner + + +@pytest.mark.parametrize("num_clients", [(1, 2, 4)], indirect=True) +def test_early_stop(benchmark_args): + benchmark_args += [ + "--early_stop_latency", + "1", + "--dummy_client_latency_time", + "2.0", + ] + print(benchmark_args) + benchmark_config, client_config = parse_args_to_configs(benchmark_args) + benchmark_runner = BenchmarkRunner(benchmark_config, client_config) + benchmark_runner.run() + + expected_results = 1 + actual_results = len(list(benchmark_runner._get_output_dir().glob("*.json"))) + assert ( + 
expected_results == actual_results + ), f"Number of result files ({actual_results}) does not match expected number ({expected_results})." diff --git a/benchmarks/inference/deepspeedometer/tests/test_prompt.py b/benchmarks/inference/deepspeedometer/tests/test_prompt.py new file mode 100644 index 000000000..997a82dd5 --- /dev/null +++ b/benchmarks/inference/deepspeedometer/tests/test_prompt.py @@ -0,0 +1,15 @@ +import pytest + +from deepspeedometer import BenchmarkRunner, parse_args_to_configs + + +@pytest.mark.parametrize("prompt_length_var, max_new_tokens_var", [(0, 0)]) +def test_prompt_length(benchmark_args): + benchmark_config, client_config = parse_args_to_configs(benchmark_args) + benchmark_runner = BenchmarkRunner(benchmark_config, client_config) + num_clients, prompt_config = next(benchmark_runner._benchmark_settings()) + + for prompt in benchmark_runner.prompt_generator(prompt_config, num_prompts=10): + prompt_length = benchmark_runner.prompt_generator.count_tokens(prompt.text) + # Using pytest.approx here because often we will have 1-off errors due to tokenization special tokens + assert prompt_length == pytest.approx(benchmark_runner.config.prompt_length, 1) From b04fedd32665c99c906baa4e1ccd217d58592fd6 Mon Sep 17 00:00:00 2001 From: Louie Tsai Date: Wed, 14 Aug 2024 13:34:42 -0700 Subject: [PATCH 44/58] Enable cpu/xpu support for the benchmarking suite (#905) * enable cpu/xpu support for the benchmarking suite * fixes according to review feedback --- benchmarks/communication/README.md | 10 +++++++++- benchmarks/communication/all_gather.py | 14 ++++++++++++-- benchmarks/communication/all_reduce.py | 14 ++++++++++++-- benchmarks/communication/all_to_all.py | 14 ++++++++++++-- benchmarks/communication/broadcast.py | 14 ++++++++++++-- benchmarks/communication/constants.py | 1 + benchmarks/communication/pt2pt.py | 14 ++++++++++++-- benchmarks/communication/utils.py | 6 ++++++ 8 files changed, 76 insertions(+), 11 deletions(-) diff --git a/benchmarks/communication/README.md b/benchmarks/communication/README.md index 535b5d308..15ce1995b 100644 --- a/benchmarks/communication/README.md +++ b/benchmarks/communication/README.md @@ -1,6 +1,6 @@ # The DeepSpeed Communication Benchmarking Suite -The intent of these benchmarks is to measure communication latency/bw of deepspeed and/or pytorch distributed communication operations at the Python layer. These benchmarks are complementary to C-level comms benchmarks like [OSU Micro-Benchmarks](https://mvapich.cse.ohio-state.edu/benchmarks/) and [NCCL Tests](https://github.com/NVIDIA/nccl-tests) in that users can: +The intent of these benchmarks is to measure communication latency/bw of deepspeed and/or pytorch distributed communication operations at the Python layer. These benchmarks are complementary to C-level comms benchmarks like [OSU Micro-Benchmarks](https://mvapich.cse.ohio-state.edu/benchmarks/) , [NCCL Tests](https://github.com/NVIDIA/nccl-tests) and [oneCCL Benchmark](https://oneapi-src.github.io/oneCCL/benchmark.html) in that users can: - Easily debug which layer of the communication software stack hangs or performance degradations originate from. - Measure the expected communication performance of either DeepSpeed comms or pure PyTorch distributed @@ -77,6 +77,14 @@ Finally, users can choose specific communication operations to run in `run_all.p deepspeed run_all.py --scan --all-reduce --all-to-all --broadcast
+## CPU Support
+These benchmarks can also run on other devices, such as Intel CPUs, via oneCCL.
+Users just need to append the argument "--device cpu" to any of the Python scripts to run them on an Intel CPU.
+For example, run with a single large message size on an Intel CPU:
+<pre>
+deepspeed all_reduce.py --device cpu
+</pre>
+ # Adding Communication Benchmarks diff --git a/benchmarks/communication/all_gather.py b/benchmarks/communication/all_gather.py index 8aa33581d..76c4f3b1e 100644 --- a/benchmarks/communication/all_gather.py +++ b/benchmarks/communication/all_gather.py @@ -17,6 +17,9 @@ # Run all_gather and print metrics def timed_all_gather(input, output, start_event, end_event, args): + if args.device == "cpu": + print_rank_0(f"No Event support on CPU to measure time for now") + return if args.dist == 'torch': import torch.distributed as dist @@ -64,8 +67,15 @@ def run_all_gather(local_rank, args): global_rank = dist.get_rank() world_size = dist.get_world_size() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + if args.device == "xpu": + start_event = torch.xpu.Event(enable_timing=True) + end_event = torch.xpu.Event(enable_timing=True) + elif args.device == "cpu": + start_event = torch.cpu.Event() + end_event = torch.cpu.Event() + else: + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) if args.scan: # Create list of message sizes diff --git a/benchmarks/communication/all_reduce.py b/benchmarks/communication/all_reduce.py index b9decfd98..41c3116ee 100644 --- a/benchmarks/communication/all_reduce.py +++ b/benchmarks/communication/all_reduce.py @@ -15,6 +15,9 @@ def timed_all_reduce(input, start_event, end_event, args): + if args.device == "cpu": + print_rank_0(f"No Event support on CPU to measure time for now") + return if args.dist == 'torch': import torch.distributed as dist elif args.dist == 'deepspeed': @@ -60,8 +63,15 @@ def run_all_reduce(local_rank, args): world_size = dist.get_world_size() global_rank = dist.get_rank() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + if args.device == "xpu": + start_event = torch.xpu.Event(enable_timing=True) + end_event = torch.xpu.Event(enable_timing=True) + elif args.device == "cpu": + start_event = torch.cpu.Event() + end_event = torch.cpu.Event() + else: + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) if args.scan: M_LIST = [] diff --git a/benchmarks/communication/all_to_all.py b/benchmarks/communication/all_to_all.py index 7eccfa824..dc10b9ec9 100644 --- a/benchmarks/communication/all_to_all.py +++ b/benchmarks/communication/all_to_all.py @@ -15,6 +15,9 @@ def timed_all_to_all(input, output, start_event, end_event, args): + if args.device == "cpu": + print_rank_0(f"No Event support on CPU to measure time for now") + return if args.dist == 'torch': import torch.distributed as dist elif args.dist == 'deepspeed': @@ -59,8 +62,15 @@ def run_all_to_all(local_rank, args): # Prepare benchmark header print_header(args, 'all_to_all') - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + if args.device == "xpu": + start_event = torch.xpu.Event(enable_timing=True) + end_event = torch.xpu.Event(enable_timing=True) + elif args.device == "cpu": + start_event = torch.cpu.Event() + end_event = torch.cpu.Event() + else: + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) if args.scan: M_LIST = [] diff --git a/benchmarks/communication/broadcast.py b/benchmarks/communication/broadcast.py index 860c9555b..d05303be1 100644 --- a/benchmarks/communication/broadcast.py +++ b/benchmarks/communication/broadcast.py @@ -15,6 +15,9 @@ def timed_broadcast(input, 
start_event, end_event, args): + if args.device == "cpu": + print_rank_0(f"No Event support on CPU to measure time for now") + return if args.dist == 'torch': import torch.distributed as dist elif args.dist == 'deepspeed': @@ -60,8 +63,15 @@ def run_broadcast(local_rank, args): world_size = dist.get_world_size() global_rank = dist.get_rank() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + if args.device == "xpu": + start_event = torch.xpu.Event(enable_timing=True) + end_event = torch.xpu.Event(enable_timing=True) + elif args.device == "cpu": + start_event = torch.cpu.Event() + end_event = torch.cpu.Event() + else: + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) if args.scan: M_LIST = [] diff --git a/benchmarks/communication/constants.py b/benchmarks/communication/constants.py index ae9fa261b..60df98ed2 100644 --- a/benchmarks/communication/constants.py +++ b/benchmarks/communication/constants.py @@ -12,4 +12,5 @@ DEFAULT_UNIT = 'Gbps' DEFAULT_DIST = 'deepspeed' DEFAULT_MAXSIZE = 24 +DEFAULT_DEVICE = 'cuda' TORCH_DISTRIBUTED_DEFAULT_PORT = 29500 diff --git a/benchmarks/communication/pt2pt.py b/benchmarks/communication/pt2pt.py index 57eab9a66..ec3252eb8 100644 --- a/benchmarks/communication/pt2pt.py +++ b/benchmarks/communication/pt2pt.py @@ -15,6 +15,9 @@ def timed_pt2pt(input, start_event, end_event, args): + if args.device == "cpu": + print_rank_0(f"No Event support on CPU to measure time for now") + return if args.dist == 'torch': import torch.distributed as dist elif args.dist == 'deepspeed': @@ -78,8 +81,15 @@ def run_pt2pt(local_rank, args): global_rank = dist.get_rank() world_size = dist.get_world_size() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + if args.device == "xpu": + start_event = torch.xpu.Event(enable_timing=True) + end_event = torch.xpu.Event(enable_timing=True) + elif args.device == "cpu": + start_event = torch.cpu.Event() + end_event = torch.cpu.Event() + else: + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) if args.scan: # Create list of message sizes diff --git a/benchmarks/communication/utils.py b/benchmarks/communication/utils.py index a74d24e28..6f6dd83a1 100644 --- a/benchmarks/communication/utils.py +++ b/benchmarks/communication/utils.py @@ -108,6 +108,11 @@ def get_bw(comm_op, size, duration, args): n = dist.get_world_size() tput = 0 busbw = 0 + + if duration == 0: + print_rank_0("Error. 
Duration is 0.") + return tput, busbw + if comm_op == "all_to_all": tput = (size / duration) busbw = (size / duration) * ((n - 1) / n) @@ -235,4 +240,5 @@ def benchmark_parser(): default=.3, help='Proportion of max available GPU memory to use for single-size evals') parser.add_argument("--debug", action="store_true", help='Enables all_to_all debug prints') + parser.add_argument("--device", type=str, default=DEFAULT_DEVICE, help='target device') return parser From 8d91a5ab02c0548b98648f760b4d8deeca41cae4 Mon Sep 17 00:00:00 2001 From: keshavkowshik Date: Fri, 16 Aug 2024 09:24:17 -0700 Subject: [PATCH 45/58] Update README.md (#916) --- benchmarks/inference/deepspeedometer/README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/benchmarks/inference/deepspeedometer/README.md b/benchmarks/inference/deepspeedometer/README.md index 7c165c57d..b327916c5 100644 --- a/benchmarks/inference/deepspeedometer/README.md +++ b/benchmarks/inference/deepspeedometer/README.md @@ -1,5 +1,8 @@ # DeepSpeedometer +NOTE: This is an experimental tool and is not currently being supported since it's not fully functional. Please use the MII benchmark which can be found here: +https://github.com/microsoft/DeepSpeedExamples/tree/master/benchmarks/inference/mii + This benchmark is designed to measure performance of LLM serving solutions. Using a different number of parallel clients sending requests to an inference server, we gather data to plot throughput-latency curves and find the saturation point of an inference server that demonstrates the maximum performance. ## Installation @@ -82,4 +85,4 @@ The DeepSpeedometer benchmark was designed to allow easily adding support for ne 3. Define a `*ClientConfig` class that inherits from the `BaseConfigModel` class. Place any configuration options (i.e., user-passed command line arguments) necessary for your defined `*Client` class in here. 4. Import the newly added `*Client` and `*ClientConfig` into `clients/__init__.py` and add them to the `client_config_classes` and `client_classes` dictionaries. -For the simplest example of adding a new client, take a look at the `clients/dummy_client.py` file where we have defined a client that does not stand up a server and only returns a sample of the input prompt after a short sleep cycle. We use this as a light-weight class for unit testing. \ No newline at end of file +For the simplest example of adding a new client, take a look at the `clients/dummy_client.py` file where we have defined a client that does not stand up a server and only returns a sample of the input prompt after a short sleep cycle. We use this as a light-weight class for unit testing. 
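To make the extension steps above concrete, the following is a minimal sketch of a new client; the `EchoClient` name and its single config option are hypothetical and only illustrate the required interface, which mirrors `clients/dummy_client.py` and the `openai_client.py` added in the next patch:

```python
# Hypothetical example: clients/echo_client.py (names are illustrative, not part of the repo)
from typing import Any, Dict

from pydantic import Field

from .base import BaseClient
from ..config import BaseConfigModel
from ..prompt import Prompt


class EchoClientConfig(BaseConfigModel):
    # Step 3: any option declared here becomes a command line argument.
    prefix: str = Field("echo: ", description="String prepended to every returned prompt")


class EchoClient(BaseClient):
    # Step 2: implement the BaseClient interface.
    def __init__(self, config: EchoClientConfig):
        super().__init__(config)

    def start_service(self) -> None:
        # Nothing to launch for a local, serverless client.
        pass

    def stop_service(self) -> None:
        pass

    def prepare_request(self, prompt: Prompt) -> Dict[str, Any]:
        # Package everything send_request needs into a plain dict.
        return {"text": prompt.text, "max_new_tokens": prompt.max_new_tokens}

    def send_request(self, request_kwargs: Dict[str, Any]) -> Any:
        # A real client would call its inference server here.
        return request_kwargs["text"]

    def process_response(self, raw_response: Any) -> str:
        return self.config.prefix + raw_response


# Step 4 (registration) would then add, in clients/__init__.py:
#   client_config_classes["echo"] = EchoClientConfig
#   client_classes["echo"] = EchoClient
```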
From 95639048ff36edc528370af7797e396cdeec9347 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Thu, 22 Aug 2024 00:33:31 +0800 Subject: [PATCH 46/58] Add openai client to deepspeedometer (#913) * add openai client * adding openai api support for mii benchmark * enable openai_api (non-stream) mode * enable stream mode for openai-api --------- Co-authored-by: Olatunji Ruwase --- .../src/deepspeedometer/clients/__init__.py | 3 + .../deepspeedometer/clients/openai_client.py | 57 ++++++++++++++ benchmarks/inference/mii/src/client.py | 76 ++++++++++++++++++- .../mii/src/plot_effective_throughput.py | 9 ++- .../inference/mii/src/postprocess_results.py | 11 ++- benchmarks/inference/mii/src/server.py | 8 ++ benchmarks/inference/mii/src/utils.py | 14 +++- 7 files changed, 169 insertions(+), 9 deletions(-) create mode 100644 benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/openai_client.py diff --git a/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/__init__.py b/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/__init__.py index a52c3618b..ac1891112 100644 --- a/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/__init__.py +++ b/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/__init__.py @@ -4,16 +4,19 @@ from .dummy_client import DummyClientConfig, DummyClient from .fastgen_client import FastGenClientConfig, FastGenClient from .vllm_client import vLLMClientConfig, vLLMClient +from .openai_client import openaiClientConfig, openaiClient client_config_classes = { "dummy": DummyClientConfig, "azure_ml": AzureMLClientConfig, "fastgen": FastGenClientConfig, "vllm": vLLMClientConfig, + "openai": openaiClientConfig } client_classes = { "dummy": DummyClient, "azure_ml": AzureMLClient, "fastgen": FastGenClient, "vllm": vLLMClient, + "openai": openaiClient, } diff --git a/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/openai_client.py b/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/openai_client.py new file mode 100644 index 000000000..76eadfc5c --- /dev/null +++ b/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/openai_client.py @@ -0,0 +1,57 @@ +import os +import json +import requests +import subprocess +import time +from typing import Any, Dict + +from loguru import logger +from pydantic import Field + +from .base import BaseClient +from ..config import BaseConfigModel +from ..prompt import Prompt + + +# client to test any openai API +class openaiClientConfig(BaseConfigModel): + model: str = Field(..., description="HuggingFace.co model name") + url: str = "http://127.0.0.1:26500/v1/completions" + + +class openaiClient(BaseClient): + def __init__(self, config: openaiClientConfig): + super().__init__(config) + + def start_service(self) -> None: + pass + + def stop_service(self) -> None: + pass + + def prepare_request(self, prompt: Prompt) -> Dict[str, Any]: + api_url = self.config.url + headers = { + "User-Agent": "Benchmark Client", + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" + } + pload = { + "prompt": prompt.text, + "model": self.config.model, + "n": 1, + "use_beam_search": False, + "temperature": 1.0, + "top_p": 0.9, + "max_tokens": prompt.max_new_tokens, + "ignore_eos": False, + } + return {"url": api_url, "headers": headers, "json": pload, "timeout": 180} + + def send_request(self, request_kwargs: Dict[str, Any]) -> Any: + response = requests.post(**request_kwargs) + output = json.loads(response.content) + return 
output + + def process_response(self, raw_response: Any) -> str: + return raw_response["choices"][0]["text"] diff --git a/benchmarks/inference/mii/src/client.py b/benchmarks/inference/mii/src/client.py index e8c656ab0..4e20d37c9 100644 --- a/benchmarks/inference/mii/src/client.py +++ b/benchmarks/inference/mii/src/client.py @@ -131,6 +131,80 @@ def get_response(response: requests.Response) -> List[str]: ) +# client talks with openai api +def call_openai( + input_tokens: str, max_new_tokens: int, args: argparse.Namespace +) -> ResponseDetails: + + api_url = args.openai_api_url + headers = { + "User-Agent": "Benchmark Client", + "Content-Type": "application/json", + "Authorization": f"Bearer {args.openai_api_key}" + } + + pload = { + "prompt": input_tokens, + "model": args.model, + "n": 1, + "use_beam_search": False, + "temperature": 1.0, + "top_p": 0.9, + "max_tokens": max_new_tokens, + "ignore_eos": False, + "stream": args.stream, + } + + def clear_line(n: int = 1) -> None: + LINE_UP = "\033[1A" + LINE_CLEAR = "\x1b[2K" + for _ in range(n): + print(LINE_UP, end=LINE_CLEAR, flush=True) + + def get_streaming_response( + response: requests.Response, time_last_token + ) -> Iterable[List[str]]: + for chunk in response.iter_lines( + chunk_size=8192, decode_unicode=False, delimiter=b"data:" + ): + if chunk: + plain=chunk.decode("utf-8") + if plain.strip() == "[DONE]": + continue + data = json.loads(plain) + output = data["choices"][0]["text"] + time_now = time.time() + yield output, time_now - time_last_token + time_last_token = time_now + + # For non-streaming, but currently non-streaming is not fully implemented + def get_response(response: requests.Response) -> List[str]: + data = json.loads(response.content) + output = data["choices"][0]["text"] + return output + + token_gen_time = [] + start_time = time.time() + #response = requests.post(api_url, headers=headers, json=pload, stream=False) + response = requests.post(api_url, headers=headers, json=pload, stream=args.stream) + if args.stream: + output = "" + for h, t in get_streaming_response(response, start_time): + output += h + token_gen_time.append(t) + else: + output = get_response(response) + + return ResponseDetails( + generated_tokens=output, + prompt=input_tokens, + start_time=start_time, + end_time=time.time(), + model_time=0, + token_gen_time=token_gen_time, + ) + + def call_aml( input_tokens: str, max_new_tokens: int, @@ -205,7 +279,7 @@ def _run_parallel( event_loop = asyncio.new_event_loop() asyncio.set_event_loop(event_loop) - backend_call_fns = {"fastgen": call_fastgen, "vllm": call_vllm, "aml": call_aml} + backend_call_fns = {"fastgen": call_fastgen, "vllm": call_vllm, "aml": call_aml, "openai": call_openai} call_fn = backend_call_fns[args.backend] barrier.wait() diff --git a/benchmarks/inference/mii/src/plot_effective_throughput.py b/benchmarks/inference/mii/src/plot_effective_throughput.py index 196f70211..2370a2e1e 100644 --- a/benchmarks/inference/mii/src/plot_effective_throughput.py +++ b/benchmarks/inference/mii/src/plot_effective_throughput.py @@ -15,9 +15,10 @@ def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--backend", type=str, choices=["fastgen", "vllm"], default=["fastgen", "vllm"], \ + parser.add_argument("--backend", type=str, choices=["fastgen", "vllm", "openai"], default=["fastgen", "vllm"], \ nargs="+", help="Specify the backends to generate plots for") parser.add_argument("--log_dir", type=Path, default="./results") + parser.add_argument("--model", type=str) 
parser.add_argument("--out_dir", type=Path, default="./plots/goodtput") parser.add_argument("--sla_prompt_tokens_per_sec", type=int, default=512, help="SLA prompt tokens per second") parser.add_argument("--sla_gen_tokens_per_sec", type=int, default=[1, 2, 3, 4, 6, 8], nargs="+", help="SLA generation tokens/s targets") @@ -76,7 +77,7 @@ def validate_token_ema_latency_SLA(response_detail, sla_token_gen, ema_span): def validate_prompt_latency_SLA(response_detail, sla_token_gen, f, sla_prompt_tokens_per_sec ): - tokenizer = get_tokenizer() + tokenizer = get_tokenizer(args.model) prompt_length = len(tokenizer.tokenize(response_detail.prompt)) prompt_latency_SLA = prompt_length / sla_prompt_tokens_per_sec if prompt_latency_SLA < response_detail.token_gen_time[0]: @@ -137,7 +138,9 @@ def output_charts(args, model, tp_size, bs, replicas, sla_token_gen, prompt, gen ] plt_cfg = {'vllm': {'label': 'vLLM', 'marker': 'x', 'color': 'orange'},\ - 'fastgen': {'label': 'DeepSpeed-FastGen', 'marker': 'o', 'color': 'blue'}} + 'fastgen': {'label': 'DeepSpeed-FastGen', 'marker': 'o', 'color': 'blue'}, \ + 'openai': {'label': 'openai-API', 'marker': '+', 'color': 'red'} + } for f in validate_funcs: plt.figure() diff --git a/benchmarks/inference/mii/src/postprocess_results.py b/benchmarks/inference/mii/src/postprocess_results.py index 4179f44b6..378925027 100644 --- a/benchmarks/inference/mii/src/postprocess_results.py +++ b/benchmarks/inference/mii/src/postprocess_results.py @@ -49,10 +49,13 @@ def parse_args(): return args -def get_tokenizer(): +def get_tokenizer(model=None): global tokenizer if tokenizer is None: - tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") + if model==None: + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") + else: + tokenizer = AutoTokenizer.from_pretrained(model) return tokenizer @@ -78,8 +81,8 @@ def get_summary(args, response_details): tokens_per_sec = mean( [ - (len(get_tokenizer().tokenize(r.prompt)) + - len(get_tokenizer().tokenize(r.generated_tokens)) if type(r.generated_tokens) == str + (len(get_tokenizer(args["model"]).tokenize(r.prompt)) + + len(get_tokenizer(args["model"]).tokenize(r.generated_tokens)) if type(r.generated_tokens) == str else len(r.generated_tokens)) / (r.end_time - r.start_time) for r in response_details diff --git a/benchmarks/inference/mii/src/server.py b/benchmarks/inference/mii/src/server.py index 56fd7930e..6d3c1cd69 100644 --- a/benchmarks/inference/mii/src/server.py +++ b/benchmarks/inference/mii/src/server.py @@ -19,6 +19,7 @@ def start_server(args: argparse.Namespace) -> None: "fastgen": start_fastgen_server, "vllm": start_vllm_server, "aml": start_aml_server, + "openai": start_openai_server, } start_fn = start_server_fns[args.backend] start_fn(args) @@ -90,12 +91,16 @@ def start_aml_server(args: argparse.Namespace) -> None: "AML server start not implemented. Please use Azure Portal to start the server." ) +def start_openai_server(args: argparse.Namespace) -> None: + # openai api has no command to stop server + pass def stop_server(args: argparse.Namespace) -> None: stop_server_fns = { "fastgen": stop_fastgen_server, "vllm": stop_vllm_server, "aml": stop_aml_server, + "openai": stop_openai_server, } stop_fn = stop_server_fns[args.backend] stop_fn(args) @@ -118,6 +123,9 @@ def stop_aml_server(args: argparse.Namespace) -> None: "AML server stop not implemented. Please use Azure Portal to stop the server." 
) +def stop_openai_server(args: argparse.Namespace) -> None: + # openai api has no command to stop server + pass if __name__ == "__main__": args = parse_args(server_args=True) diff --git a/benchmarks/inference/mii/src/utils.py b/benchmarks/inference/mii/src/utils.py index d3c1fee02..ac2065065 100644 --- a/benchmarks/inference/mii/src/utils.py +++ b/benchmarks/inference/mii/src/utils.py @@ -122,6 +122,18 @@ def parse_args( default="./results/", help="Directory to save result JSON files", ) + client_parser.add_argument( + "--openai_api_url", + type=str, + default=None, + help="When using the openai API backend, this is the API URL that points to an openai api server", + ) + client_parser.add_argument( + "--openai_api_key", + type=str, + default=None, + help="When using the openai API backend, this is the API key for a given openai_api_url", + ) client_parser.add_argument( "--aml_api_url", type=str, @@ -156,7 +168,7 @@ def parse_args( parser.add_argument( "--backend", type=str, - choices=["aml", "fastgen", "vllm"], + choices=["aml", "fastgen", "vllm", "openai"], default="fastgen", help="Which backend to benchmark", ) From 0d40b31613ff2ee5f135be727075bf576f6a556e Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 21 Aug 2024 13:51:39 -0400 Subject: [PATCH 47/58] DeepNVMe example scripts (#914) * DeepNVMe examples * Update files * Rewrite examples * Add README and rename files * Rename files * Fix naming * Fix typo * Add bash scripts; simplify code * README doc * Update READMe and scripts; Fix GDS examples * Address renaming feedback * Add operator setup instructions * Handle init with default args --------- Co-authored-by: Ubuntu --- deepnvme/file_access/README.md | 116 ++++++++++++++++++ deepnvme/file_access/aio_load_cpu_tensor.py | 31 +++++ deepnvme/file_access/aio_load_gpu_tensor.py | 32 +++++ deepnvme/file_access/aio_store_cpu_tensor.py | 40 ++++++ deepnvme/file_access/aio_store_gpu_tensor.py | 40 ++++++ deepnvme/file_access/gds_load_gpu_tensor.py | 33 +++++ deepnvme/file_access/gds_store_gpu_tensor.py | 39 ++++++ .../file_access/media/deepnvme_ops_report.png | Bin 0 -> 8964 bytes deepnvme/file_access/py_load_cpu_tensor.py | 22 ++++ deepnvme/file_access/py_load_gpu_tensor.py | 22 ++++ deepnvme/file_access/py_store_cpu_tensor.py | 26 ++++ deepnvme/file_access/py_store_gpu_tensor.py | 27 ++++ deepnvme/file_access/run_load_tensor.sh | 26 ++++ deepnvme/file_access/run_store_tensor.sh | 26 ++++ deepnvme/file_access/utils.py | 57 +++++++++ 15 files changed, 537 insertions(+) create mode 100644 deepnvme/file_access/README.md create mode 100644 deepnvme/file_access/aio_load_cpu_tensor.py create mode 100644 deepnvme/file_access/aio_load_gpu_tensor.py create mode 100644 deepnvme/file_access/aio_store_cpu_tensor.py create mode 100644 deepnvme/file_access/aio_store_gpu_tensor.py create mode 100644 deepnvme/file_access/gds_load_gpu_tensor.py create mode 100644 deepnvme/file_access/gds_store_gpu_tensor.py create mode 100644 deepnvme/file_access/media/deepnvme_ops_report.png create mode 100644 deepnvme/file_access/py_load_cpu_tensor.py create mode 100644 deepnvme/file_access/py_load_gpu_tensor.py create mode 100644 deepnvme/file_access/py_store_cpu_tensor.py create mode 100644 deepnvme/file_access/py_store_gpu_tensor.py create mode 100644 deepnvme/file_access/run_load_tensor.sh create mode 100644 deepnvme/file_access/run_store_tensor.sh create mode 100644 deepnvme/file_access/utils.py diff --git a/deepnvme/file_access/README.md b/deepnvme/file_access/README.md new file mode 100644 index 
000000000..1183908d8
--- /dev/null
+++ b/deepnvme/file_access/README.md
@@ -0,0 +1,116 @@
+# Using DeepNVMe for simple file reads and writes involving CPU/GPU tensors
+
+The purpose of this folder is to provide example codes that illustrate how to use DeepNVMe for simple file operations of moving raw data bytes between persistent storage and CPU/GPU tensors. For each file operation, we provide an implementation using Python I/O functionality, and a DeepNVMe implementation using CPU bounce buffer (aio) and NVIDIA Magnum IOTM GPUDirect® Storage (GDS) as appropriate.
+
+The following table is a mapping of file operations to the corresponding Python and DeepNVMe implementations.
+
+
+File Operation | Python | DeepNVMe (aio) | DeepNVMe (GDS)
+|---|---|---|---|
+Load CPU tensor from file | py_load_cpu_tensor.py | aio_load_cpu_tensor.py | - |
+Load GPU tensor from file | py_load_gpu_tensor.py | aio_load_gpu_tensor.py | gds_load_gpu_tensor.py |
+Store CPU tensor to file | py_store_cpu_tensor.py | aio_store_cpu_tensor.py | - |
+Store GPU tensor to file | py_store_gpu_tensor.py | aio_store_gpu_tensor.py | gds_store_gpu_tensor.py |
+
+The Python implementations are the scripts with the `py_` prefix, while the DeepNVMe implementations are those with the `aio_` and `gds_` prefixes.
+
+## Requirements
+Ensure your environment is properly configured to run these examples. First, you need to install DeepSpeed version >= 0.15.0. Next, ensure that the DeepNVMe operators are available in the DeepSpeed installation. The `async_io` operator is required for any DeepNVMe functionality, while the `gds` operator is required only for GDS functionality. You can confirm availability of each operator by inspecting the output of `ds_report` to check that compatible status is [OKAY]. Below is a snippet of `ds_report` output showing availability of both `async_io` and `gds` operators.
+
+<img src="./media/deepnvme_ops_report.png" />
+
+ds_report output showing availability of DeepNVMe operators (async_io and gds) in a DeepSpeed installation.
+
+
+If the `async_io` operator is unavailable, you will need to install the appropriate `libaio` library binaries for your Linux flavor. For example, Ubuntu users will need to run `apt install libaio-dev`. In general, you should carefully inspect `ds_report` output for helpful tips such as the following:
+
+```bash
+[WARNING] async_io requires the dev libaio .so object and headers but these were not found.
+[WARNING] async_io: please install the libaio-dev package with apt
+[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+```
+
+To enable the `gds` operator, you will need to install NVIDIA GDS by consulting the appropriate guide for [bare-metal systems](https://docs.nvidia.com/gpudirect-storage/troubleshooting-guide/index.html) or Azure VMs (coming soon).
+
+## Tensor Load Examples
+The tensor load example scripts share a common command-line interface, which is illustrated below using `py_load_cpu_tensor.py`.
+```bash
+$ python py_load_cpu_tensor.py --help
+usage: py_load_cpu_tensor.py [-h] --input_file INPUT_FILE [--loop LOOP] [--validate]
+
+options:
+  -h, --help            show this help message and exit
+  --input_file INPUT_FILE
+                        File on NVMe device that will read as input.
+  --loop LOOP           The number of times to repeat the operation (default 3).
+  --validate            Run validation step that compares tensor value against Python file read
+```
+Before running these example scripts ensure that the input file exists on an NVMe device. The `--validate` option is relevant only to the DeepNVMe implementations. This option provides minimal correctness checking by comparing against a tensor loaded using Python. We also provide a bash script `run_load_tensor.sh`, which runs all the example tensor load scripts.
+
+
+## Tensor Store Examples
+The tensor store examples share a command-line interface, which is illustrated below using `py_store_cpu_tensor.py`.
+```bash
+$ python py_store_cpu_tensor.py --help
+usage: py_store_cpu_tensor.py [-h] --nvme_folder NVME_FOLDER [--mb_size MB_SIZE] [--loop LOOP] [--validate]
+
+options:
+  -h, --help            show this help message and exit
+  --nvme_folder NVME_FOLDER
+                        NVMe folder for file write.
+  --mb_size MB_SIZE     Size of tensor to save in MB (default 1024).
+  --loop LOOP           The number of times to repeat the operation (default 3).
+  --validate            Run validation step that compares tensor value against Python file read
+
+```
+Before running these examples ensure that the output folder exists on an NVMe device and that you have write permission. The `--validate` option is relevant only to the DeepNVMe implementations. This option provides minimal correctness checking by comparing the output file against that created using Python. We also provide a bash script `run_store_tensor.sh`, which runs all the example tensor store scripts.
+
+
+## Performance Advisory
+Although this folder is primarily meant to help with integrating DeepNVMe into your Deep Learning applications, the example scripts also print out performance numbers of read and write throughput. So, we expect you will observe some performance advantage of DeepNVMe compared to Python. However, do note that it is likely that better performance can be realized by tuning DeepNVMe for your environment. Such tuning efforts will ideally generate more optimal values for configuring DeepNVMe.
+
+For reference, DeepNVMe configuration using hard-coded constants for `aio_` implementations is as follows:
+
+```python
+    aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1)
+```
+
+The corresponding DeepNVMe configuration for `gds_` implementations is as follows:
+
+```python
+    gds_handle = GDSBuilder().load().gds_handle(1024**2, 128, True, True, 1)
+```
+
+Despite the above caveat, it seems that some performance numbers would be useful here to help set the right expectations. The experiments were conducted on an Azure [NC80adis_H100_v5](https://learn.microsoft.com/en-us/azure/virtual-machines/ncads-h100-v5) series virtual machine (VM). This VM includes two 3.5TB local NVMe devices (labelled Microsoft NVMe Direct Disk v2) that we combined into a single RAID-0 volume. The software environment included Ubuntu 22.04.4 LTS, Linux kernel 6.5.0-26-generic, PyTorch 2.4, and CUDA 12.4. We ran experiments of 1GB data transfers using the unmodified scripts, i.e., without DeepNVMe tuning, and present the throughput results in the tables below. In summary, we observed that DeepNVMe significantly accelerates I/O operations compared to Python. DeepNVMe is 8-16X faster for loading tensor data, and 11X-19X faster for writing tensor data.
+
+Load 1GB CPU tensor (1GB file read) | GB/sec | Speedup over Python |
+|---|---|---|
+py_load_cpu_tensor.py | 1.5 | - |
+aio_load_cpu_tensor.py | 12.3 | 8X |
+
+Load 1GB GPU tensor (1GB file read) | GB/sec | Speedup over Python |
+|---|---|---|
+py_load_gpu_tensor.py | 0.7 | - |
+aio_load_gpu_tensor.py | 9.9 | 14X |
+gds_load_gpu_tensor.py | 11.1 | 16X |
+
+
+Store 1GB CPU tensor (1GB file write) | GB/sec | Speedup over Python |
+|---|---|---|
+py_store_cpu_tensor.py | 0.7 | - |
+aio_store_cpu_tensor.py | 8.1 | 11X |
+
+
+Store 1GB GPU tensor (1GB file write) | GB/sec | Speedup over Python |
+|---|---|---|
+py_store_gpu_tensor.py | 0.5 | - |
+aio_store_gpu_tensor.py | 8.3 | 18X |
+gds_store_gpu_tensor.py | 8.6 | 19X |
+
+
+
+# Conclusion
+We hope you find this document and example scripts useful for integrating DeepNVMe into your applications.
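The positional arguments passed to `aio_handle(...)` and `gds_handle(...)` above are the tuning knobs referred to in the Performance Advisory. A minimal sketch, assuming those arguments map to DeepSpeed's aio configuration settings (block size, queue depth, single submit, overlap events, thread count); check the handle signature in your installed DeepSpeed version before relying on these names:

```python
# Sketch only: the argument names below are assumptions based on DeepSpeed's aio
# configuration keys and are used here purely for readability.
from deepspeed.ops.op_builder import AsyncIOBuilder

block_size = 1024 ** 2    # bytes per individual I/O request
queue_depth = 128         # number of outstanding requests kept in flight
single_submit = True      # submit requests one at a time instead of batching
overlap_events = True     # overlap request submission with completion handling
num_threads = 1           # parallel I/O threads used by the handle

aio_handle = AsyncIOBuilder().load().aio_handle(
    block_size, queue_depth, single_submit, overlap_events, num_threads
)
```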
diff --git a/deepnvme/file_access/aio_load_cpu_tensor.py b/deepnvme/file_access/aio_load_cpu_tensor.py new file mode 100644 index 000000000..27a1e61c5 --- /dev/null +++ b/deepnvme/file_access/aio_load_cpu_tensor.py @@ -0,0 +1,31 @@ +import torch +import os, timeit, functools +from deepspeed.ops.op_builder import AsyncIOBuilder +from utils import parse_read_arguments, GIGA_UNIT + +def file_read(inp_f, handle, bounce_buffer): + handle.sync_pread(bounce_buffer, inp_f) + return bounce_buffer.cpu() + +def main(): + args = parse_read_arguments() + input_file = args.input_file + file_sz = os.path.getsize(input_file) + cnt = args.loop + + aio_handle = AsyncIOBuilder().load().aio_handle() + bounce_buffer = torch.empty(os.path.getsize(input_file), dtype=torch.uint8).pin_memory() + + t = timeit.Timer(functools.partial(file_read, input_file, aio_handle, bounce_buffer)) + aio_t = t.timeit(cnt) + aio_gbs = (cnt*file_sz)/GIGA_UNIT/aio_t + print(f'aio load_cpu: {file_sz/GIGA_UNIT} GB, {aio_t/cnt} secs, {aio_gbs:5.2f} GB/sec') + + if args.validate: + from py_load_cpu_tensor import file_read as py_file_read + aio_tensor = file_read(input_file, aio_handle, bounce_buffer) + py_tensor = py_file_read(input_file) + print(f'Validation success = {aio_tensor.equal(py_tensor)}') + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/aio_load_gpu_tensor.py b/deepnvme/file_access/aio_load_gpu_tensor.py new file mode 100644 index 000000000..aeecc6e5d --- /dev/null +++ b/deepnvme/file_access/aio_load_gpu_tensor.py @@ -0,0 +1,32 @@ +import torch +import os, timeit, functools +from deepspeed.ops.op_builder import AsyncIOBuilder +from utils import parse_read_arguments, GIGA_UNIT + +def file_read(inp_f, handle, bounce_buffer): + handle.sync_pread(bounce_buffer, inp_f) + return bounce_buffer.cuda() + + +def main(): + args = parse_read_arguments() + input_file = args.input_file + file_sz = os.path.getsize(input_file) + cnt = args.loop + + aio_handle = AsyncIOBuilder().load().aio_handle() + bounce_buffer = torch.empty(os.path.getsize(input_file), dtype=torch.uint8).pin_memory() + + t = timeit.Timer(functools.partial(file_read, input_file, aio_handle, bounce_buffer)) + aio_t = t.timeit(cnt) + aio_gbs = (cnt*file_sz)/GIGA_UNIT/aio_t + print(f'aio load_gpu: {file_sz/GIGA_UNIT} GB, {aio_t/cnt} secs, {aio_gbs:5.2f} GB/sec') + + if args.validate: + from py_load_cpu_tensor import file_read as py_file_read + aio_tensor = file_read(input_file, aio_handle, bounce_buffer).cpu() + py_tensor = py_file_read(input_file) + print(f'Validation success = {aio_tensor.equal(py_tensor)}') + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/aio_store_cpu_tensor.py b/deepnvme/file_access/aio_store_cpu_tensor.py new file mode 100644 index 000000000..20c03792b --- /dev/null +++ b/deepnvme/file_access/aio_store_cpu_tensor.py @@ -0,0 +1,40 @@ +import torch +import os, timeit, functools, pathlib +from deepspeed.ops.op_builder import AsyncIOBuilder +from utils import parse_write_arguments, GIGA_UNIT + +def file_write(out_f, tensor, handle, bounce_buffer): + bounce_buffer.copy_(tensor) + handle.sync_pwrite(bounce_buffer, out_f) + +def main(): + args = parse_write_arguments() + cnt = args.loop + output_file = os.path.join(args.nvme_folder, f'test_ouput_{args.mb_size}MB.pt') + pathlib.Path(output_file).unlink(missing_ok=True) + file_sz = args.mb_size*(1024**2) + app_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cpu', requires_grad=False) + + aio_handle = AsyncIOBuilder().load().aio_handle() + bounce_buffer = 
torch.empty(file_sz, dtype=torch.uint8, requires_grad=False).pin_memory() + + + t = timeit.Timer(functools.partial(file_write, output_file, app_tensor, aio_handle, bounce_buffer)) + + aio_t = t.timeit(cnt) + aio_gbs = (cnt*file_sz)/GIGA_UNIT/aio_t + print(f'aio store_cpu: {file_sz/GIGA_UNIT} GB, {aio_t/cnt} secs, {aio_gbs:5.2f} GB/sec') + + if args.validate: + import tempfile, filecmp + from py_store_cpu_tensor import file_write as py_file_write + py_ref_file = os.path.join(tempfile.gettempdir(), os.path.basename(output_file)) + py_file_write(py_ref_file, app_tensor) + filecmp.clear_cache() + print(f'Validation success = {filecmp.cmp(py_ref_file, output_file, shallow=False) }') + + pathlib.Path(output_file).unlink(missing_ok=True) + + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/aio_store_gpu_tensor.py b/deepnvme/file_access/aio_store_gpu_tensor.py new file mode 100644 index 000000000..71a4aa7bb --- /dev/null +++ b/deepnvme/file_access/aio_store_gpu_tensor.py @@ -0,0 +1,40 @@ +import torch +import os, timeit, functools, pathlib +from deepspeed.ops.op_builder import AsyncIOBuilder +from utils import parse_write_arguments, GIGA_UNIT + +def file_write(out_f, tensor, handle, bounce_buffer): + bounce_buffer.copy_(tensor) + handle.sync_pwrite(bounce_buffer, out_f) + +def main(): + args = parse_write_arguments() + cnt = args.loop + output_file = os.path.join(args.nvme_folder, f'test_ouput_{args.mb_size}MB.pt') + pathlib.Path(output_file).unlink(missing_ok=True) + file_sz = args.mb_size*(1024**2) + app_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) + + aio_handle = AsyncIOBuilder().load().aio_handle() + bounce_buffer = torch.empty(file_sz, dtype=torch.uint8, requires_grad=False).pin_memory() + + + t = timeit.Timer(functools.partial(file_write, output_file, app_tensor, aio_handle, bounce_buffer)) + + aio_t = t.timeit(cnt) + aio_gbs = (cnt*file_sz)/GIGA_UNIT/aio_t + print(f'aio store_gpu: {file_sz/GIGA_UNIT} GB, {aio_t/cnt} secs, {aio_gbs:5.2f} GB/sec') + + if args.validate: + import tempfile, filecmp + from py_store_cpu_tensor import file_write as py_file_write + py_ref_file = os.path.join(tempfile.gettempdir(), os.path.basename(output_file)) + py_file_write(py_ref_file, app_tensor) + filecmp.clear_cache() + print(f'Validation success = {filecmp.cmp(py_ref_file, output_file, shallow=False) }') + + pathlib.Path(output_file).unlink(missing_ok=True) + + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/gds_load_gpu_tensor.py b/deepnvme/file_access/gds_load_gpu_tensor.py new file mode 100644 index 000000000..dd6273707 --- /dev/null +++ b/deepnvme/file_access/gds_load_gpu_tensor.py @@ -0,0 +1,33 @@ +import torch +import os, timeit, functools +from utils import parse_read_arguments, GIGA_UNIT +from deepspeed.ops.op_builder import GDSBuilder + +def file_read(inp_f, handle, gpu_buffer): + handle.sync_pread(gpu_buffer, inp_f) + return gpu_buffer.cuda() + +def main(): + args = parse_read_arguments() + input_file = args.input_file + file_sz = os.path.getsize(input_file) + cnt = args.loop + + gds_handle = GDSBuilder().load().gds_handle() + gds_buffer = gds_handle.new_pinned_device_tensor(file_sz, torch.empty(0, dtype=torch.uint8, device='cuda', requires_grad=False)) + + t = timeit.Timer(functools.partial(file_read, input_file, gds_handle, gds_buffer)) + gds_t = t.timeit(cnt) + gds_gbs = (cnt*file_sz)/GIGA_UNIT/gds_t + print(f'gds load_gpu: {file_sz/GIGA_UNIT} GB, {gds_t/cnt} secs, {gds_gbs:5.2f} GB/sec') + + if args.validate: 
+        from py_load_cpu_tensor import file_read as py_file_read
+        aio_tensor = file_read(input_file, gds_handle, gds_buffer).cpu()
+        py_tensor = py_file_read(input_file)
+        print(f'Validation success = {aio_tensor.equal(py_tensor)}')
+
+    gds_handle.free_pinned_device_tensor(gds_buffer)
+
+if __name__ == "__main__":
+    main()
diff --git a/deepnvme/file_access/gds_store_gpu_tensor.py b/deepnvme/file_access/gds_store_gpu_tensor.py
new file mode 100644
index 000000000..06ba508ba
--- /dev/null
+++ b/deepnvme/file_access/gds_store_gpu_tensor.py
@@ -0,0 +1,39 @@
+import torch
+import os, timeit, functools, pathlib
+from deepspeed.ops.op_builder import GDSBuilder
+from utils import parse_write_arguments, GIGA_UNIT
+
+def file_write(out_f, tensor, handle, gpu_buffer):
+    gpu_buffer.copy_(tensor)
+    handle.sync_pwrite(gpu_buffer, out_f)
+
+def main():
+    args = parse_write_arguments()
+    cnt = args.loop
+    output_file = os.path.join(args.nvme_folder, f'test_ouput_{args.mb_size}MB.pt')
+    pathlib.Path(output_file).unlink(missing_ok=True)
+    file_sz = args.mb_size*(1024**2)
+    app_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False)
+
+    gds_handle = GDSBuilder().load().gds_handle()
+    gds_buffer = gds_handle.new_pinned_device_tensor(file_sz, torch.empty(0, dtype=torch.uint8, device='cuda', requires_grad=False))
+
+    t = timeit.Timer(functools.partial(file_write, output_file, app_tensor, gds_handle, gds_buffer))
+
+    gds_t = t.timeit(cnt)
+    gds_gbs = (cnt*file_sz)/GIGA_UNIT/gds_t
+    print(f'gds store_gpu: {file_sz/GIGA_UNIT} GB, {gds_t/cnt} secs, {gds_gbs:5.2f} GB/sec')
+
+    if args.validate:
+        import tempfile, filecmp
+        from py_store_cpu_tensor import file_write as py_file_write
+        py_ref_file = os.path.join(tempfile.gettempdir(), os.path.basename(output_file))
+        py_file_write(py_ref_file, app_tensor)
+        filecmp.clear_cache()
+        print(f'Validation success = {filecmp.cmp(py_ref_file, output_file, shallow=False) }')
+
+    gds_handle.free_pinned_device_tensor(gds_buffer)
+    pathlib.Path(output_file).unlink(missing_ok=True)
+
+if __name__ == "__main__":
+    main()
diff --git a/deepnvme/file_access/media/deepnvme_ops_report.png b/deepnvme/file_access/media/deepnvme_ops_report.png
new file mode 100644
index 0000000000000000000000000000000000000000..c05e9b863b77c6810be396bc382ca285d4e719b3
Binary files /dev/null and b/deepnvme/file_access/media/deepnvme_ops_report.png differ
diff --git a/deepnvme/file_access/py_load_cpu_tensor.py b/deepnvme/file_access/py_load_cpu_tensor.py
new file mode 100644
index 000000000..0650848f0
--- /dev/null
+++ b/deepnvme/file_access/py_load_cpu_tensor.py
@@ -0,0 +1,22 @@ +import torch +import os, timeit, functools +from utils import parse_read_arguments, GIGA_UNIT + +def file_read(inp_f): + with open(inp_f, 'rb') as f: + tensor = torch.frombuffer(f.read(), dtype=torch.uint8) + return tensor + +def main(): + args = parse_read_arguments() + input_file = args.input_file + file_sz = os.path.getsize(input_file) + cnt = args.loop + + t = timeit.Timer(functools.partial(file_read, input_file)) + py_t = t.timeit(cnt) + py_gbs = (cnt*file_sz)/GIGA_UNIT/py_t + print(f'py load_cpu: {file_sz/GIGA_UNIT} GB, {py_t/cnt} secs, {py_gbs:5.2f} GB/sec') + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/py_load_gpu_tensor.py b/deepnvme/file_access/py_load_gpu_tensor.py new file mode 100644 index 000000000..976967dca --- /dev/null +++ b/deepnvme/file_access/py_load_gpu_tensor.py @@ -0,0 +1,22 @@ +import torch +import os, timeit, functools +from utils import parse_read_arguments, GIGA_UNIT + +def file_read(inp_f): + with open(inp_f, 'rb') as f: + tensor = torch.frombuffer(f.read(), dtype=torch.uint8) + return tensor.cuda() + +def main(): + args = parse_read_arguments() + input_file = args.input_file + file_sz = os.path.getsize(input_file) + cnt = args.loop + + t = timeit.Timer(functools.partial(file_read, input_file)) + py_t = t.timeit(cnt) + py_gbs = (cnt*file_sz)/GIGA_UNIT/py_t + print(f'py load_gpu: {file_sz/GIGA_UNIT} GB, {py_t/cnt} secs, {py_gbs:5.2f} GB/sec') + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/py_store_cpu_tensor.py b/deepnvme/file_access/py_store_cpu_tensor.py new file mode 100644 index 000000000..50e477186 --- /dev/null +++ b/deepnvme/file_access/py_store_cpu_tensor.py @@ -0,0 +1,26 @@ +import torch +import os, timeit, functools +import pathlib +from utils import parse_write_arguments, GIGA_UNIT + +def file_write(out_f, tensor): + with open(out_f, 'wb') as f: + f.write(tensor.numpy(force=True)) + +def main(): + args = parse_write_arguments() + cnt = args.loop + output_file = os.path.join(args.nvme_folder, f'test_ouput_{args.mb_size}MB.pt') + pathlib.Path(output_file).unlink(missing_ok=True) + file_sz = args.mb_size*(1024**2) + cpu_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cpu', requires_grad=False) + + t = timeit.Timer(functools.partial(file_write, output_file, cpu_tensor)) + + py_t = t.timeit(cnt) + py_gbs = (cnt*file_sz)/GIGA_UNIT/py_t + print(f'py store_cpu: {file_sz/GIGA_UNIT} GB, {py_t/cnt} secs, {py_gbs:5.2f} GB/sec') + pathlib.Path(output_file).unlink(missing_ok=True) + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/py_store_gpu_tensor.py b/deepnvme/file_access/py_store_gpu_tensor.py new file mode 100644 index 000000000..a64209a12 --- /dev/null +++ b/deepnvme/file_access/py_store_gpu_tensor.py @@ -0,0 +1,27 @@ +import torch +import os, timeit, functools +import pathlib +from utils import parse_write_arguments, GIGA_UNIT + +def file_write(out_f, tensor): + with open(out_f, 'wb') as f: + f.write(tensor.numpy(force=True)) + +def main(): + args = parse_write_arguments() + cnt = args.loop + output_file = os.path.join(args.nvme_folder, f'test_ouput_{args.mb_size}MB.pt') + pathlib.Path(output_file).unlink(missing_ok=True) + file_sz = args.mb_size*(1024**2) + gpu_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) + + t = timeit.Timer(functools.partial(file_write, output_file, gpu_tensor)) + + py_t = t.timeit(cnt) + py_gbs = (cnt*file_sz)/GIGA_UNIT/py_t + print(f'py store_gpu: {file_sz/GIGA_UNIT} GB, {py_t/cnt} secs, 
{py_gbs:5.2f} GB/sec')
+    pathlib.Path(output_file).unlink(missing_ok=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/deepnvme/file_access/run_load_tensor.sh b/deepnvme/file_access/run_load_tensor.sh
new file mode 100644
index 000000000..e410c98b9
--- /dev/null
+++ b/deepnvme/file_access/run_load_tensor.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+if [[ $# -ne 1 ]]; then
+    echo "Usage: $0 <input file>"
+    exit 1
+fi
+
+input_file=$1
+if ! [[ -f "$input_file" ]]; then
+    echo "Error: $input_file does not exist"
+    exit 1
+fi
+
+
+echo "Running load tensor examples using $input_file"
+for f in aio_load_cpu_tensor.py aio_load_gpu_tensor.py \
+    gds_load_gpu_tensor.py \
+    py_load_cpu_tensor.py py_load_gpu_tensor.py; do
+    cmd="python $f --input_file $input_file"
+    sync
+    echo $cmd
+    eval $cmd
+    sleep 2
+done
+
+
diff --git a/deepnvme/file_access/run_store_tensor.sh b/deepnvme/file_access/run_store_tensor.sh
new file mode 100644
index 000000000..a10b3c219
--- /dev/null
+++ b/deepnvme/file_access/run_store_tensor.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+if [[ $# -ne 1 ]]; then
+    echo "Usage: $0 <output folder>"
+    exit 1
+fi
+
+output_folder=$1
+if ! [[ -d "$output_folder" ]]; then
+    echo "Error: $output_folder does not exist"
+    exit 1
+fi
+
+
+echo "Running store tensor examples using $output_folder"
+for f in aio_store_cpu_tensor.py aio_store_gpu_tensor.py \
+    gds_store_gpu_tensor.py \
+    py_store_cpu_tensor.py py_store_gpu_tensor.py; do
+    cmd="python $f --nvme_folder $output_folder"
+    sync
+    echo $cmd
+    eval $cmd
+    sleep 2
+done
+
+
diff --git a/deepnvme/file_access/utils.py b/deepnvme/file_access/utils.py
new file mode 100644
index 000000000..e83168349
--- /dev/null
+++ b/deepnvme/file_access/utils.py
@@ -0,0 +1,57 @@
+import os
+import argparse
+
+GIGA_UNIT = 1024**3
+
+def parse_read_arguments():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input_file',
+                        default=None,
+                        type=str,
+                        required=True,
+                        help='File on NVMe device that will be read as input.')
+    parser.add_argument('--loop',
+                        type=int,
+                        default=3,
+                        help='The number of times to repeat the operation (default 3).')
+    parser.add_argument('--validate',
+                        action="store_true",
+                        help="Run validation step that compares tensor value against Python file read")
+
+    args = parser.parse_args()
+    print(f'args = {args}')
+    if not os.path.isfile(args.input_file):
+        print(f'Invalid input file path: {args.input_file}')
+        quit()
+
+    return args
+
+
+
+def parse_write_arguments():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--nvme_folder',
+                        default=None,
+                        type=str,
+                        required=True,
+                        help='NVMe folder that will be used for file write.')
+    parser.add_argument('--mb_size',
+                        type=int,
+                        default=1024,
+                        help='Size of tensor to save in MB (default 1024).')
+    parser.add_argument('--loop',
+                        type=int,
+                        default=3,
+                        help='The number of times to repeat the operation (default 3).')
+    parser.add_argument('--validate',
+                        action="store_true",
+                        help="Run validation step that compares tensor value against Python file read")
+
+    args = parser.parse_args()
+    print(f'args = {args}')
+    if not os.path.isdir(args.nvme_folder):
+        print(f'Invalid output folder path: {args.nvme_folder}')
+        quit()
+
+    return args
+
From 957ae3141946daf9a6bc5731e261032a13a82f05 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Sat, 24 Aug 2024 08:01:51 -0700
Subject: [PATCH 48/58] DeepNVMe README.md add xref (#919)

---
 deepnvme/file_access/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepnvme/file_access/README.md b/deepnvme/file_access/README.md
index 
1183908d8..a50f6f438 100644 --- a/deepnvme/file_access/README.md +++ b/deepnvme/file_access/README.md @@ -1,6 +1,6 @@ # Using DeepNVMe for simple file reads and writes involving CPU/GPU tensors -The purpose of this folder is to provide example codes that illustrate how to use DeepNVMe for simple file operations of moving raw data bytes between persistent storage and CPU/GPU tensors. For each file operation, we provide an implementation using Python I/O functionality, and a DeepNVMe implementation using CPU bounce buffer (aio) and NVIDIA Magnum IOTM GPUDirect® Storage (GDS) as appropriate. +The purpose of this folder is to provide example codes that illustrate how to use [DeepNVMe](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-gds/README.md) for simple file operations of moving raw data bytes between persistent storage and CPU/GPU tensors. For each file operation, we provide an implementation using Python I/O functionality, and a DeepNVMe implementation using CPU bounce buffer (aio) and NVIDIA Magnum IOTM GPUDirect® Storage (GDS) as appropriate. The following table is a mapping of file operations to the corresponding Python and DeepNVMe implementations. From 1293d450fe4cc48b5ee3398dd7b05a7019b742e6 Mon Sep 17 00:00:00 2001 From: Heyang Qin Date: Tue, 3 Sep 2024 13:22:19 -0700 Subject: [PATCH 49/58] extend max_prompt_length and input text for 128k evaluation (#891) * extend max_prompt_length and input text for 128k evaluation * Extend max_prompt_length and input text for 128k evaluation --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- benchmarks/inference/mii/src/client.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/benchmarks/inference/mii/src/client.py b/benchmarks/inference/mii/src/client.py index 4e20d37c9..85f5207ea 100644 --- a/benchmarks/inference/mii/src/client.py +++ b/benchmarks/inference/mii/src/client.py @@ -347,6 +347,14 @@ def run_client(args): p.start() tokenizer = AutoTokenizer.from_pretrained(args.model) + + # make sure max_prompt_length is longer than the target prompt length + args.max_prompt_length = max(args.max_prompt_length, int(args.mean_prompt_length * 3)) + # check if the all_text is longer than the max prompt length, if not expand it + global all_text + while len(tokenizer.tokenize(all_text)) < args.max_prompt_length: + all_text += all_text + query_generator = RandomQueryGenerator(all_text, tokenizer, seed=42) request_text = query_generator.get_random_request_text( args.mean_prompt_length, From c961379a4170a235e3623b800c00496e2cda634b Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Tue, 3 Sep 2024 16:47:06 -0700 Subject: [PATCH 50/58] Update requirements for opencv-python CVE (#925) * Update requirements for opencv-python CVE * Dont' update numpy for now --- .../examples/research_projects/lxmert/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/MoQ/huggingface-transformers/examples/research_projects/lxmert/requirements.txt b/training/MoQ/huggingface-transformers/examples/research_projects/lxmert/requirements.txt index 9028e302b..69bc6ba07 100644 --- a/training/MoQ/huggingface-transformers/examples/research_projects/lxmert/requirements.txt +++ b/training/MoQ/huggingface-transformers/examples/research_projects/lxmert/requirements.txt @@ -48,7 +48,7 @@ nbformat==5.0.7 nest-asyncio==1.4.0 notebook==6.1.5 numpy==1.19.2 -opencv-python==4.4.0.42 +opencv-python==4.10.0.84 packaging==20.3 pandas==1.1.2 
pandocfilters==1.4.2 From a256c04a04075d3f33ebdf1db2e76dd31e81070f Mon Sep 17 00:00:00 2001 From: Jiahao Li Date: Tue, 10 Sep 2024 20:38:22 +0800 Subject: [PATCH 51/58] Fix labels & eos_token for SFT (#819) --- .../DeepSpeed-Chat/dschat/utils/data/data_utils.py | 10 +++++++--- .../training/step1_supervised_finetuning/main.py | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py b/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py index 0b9d6df64..7e3e6776b 100644 --- a/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py +++ b/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py @@ -149,9 +149,13 @@ def __len__(self): def __getitem__(self, idx): if self.train_phase == 1: return { - "input_ids": self.chosen_dataset[idx]["input_ids"], - "attention_mask": self.chosen_dataset[idx]["attention_mask"], - "labels": self.chosen_dataset[idx]["input_ids"] + "input_ids": + self.chosen_dataset[idx]["input_ids"], + "attention_mask": + self.chosen_dataset[idx]["attention_mask"], + "labels": + torch.where(self.chosen_dataset[idx]["attention_mask"].bool(), + self.chosen_dataset[idx]["input_ids"], -100) } elif self.train_phase == 2: return self.chosen_dataset[idx]["input_ids"], self.chosen_dataset[idx]["attention_mask"], \ diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py index c37d1f4cd..d9527af54 100755 --- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py +++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py @@ -270,6 +270,7 @@ def main(): args.seed, tokenizer, args.max_seq_len, + end_of_conversation_token=tokenizer.eos_token, sft_only_data_path=args.sft_only_data_path) # DataLoaders creation: if args.local_rank == -1: From 90c2a9f2125075943a0a2ca2cb8f568cbc95077c Mon Sep 17 00:00:00 2001 From: Joe Mayer <114769929+jomayeri@users.noreply.github.com> Date: Tue, 17 Sep 2024 06:28:00 -0700 Subject: [PATCH 52/58] DeepNVMe ZeRO-inf Tutorial (#921) * rough draft of zero inf w/deepnvme * fix format issue * another reformat * another reformat * adding mem usage * correct pic name * changing to mixtral model --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- deepnvme/zero_inference/README.md | 28 ++++++++++++++++ deepnvme/zero_inference/media/nvme_config.png | Bin 0 -> 17884 bytes .../media/zero_inf_mem_use_cpu.png | Bin 0 -> 41786 bytes .../media/zero_inf_mem_use_gds.png | Bin 0 -> 39589 bytes .../huggingface/zero_inference/run_model.py | 31 ++++++++++++++---- 5 files changed, 53 insertions(+), 6 deletions(-) create mode 100644 deepnvme/zero_inference/README.md create mode 100755 deepnvme/zero_inference/media/nvme_config.png create mode 100755 deepnvme/zero_inference/media/zero_inf_mem_use_cpu.png create mode 100755 deepnvme/zero_inference/media/zero_inf_mem_use_gds.png diff --git a/deepnvme/zero_inference/README.md b/deepnvme/zero_inference/README.md new file mode 100644 index 000000000..3214ad5ee --- /dev/null +++ b/deepnvme/zero_inference/README.md @@ -0,0 +1,28 @@ +# Using DeepNVMe for ZeRO-Inference +ZeRO-inference is an ideal use case for the DeepNVMe technology. When you have a model that exceeds the size of availabe GPU memory the [DeepNVMe](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-gds/README.md) library along with ZeRO-inference can be leveraged for high-throughput offline inference. 
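+
+As a rough sketch of how this is wired up (not the exact settings used by `run_model.py`; the model name, NVMe path, and buffer sizes below are placeholders), ZeRO-Inference offloads parameters through a ZeRO stage 3 configuration whose `offload_param` section points at the NVMe volume, while the `aio` section exposes the DeepNVMe tuning knobs:
+
+```python
+import torch
+import deepspeed
+from transformers import AutoModelForCausalLM
+
+ds_config = {
+    "fp16": {"enabled": True},
+    "zero_optimization": {
+        "stage": 3,
+        "offload_param": {
+            "device": "nvme",
+            "nvme_path": "/mnt/nvme_raid0",  # placeholder: a folder on the (RAIDed) NVMe volume
+            "pin_memory": True,
+            "buffer_count": 5,
+            "buffer_size": 1_000_000_000,
+        },
+    },
+    # DeepNVMe (aio) knobs that control how parameter files are read from NVMe
+    "aio": {
+        "block_size": 1048576,
+        "queue_depth": 8,
+        "thread_count": 1,
+        "single_submit": False,
+        "overlap_events": True,
+    },
+    "train_micro_batch_size_per_gpu": 1,
+}
+
+# Partition and offload parameters while the checkpoint loads, then wrap the model for inference.
+with deepspeed.zero.Init(config_dict_or_path=ds_config):
+    model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", torch_dtype=torch.float16)
+
+engine, *_ = deepspeed.initialize(model=model, config=ds_config)
+engine.module.eval()
+```
+
+The complete benchmark invocation used for the results below is the `deepspeed ... run_model.py` command shown under Initial Results.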
+
+Maximizing inference throughput (measured in tokens/sec) in this scenario has two parts. The first is offloading the model parameters to fast Non-Volatile Memory, either a single device or several devices RAIDed together to further increase the effective bandwidth of the system. These parameters are then swapped into GPU memory layer by layer to compute the forward pass for inference. This enables the second part of the process, maximizing the batch size: because parameters are swapped in layer by layer, the remaining GPU memory can be used by the computational batch, which maximizes total inference throughput.
+
+## Testing Environment
+The environment for these tests was a VM with NVIDIA Magnum IO™ GPUDirect® Storage (GDS) installed, along with a single NVIDIA H100 GPU containing 96 GB of memory. The VM also had two NVMes, each with a read bandwidth of ~6.5 GB/sec. The two NVMes were put into a RAID0 configuration, bringing the effective read bandwidth up to ~13 GB/sec.
+
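+
+A quick way to sanity-check the aggregate read bandwidth of the RAID0 volume is a timed read in the spirit of the `file_access` examples added earlier in this series (the file path below is a placeholder for any large file on the volume):
+
+```python
+import os, timeit, functools
+import torch
+
+def file_read(path):
+    # Read the whole file and wrap the raw bytes in a uint8 tensor
+    with open(path, 'rb') as f:
+        return torch.frombuffer(f.read(), dtype=torch.uint8)
+
+input_file = "/mnt/nvme_raid0/test_1GB.bin"  # placeholder path on the RAID0 volume
+file_sz = os.path.getsize(input_file)
+loops = 3
+secs = timeit.Timer(functools.partial(file_read, input_file)).timeit(loops)
+print(f"{(loops * file_sz) / (1024**3) / secs:5.2f} GB/sec")
+```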
+![NVMe configuration](./media/nvme_config.png)
+
+## Initial Results
+The following models were run from the folder DeepSpeedExamples/inference/huggingface/zero_inference using disk-offload of parameters via the following command:
+
+```bash
+deepspeed --num_gpus 1 run_model.py --model $model_name --batch_size $bsz --prompt-len 512 --gen-len 32 --disk-offload $path_to_folder --use_gds
+```
+
+Where `--use_gds` is set to enable NVIDIA GDS and move parameters directly between the NVMe and GPU; otherwise an intermediate CPU bounce buffer will be used to move the parameters between the NVMe and GPU.
+
+All models tested were chosen so they could not fit into 96 GB of GPU memory.
+
+| GDS   | Mixtral-8x22B  | Llama3-70B    | Bloom-176B   |
+|-------|----------------|---------------|--------------|
+| False | 9.152(bsz=200) | 8.606(bsz=96) | 0.291(bsz=8) |
+| True  | 9.233(bsz=200) | 8.876(bsz=96) | 0.293(bsz=8) |
+
+Throughput measured in tokens/sec.
diff --git a/deepnvme/zero_inference/media/nvme_config.png b/deepnvme/zero_inference/media/nvme_config.png
new file mode 100755
index 0000000000000000000000000000000000000000..3c61cbb4cbd53d68276693790746f1e12fb92a86
GIT binary patch
literal 17884
literal 0 HcmV?d00001

diff --git a/deepnvme/zero_inference/media/zero_inf_mem_use_cpu.png b/deepnvme/zero_inference/media/zero_inf_mem_use_cpu.png
new file mode 100755
index 0000000000000000000000000000000000000000..7857265af5c777b0908cff933fe42eef77b2b5d6
GIT binary patch
literal 41786

literal 0 HcmV?d00001

diff --git a/deepnvme/zero_inference/media/zero_inf_mem_use_gds.png b/deepnvme/zero_inference/media/zero_inf_mem_use_gds.png
new file mode 100755
index 0000000000000000000000000000000000000000..fd0087ed6391a83b53cd247651f60bcb4036b513
GIT binary patch
literal 39589
z{{XsF&x3PYjiTjb8$UZY2^rov()3YuQhFs5H>o@4$uB;L(ZH@M@ zA%D?d@My+SY1qBqP&tu&>zU7v`sF>2q690#+I1!u!tlpUZf2H5w8W$bANwq#s_C`& zq0epp{NZhIm-l5{WJ1t&y~yK{4|X{QTp)TPRy17#N_L}NNNZKlu`o`74H_K)Prk@%jt?+@5obbwT z@{O!%T>uKFmua6a_Qs1=viy)vnM?E5U9gq*)8#Vs?HS8mwbiEwJ~3L0K(DN069fr4 z5oz8ZlIGSEqiYE0JiBDyIHN$LXn;?engENYAFF|9Nmb z>5FR9JJE4&&Hj}f=bb^ZH6J|8TRDJc)~bpz6<2fMF6%Pp&;Kr6pjj&o0!>v$& zgF};r9#8*_fdyU1vSQ_$TubwHQkZi?Li=LHPLwhzX&(Hu^H^ar^Gz&y-!%pjQ=-;1 zgO*^VxY<3|QIEO!vQHvRfcr-cRa)mG#$;SibDFo>q-#It*g2I48v=yr;9YV_T)(cv zYIR*%o0KXT;HO0QGy`9TFHX-BU*w13%Dlm^MIW$xr1Bh_dVFD>R(#ufrUtz7&b1XK zMAJ~$Fht6gc>bK~)g+~#ByR-9!X{)hn1VbU#Re=U)0WcCf>KuL03o0D7G4r&?};y- zU#vLk;%Q%Q>g!*wuUiDzNA*9$zp54((thTbz<%Z-@V1^#u5} zv>bSJ&gk;I{$_My$R}-k{RdoEiw7p}&U4xZsdUV4K9GHIgHZAMlD1}9DRF(>;3`ma zSMU&qQ`G?7^trRmo|-?oP^12{6y-8=+g-jwp}m!89J^(o>6VqIQ540oBC=M9Tisp| zt1hTsU#U)5FU)Vcc9<2Fkb`oo+LBej+A^#CB`y_fx^8>3-Ghz#1+Qsg&4ZPEhZ;(W zQ`70T*5N_sb5zWBJi!4Ed#FlzmNmfQ`r5V+ISd~?iWP(uXrWwgv*)zM>~}xWX~p;j z^j(lrv$C0$F~-m7qvTh1?b;odxTP%h(Yp8IF%v~lOmmEEmJp1-)`%-2C1Z1MTEKnd z(9y zJE4MU5gG|EfDDtacT&(AFvPRcA9WDh^5;nLb>)J%t^yD^}5o}SV6I>3^WPU zyI(pda%kPB6yaGcsSQp)2nqYZ_#=4C<7HJn6SdKw|_0@#TUxCTwu0%tRyfj##L0q5Bo9go$ghyJU(D zAH)Sk?92qV9v0cjoF$}}-!_XeVK&iNO|Al$Wo_5j7?^Q9>v9ZthX^iirG;)jG6cyV z8D(g+-@hc(F}+<2M2EWumd`Brz8DaA(u%C_o8Xg+^%hX40AzlWf@E|nM|+;q21=&Y z03p!NlJ89&o>H3L-zF1Vr5pooah1Y= zPtP}Qab;h!`6*^80CfnfQCyN^BKi033)r^j(;q)*gq1wEx3K@qOhckE&`FayI-bqN zbNUH)xwP)(rPqtWQ@ZbP4+FU~Nl0E_`450b_ILd^BZv2bTpD!!y3oO8k`tcqW(I*7 z{}^Y!w`-lI{Ts_InEo3woFlORtmEs@TQ0{nOxC-+*?s=4k;-%De&Lo?!Ji{}`0dGH-ASnpiIw^fIaDg9wJ4XB7jBP&K z43!uTb1{8&f_yY&uEuxx>v=HkfpU_Y&;hTZoXt9m_uj7@Wk#)y*pUx{8(*--eIfg9 z%^3@wlkB384@t}-x9ieOb(G-T%ghHg;^=VNO|PK`{*2BUFQ6062nJXWo4e3C2*|lo z3Fko1r*+wLDz@$&)Y_yU;RIUF{Iixu5M$CC&ly}3*ODy!-iuSA@w`>1Bg?~2FzlCU zT9F47-iy0-sbyF1E1Vzn_o$ZvoA5!tzCEXJ={{~J^;=^k z$L>-;ePk0U^xbkW9(vxE>*s(ED>X`3T4h}*b~3}&q@%2pTd_{!!?raKw__NnDmgfe zrXdUk#~sSQ@lnvd6cDES6tTlQ=S|G z$xZQMfJ1u8gw~{BW;l10YdGuqHCeEVzHVrl>5_FmTc{tfah~$fU!l1oSa00*A*l?v z1nGP<;e2M=^xEm(u;U|wK#{+4^?`>$n?D=X&Z;Vv!-e=0QKQcW3fOIjJjv8(_2u3Swg}o3B$tm|dFQ)0=(Y~^DyNd07 z`54*r-a4Dsd10eq%F#tGhKkweO-pW+Ow=b^w)x90or<$Bv7OY`Z6_qUPOlpz+^m1E zKe1S-0z|od;A=ku%j9nZ2ppdRdxny%s~*q0Pi~SXeKrwjfbWY%&f9f~bltjW>zG#o zdMls=HJm4*r_c5==xSHZg>&4iwywY=X6`Dj;ulkfR6(rm$N1fEM+CMlUJ%=NnYJ+1 z2Bk_x6Q*eHPpQOeq8nM&m((L0ZZrejOFDG-lKoAi{5e^m_jGgI%p(*vuYaROvLKM^ z6Y0D<$sUA9vAMVTAAnP;3=b9K_DBSD~rgbGB?{F9wp0R`XF8W@>Gt z{T{qCib$$?iMyDr_fljC$in{Vwe=n#ZwRdcb}&`SvaEGDS$uQQ?6+5@qO zdBJ~Ql#l>D1SV)S%>hb*ivKZkFASinU-N@h=q2ngk5a5E(C;0e1faY2w_!9E)vxzH zlIv}-?8eZ9*&E&0e(&jIK{J%csiG)*1%~JSu&JHT+p+JO%+x|?0IYNow6g8*lT7e9 zCQMz+zgvUOV9`KXFf_slwP;#ovJktGLd(U&oP0o4Ib) z!r-snm%H%8p3Bo!ZTD7biTR@2Zu*19WekOK9_q}NCtmpL6qV)spc&|Ceq8()0yw^% z#j@IkImKod^>(=P)nM-f&sEuEXX4m>gY!P%k$02wro{AhzwIaBdB{{8Qk1iA6dI)Zrdko{el!uB~G|5JnbZvz+o;7beUc@DDmtDjq;kmT1=z2Bo zbDH`h(wQxp!rfckw`UgURa;Qf^@pEb`3qbOK>9G-ro zwf{&vzcSB8)LD-cg)Aji>NqSSz)Ow9r3U`-^pn;D&*3tC=b4{pzp)Rd*F+UEj8{v~#zYw!8D(yQ{cZm*8F* zGwt)PQix8B1BqJwogG?n5s>W8(SJ}ww{#sypUVPQ0wP)3=El(;Qn1swT{5XQPHzXeYBgJN>X|xEsQIz7qTe z-FpagB`q|jRnYQR-|&zJ2m(%B|64FJjr;!+jC9H88DD~_r~-{JSf6B9spbNl=(g@Q zqf_&vq4R`8qGLBiJ$T@~*UM(QiFA#fii}%mH*x-dAo5u&T4YMl`xnS)$iho2B+_zI z*KsT6xVY+qAon;rwzLU8S8!|#-CvB{#hYvqe+HRc4I=2Vv%?M;A2Tg>R2MDjk-+(Nh;i>}GF#oO9rLG#nm59rgpeX#~sTv_UBUsw5cy*NEI z?i|;u<3vz$=ur|8CL006LFw~(;rgVZG;F8Ek`j!0^Nf`;JeW1yUef~E= zwj0QVa%iD;KdE7}&tr|NOG`l;dDKr~{1}U8TQ=FPlFG!#L25|efp^0ycZbv_}l21+!*~b}R5r0GE zu4#D9<_`BES6KX`Qk#;IkUq(fyphcw3bb2Xhv4*@rlj%jd5Y@|XYqFmize?6y-qUG ziHkSllQak+JNMIlOW*hXc_TfqK`wa_IUnln9(-w6-C82%z+(D1UvnC^^~@oR`1Bj1 
z{Iy8rpI~LQ+WsDIRus<$d)?k$ARxH#1wYh+jhtaQ)w&%a?{nLxpUU zjT3L5mDr`yWtw>LK5hSM3}|c2U?+o?YG5c#%aL}QEjR|xWxMG z%*Lhc2u9|Cat8xUHA)^0RF-~Sg-wy`p(hM*1HPEBREvsn!oa*r}*i)Q#33Vxg0GE!Ol z^e2u!1I|-C(D+KP!A7;=QFCtXwA&A}S9vn4;@M-cQ!c#WL-BV4)4zMF8PBqElJG;V zZjk3~^jccobYYOj@=JWxc#&x1e>pay~ZT$4DhDLBLXD?>d|;Zze-#j3US3Q(OB6V}}x1@1Mg zn=?1A?HQogWgkT0TJ0VEOORIgX1<%Q|JZk^(LH=kwYNas8Y#FEC3vca4jhYU-n0Lv z*%Q}@5d+-Xqf*-z5*_RerBBB1C1j1I{U+ru+0K5halww)UHn)Ty#9LxlkQ&;%rdR* zYZ|0`obuT(VPdI$f6%e;uUk>(0n3_12Hjc-ic| z$UeNr1&QktL%>S%Qkd{(Ik9iJ18~%5ki6ShMMc*VyU>Q@q?QRB$(C8h=iUrzMYE?= zvz#hQ@bL0EJ&G^>{6g(F1T)Y1anty~_&UFM(t}ln$`t$Lm%jaCesD3Qjr>5|1pTE1 z5h5?3#va!^x9I+;c@k=YnrDU>Q$jrCUR-g$b5smWHo%+udE%ygoW9 zKXIwIB;4LS^K9RSrlu4Y>$VB}Tnu)}q8Qr-4dKJLKsH>2FBj%q?ix5=LO2i%!<}|| zqUPHom-}SGzTmCqiYtTijd~Pad!Gs~X*|owNl0NNtGsbLefND{!4oxn#;j4EloPnJ z;Mtu5m}ebm7i;!Dl<{T%L!nIZz6O%9;^e@1e5UJ*IV`B(d*n{~id28A6Z%k-(oEH; zyD=cLXtcX{Z6zPs3^I-x0T9Jc>*f@#SS_eBoeM|7LOGYX(_Nt6Ql zpNb@?6T0SGo<%1-8Y67KxGsE-j{sbn7JUM4+{%=`XB$JAmq-tJLovF(7X8sT@zK}i zfbbYb$2&EZQUR6w+4cmj^?&HkOyMXu-a&(MSbeifO4%=1G#5D>u{d76GD`8M^q7VB zxAd5XpyX3>cUNAw_!+lwOq!Sg6m;!yGoS8pwW(n+Gc0lt(lJOV|4himRX*nhX^sU# zgxT7Pq$5zT@i(F2)Ch1Z!|j8@WaF_kwIe3Y1K--UrkVL>*Hk39=dXSfzwj`%b0j6=A#8l5SP0ytPiy_5P%o$wk z;0EhNtTiMxMAYw0i4xH#EN*TX;Mg2M_~*^x%)31qer#duES(c621Q{ zf&qE|DM9x6@%COwlt?R6H`oDv!u14^7sH?1p7Ln7-(^W&ZvXtg$G7^Qzig!}kfiwP zZ&&*K43*=>Oz|^nXB&Z%SFfOwA$5~CC5XEbbNO?pjqX;K1hxHGiQ70%7yNuYmWNfy zVxH$q&DZ0PL`3wd*@@2OGI=1T?F4#(^ZWL~0TW_Z593rK6aP678PtT|b|Pda2&zla z58`dV(q`N|sYc1dc} zlyr1s!Kc7t^X{$|Mahu|^uHA_PkLWRL(Aqa4M~r6HQO(42T=9%8w$I9R3@B{1StJ4 z(q(!1TPBA~VU7Fmq|&KeU0&R%>lJp0Y?)-xxaE4oYSDh|G0OY~XlEdaL(qEoTtjz@ zhmnp>@ulV=7DsT}rAvzy0d(a_zx5~?CDx<)N%w2+`psV99j&^f15FWIMqu)nHcqdg za;IYiiuIOGUD-N(Ay9uYL+9a^y3R+zPdEIPc<8q689ins>l)~1L6Q24K;<@fHcWKPWF|d7^hy|2*~DGlE~>fq9mR$P zDuw$&U7gmWo-D5>@c8L*MBQ`gNe8!$6>hZuuRAJN=-ItTBa|{CrOiY zqX?q)Y})ATs{Lm=x`oe4zT)ZY`@2~}xv3Qy>ulExEOdfxqX}CcTGm#tb;MUR-sP?c zUPfrkZP_`CwEA{42?r?(RMOVY_ly6k=!TVj(wNF0s7p2caV0A_*I*S2{K>vVn!4b}+$(m=U8n6x+Ha_twM;}!Qr`xJ15$L&ZoC<>zoTAZRI9YoLAk(wr%eg=9uzCF z{MJo7w22_a6JH=O$v=Ga=l2Ug3pRP^7DVHgw*RU4tW$L~m22OHr-Sf|ft^`l^N$RA4v+xw#3%jemzGa0^DK;G zz3mm_zYd0+{%{QVB(djLY#An`>`27t!U;6Rs9-ce0OS1bqW1GsiL}vo1 zl;FN`Y$<@8Em5dP5yvX##rYCT`OrC>`FY^jt|7As1SIFf-S`Bld?c0oU=oSiV z(WMqmgSFU%Fi(ews1>9v5u&bO;nEf1BvL_>C?Zt`@>hq2B~_G;2JFB&-|^$&J;DW_ z$?RHANt?~{k=II{HHw73JuIA#76@7C1(N&Can)CD>#m%;YmzCL*2em!JWDPCpieEU zQW)k52|P`S6c;~hZdNvtP)YlAp}5jBkYfe1dyGu5#y1ze57$V(uJ3XSeb8S<(Db$+ zDKd?zJ;l>^!`LohqA0;FDRi+Zfz;G`{ki3O{QN}LlQuQ998JWdS&iXW23EIvvwihhE<=q)zh_v!eYg{*9Xc3@akwOp_Z*Mr)4K@G#-|RKaP4%C@#wN2u zd1j%aM|@>+gX}ki-^;4CbHfxLa*Q5)IXLk8{Y+AAT5jc1)rinov!mh9k3zKP{~w3N zKi}h@Lgdd9)5OB(MUHs|FSBWWAb&;?DUlJH1(KEe#Y2S-xRXfBKLDp@ZU_q(p;+AH zn;Cx0art|4iMXP4@47YfhL-;!MgYG#D08&Cdt{69g$4v_q@ByY6fDi&9z@i_ya9X> z=+8AM0q6O~RXjc!3AWXWlGWE0H|!4d6FtkOOJY5l6C{$s3eh#wI9SuZ?7{OkGzPcT zhzS&P&9d}#bgw`aFYy~v`^oaddjTu~C*W8!X{>z`b!eaVaAzR^IBWDaCahoiY9l5J z7TA@ZhPsB_Op-%FM)y-J0d{*N%h0BFn)^HLtdjvVe}QkG7jw36yrk{ouG~OuO5UB2MJoITdm+nOS5jK-Y~*PH=ZL;{0acygz}}8g9sr zopZZO4~23LH!|2H3loCltuCU5yEep|vQW1>jm9%TAP0=<_VMpPc+62RQ2hph0y@5M zDx3;MoH7WMC)mn*0R6??CTE2KgY|c;t@55uIdTw;;xbLiPtyqgPhyRYDiGFwrWq{Kvltu`dbJRj*~r>4Qzw zcpeDQcS{~Vt4f?TQ}{L=-9ckRx5X+dw7Pp>7c#c*VFfFTMY1E-$#;9Dci2|ISyaGz z`r9MN`on!Lq^0_%kC#aodHkMoo!_VNrNy;O;8J2}yE)RZnYl)6ZQJwuxzkgv=}XoR zOEGuKno8IZ0xA98tP}d>de2};cah$xP$vQW>-?3hOj(0Wp zt^P~4!}OCzsw#r?;MuSirBca6q<#nY1?Gum#GAO{pmV6u;;?`X@t76d3f39Td5ftVBQxb9WK+^Iu4i4- zjsocz*Q{9Fn9Vo*tGo3dN-?(J?Ps1nF-#wA;RrM7dpqbNSV^E&=5 zcCSFX-;j`mZp 
zCm6wX!Q5enW^Pdys32Hn`%v8W(4BpbrL@xR2O6-r8?ZZC7T~;S26+cJ$k$rP9ZG#? zT}x>doe~aM-)NShG6XahlVMMVZP>p)e3(|?xRcf%QDAPPUB|1v)c1^Jv@2^hwa`*E zVg;I`kv3cl(S&ONZFBP|*IAj2a3XTO!}jnUZpT4WM`;wpt5XR#w@NF`IAc`&8$W^C{LwC8jN>{NAyuGD?9!MQ+X! z0fo4QpKB{G4i6|M_RyAVF66Aw>z$ITVGCoy>uY4xHxtKmP|wW5iw_4L4ejAGqu_8U zPe_Pe+e+F5&PPFBRrr2@8iGPbR&+WAqk>YjVv%HRDt~7c)SkvXP?^_XX&&Y8`gQ)@dnCG^Tc*!K_e!x z(Iv|=TN-JvU(y;Axo(xz6Y+LG8otXq-KPq=ZO9F0uwN9ld4D~SKp~3sfE~=n0m4db zic_Yd8zcuDqCNZl#dUd|^GbTI^AukptDf>@oAK5C0!3cVR}^aBS;_?N4?;FfQ^wzF z4Xsn=Eb|lN?9@CB%(FGDC;G;dKS9?gF6M<#K*3o}^+n0T%9@fD=yE0VS;`SqzQkx_ zm6GU3^>rwo(N)fk*B=KM6oaIATwOkf<7(FXBGQ8{>J9-Ox=x)zPX#2-vZ0{GN)mAn z;LkN?^`LTgr|R6D5FedWZO#!nU+1L%$|J|Z6EZh7e+Epyqxpdi0pfof9I93Fv(q} z<-4oIWDI3(8wo07R*Ktio~w4`n&^N~lUCPkT8^Lni|xMZ4#dSu`nG1so5QmKCDS?O z(ZCH_P3xdgMgx}&`mRj9_e4=1I9mq6_W7H=nS5YTlDjWX`=zi>&qw}`UPzf-*1#~? zP|`paB@GL}>-e_%%ft??%ggCvw<+q^lZaC5<-Vt|N@iKK#aa|A!e><5{v-Tc9OwP= zU^Tb?s8O{QM}tV!ZR)(o!z%T)7^afEowX@t1cq~acR)C&&w zE=~roqIlWfKaQ{6eSpk7xj4{N7$N*(oWV44LCitmKCh?Sg*taPM1$bFWSvCdDi-W} z1Zb(HN&^jNjK6(DazC<5XK(|r zCH!SzbYddCcga5Oi>E@_VCH!2xtsX-=>8X-Xsfa!`Qp~@>Nfd85F1%D!Yudj)$P_4t=CTBR6zcKFP=XJB}B41y_JwC)q%5fNfDd%f~B5sn zn!zh;-LCJ6EZ;@hfXM57Zr@-ls8RXS*qS@nKp~|%)=(Frh|etjU?deZA8FY7`AgG(lqh9$?G=raA+oK>>CWf(-v5w)b&i? z!|?J#Dhmv!w-{~fz+pDgTTKD4S_UU2=Z6^8 zz;RUc6IA-!!FwNrLJV^d^U<{+%!?Owx@De)yhph^)-KGLGQi_@Y@!lrueaf*uJtk^R!dAzBa9xID;R8sTOw@aU>mtv?z<+ z0r<_38*8`I15Hj)d|wz5yWJWVKx1cxK|7A(^j7B|iY>vS|2K*)uJL6u zW`XVjQTL~O=y1YYQP-L956=@)nm4lsgdrw7g=9QUFwgCT>ZC9NJ+|cyCpN{Ua_AS% zH{r}K{ZK^Noo<(HH?K+!G3onLQ=Q@XOH)0h0CX_&^Ll3<)&&?;u3_Y#k52?Z&LoR5 zda_T&#KSb|j9vpK=@sS!u^8OW5RFBV%{JwTfYSQ)Zq(p91OJp~E4agQ?YuzIGf~63 zch?=hM1X1=1e}g*pL=N!3{}DMq8KJFS@KLw>w`Jdgu{}Li%Cn{!~5lrAp zr`lg@a%>PVpzMvc7$=><&0bVB&0ot6%DAUCt6|uW$SqjV@wv@_;4$&Vti8aLRJTRH-zOE?S_n!W&05M1uFuiDu|m7kE}G}B^(@pUo)Y>XK61UA?9_*n#(3oN z@MAd)yFh~J7PF*j;8z$OdMPdMAZ`==y(CNHNk+=ZP$pgN^S%~3-mLy(&sOo|p7X(n ztARGj(KC8iD{z7DdUtG9w7|lmdG#};eXie#g0XQGcH{QyY4bgcX6Ha+mC1diUH;?Y z8)SL5=5th|?%R|5*N>Wc#5BKVw<})K4_gs>wSX3Yn%CrBUD-4j4jI`+k7drE`@i3_ zr3U}6JX<=h^`=#jP&jD5@;JgP5}A$eOL%NNKG$JRzd+@n^uiL}7yAUmw6R_a6!c&N@n_^Y-4ip@l7(S&x0f36LKMSd($SzRf1MmEqJU zYZh%Dono(=vV&QyIZf!a0ND0c!PNzx_$E)K!u5ZE^k5%V|DjLu3)0hO+NyzZo-^Yu+5_dZ7p*y33$3FZ>Y>JVW5&WCLNQhb&hFn+YMrys-7~TO z_I!C6OzSprKV8=?Azlp(H3^=nLH$7!VD3p*7&!>qG}15(=0 z9NxG;nAtyNztKvyM|F$RHAY8D2=Gxa1ZlE}F59PnZWYo~VAGTBw9V6tlvfh%zsKLP z<4an{s%Rr?f)nq6tmR%Gw~6wwt+={_9@UJ(TAV*4;Pp*lx+8csl@~B0sT2IZ?K|=Y zHdl4z8InFbh(x3L`i~V!{u$h7+-%)G09M8BeO1yYRJ0%TpK3st?tLr`nZ(d0w82L! z-9?tmXW!ds4Ou~KZLSUt6uZ6nh@de9;8pq+(^#sWTbPpvN}8at!uAcPSc=|{-ls#- zu8+nlh{n(Dyfn~7%gMWX4dtU^lAb4quFkS*(fUER;c)idj&K;QNYR%Q3dpWhHD$mF zeKW+{Hsl*;NGY@Ea8}xc^ctMyr)g`FdVNXspG{l$|Nm&(df)wj%Cx0|Ip4{P7IQ}o z&dcOsy@h-=>Z%GkOR>e$i%#!AVts4sr=Q=xcEt;=ykQq~4PYCb@2!SE zx>?t6^xZBi10-8gOY}~J3X7JC+0s-L;f4j)$&k%gU8`Kl|7NGriGt8`v0XDi zI~n<5f9+&En(W9`RzgOcO?y$SD*@_NZR}do732FB@KLtdom*7?VJ3|D8mi3HPhfie zex(0t^FNb(T})fs8kGulYqnNHR=cl-(}5L--{?J{w)Ezt`lEbtiV3A-+t$IY45NY3 zMbDQBAtG3N{+nOus;}gf&sgbJpywTx#MO5e+$^96UZ3mNa-6sfegpXGME?r#nI7p+ z81Ui(fnya?lC0+1;lM-vE4_;lAsfm-@ZmEWT=2pMOrB6C(6dTe`CDrEg1`nGic8{bw4CDI~R ziZl@E@BEI7rS8)?R{XnF9tRBhTm$dI0;8TyMRZPXiv64F7kaTyLKY^3n0gn)jX|_+ zn~1Wa|CY0v$MjbS#2K7kk&LcyJDJYC=Kij)1m4e56(t zw73zC%^rM`-T!t7Gw4Ui$I`+j*Wo8uS&2FmkVaPi0#CtvD35JyUAdNZszh08I5;bOqkT8&$=+WJ7^_~~ z@~gdkde3$%&d(j>BR++Q!~lltINB$J&XM)*$I@)oEIR+D#Br?*hNYFT=2gTGWwxr! 
zc#2quAyZ=`&@nwSTZ%*OWE&%H9=9(#)vF#s{;7RIuYxhxstVqU^a?uFa2r8QP=X1C zbdYJ5{;7Onqq1_gi?fKFW%fdZ?Wr`Eu6naE@n^&4IELNWrLPG&b&nEuIChM_8AOqt zrFx9d*VF5`5VL=G%z`9x7wg}dI8|vpP@C;x|LTj4losdwm-n_97bDMZgxV~ZXfj80 zX5(uh@w3wbwe8Cz?O~XN#w~V{`3%LbpFe z$n}QajfP=-A2K?HzjCuSF3w;%7Hz2>o3#xHKiV15L8EEh<-^~T@F%vnyaa4h^viVU zi0;ghj795G+<4knH%2khcXWd2v#9d8?+=l+mn|o3`ZP;+gp9$?5pL^C5-y4r z+!l$IM*$w=)|E}St|zEts_*!*C$#RFi7mg;k3=H%rV=Ws>w28+`ck?qjCC0bBbS7K z{w*RR1Cl>S4TM;WZUtG-?8237*kd)6{Oc&n_YeGYc!ogxI{rwV3f`*WrFwNJ(6s z>|b`i_N2P}hTU{FsY#;ovZHJH6WBo3tG;x_UAW>1*YQccX>^Ds_eaiK`$>%!d)KMl2#>J z#zH%+&CB)H!>9Q4sL$kNlVKS2@1uLJZ@m0V1L88eOi2R9|IOGEKh6uAo-6)t(>=I!GWbcMf;6)!T9+mkZ(4k4V+kcj1n9 zdpXG3WzL<=U)W&#S(-F35YiHj{S`l1s}c;W&T1pxOOu_y@+*k?>=BSm?=CImYqb4r zIsw947`h^@#HOzC{MWNJ5nQ8h`s-;SYOg$u{+(-0JqyHSzdh(F|I@d>e|ByNIoaKt z^5Z=Tq66$!v>*XQ?q%XaI;Mta8lkA(3eO`6W^_F|170cMqjl{UWGC%FWlhG zmkV&_Y;ezvkyO>#U}LUl&r>c_2O<1dFK`hH9MxZ^fYah>&a-v&AP~r zht9gYzWlQJ`(&F?TC!OsL=cos zcn3xAnYvAyf*=do^Y&T@mV-GLq-ghv2&lSK$tyh79D7FcB(x4!Gt0$H5l3;FJ}qeV zksfuVfxtT-yC3k8O2me&zb<*X_K*^t{5@EOfd2l;DoqO{Zvbwmlv7xvu$ecVijrP>sC%FloT-Fvn*LN5LCm(H3QIjw4z=< z5)v2h3Tws&(A`~gJW+Cs06<5aCS5;Rqmekh;1#7$PMSS;D(ZgM$uAYMG{D-(&u%h3 zX1I)?c-+FZ!=~f1KVtikZb*LHgF_6OAeBAX7^vidJ5L*mV~l2X3R>7O+a*(SSEk2F zv;?+f%KV*IURuIHV_0cypj)N8Fs08|ZXw*TwVzM#Zb0yXS&d5+*{*@C8_;|RV}Qo> z2IBZ2obP``UK6jce%)`mk<|z2A8yF%(5C#D<V>5$-9z%7n+qr&Ed7{VS(e9Mt%0rxHuhMgd&J1|$?uXXq*(7iAx3#Q|9}{c9V12_P^X|p7BNqJ0B>BF(ltUa zgJIz&NMm1CSXBNgPX=$W19kX5Nw0mpsLP9l!iW(mVbNuz-I?aU07jR_%|= zZzav6jUF@T4JmPcp!|a`8dw;!*wb_eo0&WIcp*02){RADCN!`X;n;*1HXSJJ;^p9k zH4-{krM01gHHYi~bxh!VLDoY5+?&oYk8uKiTmc6@$$tDga>arct%2WW4?>54gM&^( zhGsixBt0R!+35X&c-|8+iR44UFbktQ(Q{OKdi%>w2jL@>18?%WUTz(UfUD#7^4@iq zDjv8|8`OR3FCVlLX^=j}1KlYW8;A2cxcJ0yD|#mu%(NXsbi#5ZtIQrG36N1H4>BKh zq}`DOf-kiy{u1Bcg0EBtS23%~u_<_(o@smDf2$I^O1C z&qO&X?bBkgY^lhWrlzWZvx1@uP(PD6LbkP%(ha?h40pf>K55OadjPq2bX672%KkI* zVjfvIwZt*k-VS&{pV0b4s;R#-$!^U)S@b>i<;$^IKApqMK0+IGg>YmmBeFC-Y%P{^aeyq zibQlmxhbzSPU-222fRZBF5lVdF)unch9P|3yR7(3tS6u!>y?yXd{aFDTiENw3#%)y ziWOtb_#+bE#J5)~HrFdECqV6Whc0Mm`W)w3f6`LT*gKV~b>6Nji-)yp4gI2r0A;+L zAU0u!65X7?9fxkJ2adR#)pD7}*5T^bt+Cu2wU@&sr;qU=P}gGvV$l03X98$!XtFxl zP@8S{HTF>DIK^_#D}=T8J2PKzJ(k6bn1(RF3&dX|vB_GCbC6+a2U5*JoKWypuPckZ@!T(%FCCOUDU2XWk+Nco!n~h3OfBheARMG7J%0^|EaqnN- zs47bC?doIA!`Emat)+xgHIQ4IkJ|U2(9oQI$QCi@)7I#F4or{V_zKjCVsbwZ`t0Ru zmhVL5$@dNx^SQ28Db26hIpF0JJo}`&Yac3DVLN2T2tY>DS=V0BaBnR)s-1xLv$&kJ z+O=q%RCk~o=(bC8h&IV9pL+e_%Qx>2T15e^azaz}BT>K)aJBL{R1<(}2Cj3DhIDyo z9|$p3EsIf?byMqSOY|hF)D(fc=yn6rKad{{_i;lA(Kd07Y$j@C=dWKakY>JuJr1$< z{W2%4?5na-Y+$Y0tV`mPcD7A!+U5frS2ZBrTKuEgHh)VJ!3%U?HlMupF%}CIXaa}x zmL9_=Zs);-mF24t)3^@@UJ%`N^nJnA!`VKGgcG28a3d)I#ynS?Ci=b=+czIpYBRT= z`ym~Mf!XtRY&tg-1Dg28KE%X?(mb}qt})qMKgSW6Vd{6S>fPhz?0a=Zqqv|To@P|d z2ff$1TZHKtoAr9*+ZH7FlOgJF6DV1+ui7p3rS0uvz~G&hZ{AP9c+{7))^!HcKW{6q zTQ5%=#C=8SG9;GQv7E%-DR&#Uq5rbOv5N!EJsx%z=ZPYrTHYv}PPra?7f7rscD3oR zF;Wb)oF1i5P}>WCZJw6>Y>)G=TW&Jo2qjUvC$4vN4BS?+S49;ze_%xGbANVE0g2VW zaZe!&bP!g)LbpcJUl>3O+kPy$T^0TKj6t=_u1|HhcT$ggIsagPHPdx@$?{R%;wVsSLY_K_k&>AZ6Fz+o?**IMAKJUEIlEGt81ui3) zA$OoG@#{HYZ%w51+@K~ng|SRwm~SnXeHVaXXNI>=h|dqbfNxQzasg*-=aHM{fZknS zmZ_I!Jb3J0%zUf3Q|xeIjt=W%R#nG{xQ9Sq^*rEwX{YDob;$hSW|c?GNGOteMd%}7 zx9TYJ8Ju{&8)UejO6_uXHg)wYZ6(9dY~qU=L#Ic9B{1XBf(Hh9{+)4a#l)Til_D61 zz*>}l77xFrT3CDQ;45nyt4<&*Exz}YF`T0bGGSodeS6o}G#a5V) ziUZ!{+TQ3zv9BoQX+aQL-5OK}WkYrolvSv{9)kyvSe;?g_GT+`IgD!8u!W}0(L4mS zA|#Kd~C&2NcRQUIH7*7kFCTep+%8~{Fe`QLo-1fxHE@U?65M7(M{9Ek$^@%W45 z^3S54&RkLmGhi?$8Sfsf5 zFOBewp$c7NKa}vmrc0Au3c-1Yc@*1A>{owxe4cGNL_Pry%2*Kme`A@;$&+BeiE_q~}wDpP2Kiv;Jsojv?r 
zz{g(o@mlSx+9$QoN$}#@v4GcI-x^nOBP3_NVVH9^l+W zY{UZ{=a`-I^<{vaivF@RjLidcZ+&`m&)bn%L%(SBs;m4F36L2*{uBJe6gB1=)dbK# z2B}!}J48Xa`!($r46llruMrUi{n-lj^X>Xu`y_Tz-+zmiP{e!e=6|k_u%Y^YncImb zF8qs|@0g=G;(>l~=}97YAVU??xsqH?T7SBQ*-9uwoV6NX+d-iH>4JXIyTZF#>8vN+ zcuWJe&v9Vg&UmA40+r#RE_g>(Es!^`4%6IXw573_etpG85#sy+weS0YbXzo=gp1U0V|yDff>Cg4kps|AOuKbyr7XW^k3(;UM@aV7{=QoW`MdR-t9H>Nz0IJ z9T{k_8LE$?PEXz1AmEFzYozZunb3CfoScVSrPw7Ca`cf=>m$PBZw&a7)5Czqxwz}z z2gv8Wy)M9R+n>`P>hpyDly8^!kX2PG_NN&aScnRY_^}tg-GA$G^EHylY?FlC()8K) z^R(CU?zL9HS3LVS$DI=mX^+FGBeu2E1wc14xpDPmPEglF1Fd(j1aDl1Y%`I%)A1fE z6w35{lK|zzkM~@k7R>29EELGmkwNU8xK|nmUXzCaHWlHd?Oh&A+rhK5vunn1i^Fk*9!Jf* z-P|O9S6E52F6@vz8Y1vK9#V#)m}KpAL?m9mr!U2GEa^PKTeG(-+HB?aOYx+!k5XNZ zDw|cvsd1Uj6!%caTypRitE$3=VsxgmG=fFuUVmpkaayJ z_pJ%pKf%(nTbJ)@ZWX7tyLCw-9{EU}fqYIhS7U7|lDM=wA|s_s+o$^A8iCicGwNV1 z1K%Y`4h-f3bM5?C}RJnYUxzy_cVmnE&?ZRe}C^=dD zM}xhTWq}gEDqTU9z|~~2_V1f}6qKdkMW9XiSg5w?xlz<>K@R;p48!Mnl2FuW_b0(YxTYJKW2^oMoe`a`k%N*a?s1*Ph2;bvi_}dPQjmN@v-UgvuDgM zl&hq5yXfu$^W6*m$34gw(q%tK5EkQ&L;uSWgq4lT?rr}M{coGobrW=eku$V*mV7KQ zpKDrfHs-?-O_*-jH+2n{REHW2kRBsVFEQq2b)oNU^6&n(Ns2o4W0TbLJesiGyML^) zVkm)XHH$>74FrRK2F3KPul@*%$#MU81;zi;U#0qgz+Y9im!5Wl_(nj3G1c>9AgPRrGx44CH zCQKY6F3LB#?y8>&m8yEw%>f6&MfAo=TYs(~Xf4hijBL{gzAldp!aoiS_X4-<+*DL# z8UBX^d5Dk+IhJ(2EuSG95SXcuTb>IQ`eCl3ZoW@yM4Q?2LOCgZsKR7D8rQD44P)Zp znuXbV9K8nzDe`4FTwe;o)Eka7!lPFp3uja)K^Yl;8zR5#j`?QKv{ z0l0OK%18rl`5j`q!^ooE=*71kqtA+4*RH#adUqG>Ctq|btjt?gL36 z9IFvn1K!F>8f6Nzp(2i!q; zk85g4LFwI)r3$V4YEmRQh*K0xT7Ez2%{QI|)lhvqQoMY)YY4x6AA{el*<%3l)8+Sg zSdw{zLDf`=28-9)+BIA*=a{u%xn$F7uoEyeo6*rb&amu=9p)YB+S|RLU%JD>v*ypk z&;t3;-_pNb`{8-rr|Ol%8pFr@o0}_2`GOWStmlswLmvQN$wpAC9AmBWi>3;ONm8!} z;yg*OmGl9F`)ydSV(5V_s>fOU<}S*XNsWT$7WI>|z+8ZDm}k9E^88P`2XK=}_kH09 zMeFw&3FkVcBRroR5_sgsVR}uFUIt>!9tB5B?+vxaZ)fz+I+dHgZu*vtxJj_j|EBp7xD#{|83v(AbD#Ux(n)!Z?APn^|?@MZE z)Vr2=-lqRz=9N=Mi34{pgoHAD-Xi{LoXc{&fEZTnE^Z`0CY+IgY1{ z#y4E7>oU6+vr6u0+{?lesW+x;ZgmSZT!Sp_tD~pBlGcCtLs2EG>_}+iTJ& zN)2H_wXMrZJI&>eKU5s~DZ$BfuF=m^{+KQ$ZvhSj&eVEE9(O9I{ia_u1X^_A?p)CL zwOzWz?l^kU6h>Z>X3~x@Dm9-AMgrW5uC7^r~jJZy4p$ zgRjv1d5f?cm>O@ok*D9BAl4$JK8-qs-$K5cb*{d*PCgvC`~6(Sof5=ONl7ftP1Ze9 zI$lzv;q!W{7jfQ%)FuaunsO0nF;r^-j|2uySz`M%jpG~&#k(A8>99WP4vra0E43xO znH*q(<%n=^)=cU1dH4A7F|`M<**ik} z;k|U-4L?hg=LO%5WNYm4m-;4LPH`XLTJxO1y~#WMSY&j0U((8o_u!3d)VRU8<_Pjq zIAo~L$+l~6M*YH`FDL`O|HZja)>3Hz>M616t3JH&Is5m)(IkpoVCEoit>+=-)xI0H zM?_eYh??D%88!{=zOK^Ocv#uuR^7V^Y3Wh{xi#?`wJPI~a&*%Gz?*HybLXhJ)>Cn% z-r-M`qa9i^k?sC#u58N(LbMA0tU_MU%5YyhIh(*_TsjzV`3mXGwqt(YNsS9LWz)*F zcZ{LwWvCyMqqgHjyu5h%Xhe}8ux{`mdTI=?68_`5!3mkmNx0VIV0r9Wpxd^zABy)X z@RUS&o@wX|V@#kBF+(7OOwN$&SH(XlH_Mx zj8}U0I6V$2*Z!Fv8!R*F1iSJ)z@8YdobbhbFjm!bHQ}Ca+x(l}V+yg;@pM6#r!YNe z`*^y5X9uLQ=SF_KGaANGiYx}g%z{{&V1rp-l1bKOJqzh_w3K7?b3soGp52=q!g&hz zbGP99`KH2t^c0fx8Nxxwh5Y=-A2W zE3)TH)7yg>*^v&1e$zVC@HN)b{rc+6{(B5nw79wLt)kOn2Ok*64a>}1Z*>kmR?5k{ zyETVepTCC@KE#bR7ka+mzEwFu-gSG+{5r>QkNoy_xt$PjPa{wXDRr)nOJ=^TZmjlx zW3>Fy8GZgkBI0wUhcCRp zRW^iDF!1Fizh2Z&sH;Sn?NL=cM1LOmph0&OL2aHEsRFf{^#?|+SZ$oc>;5S_Uid9L zwijJhpE~*MZH1oN3qj9<1O6^{+1sv$HOe(jeXaNpdem7~_{&L8MrB_s7ET~ZzizZE z*j8oxw5)}@$T5^iIZoQM#^C_v6wdL#P)=phZh4OK;)Z}t(Wy}h)p%B=T=`q#q`Qdl zFEd=FG2+5Yk`ikf2Fl@plE6nK>~&bs6ni0zOAQ+t5?erR^K(%&hbE3jMXzuqtHnD@ zL`siAz($8=%HOQOxC&MAS&;fr;*7BI#POtPWkB5@=`qyK<&TnMe)1|jxrlSyxL~33 zUMafmuj%ouXV@Xjldt#$Kr)5G9?yx+=Y06paU|V!kn?@q3rME)-~Wwd3S#h|N~Uh8 z|LQoBfUc>o32@9$bTQ#&4E%Zck~%|k*!ca*@k?K()2~|K$_Q=O*GDr--JTMC zbUWtp1-6#}^Wr~Dih2qkD57=MEz@zg8E5^pBT$(J?0{%0=l@I5RKPIb!R$U<3)*Us z(Y!&nMG{b-w-5(28AP0oAuw8{|7b>DA2TA{^C6@{%)sXy$R*XDtIe0#Hk^B 
z;d;Fr;~%X?4ujLll%NrDXxI>%bu5}n7rgP=K%5}iAc84l9;KhS!I9He=SGBkmezs* zuOBL-|Hd@+$7m?mAETkuAAt`s`{z2N1%>4U+cy3QxGHRo#LLX{gPsvgo$DM zem743@A775veH(JlAep#|5#|j_qw!pLs?13<3~~x+~=RGjMmwO4@q%g?C{;6Lqi2q=*ROGGV1+*41FG} zHl+4Z94@}ta9G&`Iw6mlrtQv=E`5_IMB^M_5&Yh}%9bZG<|&k6Etb~1=%=r*DP^^- zI5}st+cNl0znMlTiLZB8yE#zQfmNP#!r{?(jE%wQ5T%80Nkzb9sENcWSIA2e$O3RK zaQtWub3+G95G>`sIX_l#%sE~!tJ5c`yi%vmb-Ee|}_jas6Bg#&bQ-=pn z_FuT&|M~Cil;NzYE)5+cby5XMI zW3j&xjb6o}R|BO3>a?_Vqos*^m1~3COKS?KQ_}c+1Lqw~ zyBuJS+6h=I@{y{qm>ZPaD%;s* zAf1yYakV1>=8$MTYUc`Tt^C;Lr~Q!T>T2r?t!aX%R|5b50000`!v9?79I?0@s7-nY zTAQv;%EDY-9SV$BB~_3Y_+*_Q<2?7N6aKwwlj&-{q3%%1)dw`r@AE5f%5c3Y4Ge+v zb3b~@>*lS6tBt2_wCy^nqr>XtC}%O#Z9md#9nxUQy3I6%(}`XiL^a3Nnhre)h*J4+ z1zb=1G1gc6 ztUp#D9rkOQCGP-@bjI0SBKPa&Ugslv@uou>oI9ACyQVp}DN}M)xzdKD(D>Cg+1K01 z>vz(slB0fsIdFm`7w^iy3wycI>|d;ueC7%000000DdsJ z_p}0q3eviT!Fero(0cd#LjeE)00000;2U!Dds@FE?o*HGu)aQE1poj500000a0NvF Y2R3ZON_=qqu>b%707*qoM6N<$g4o2R#sB~S literal 0 HcmV?d00001 diff --git a/inference/huggingface/zero_inference/run_model.py b/inference/huggingface/zero_inference/run_model.py index fea8e0be1..230d601cb 100644 --- a/inference/huggingface/zero_inference/run_model.py +++ b/inference/huggingface/zero_inference/run_model.py @@ -87,7 +87,7 @@ def get_ds_model( }, "zero_optimization": { "stage": 3, - "stage3_prefetch_bucket_size": 2 * hidden_size * hidden_size, # 0, + "stage3_prefetch_bucket_size": 2 * hidden_size * hidden_size, "stage3_param_persistence_threshold": hidden_size, "stage3_max_live_parameters": 2 * hidden_size * hidden_size, }, @@ -105,17 +105,29 @@ def get_ds_model( ) if disk_offload: + if config.model_type == 'bloom': + buffer_count = 3 if args.use_gds else 5 + buffer_size = 8*GB if args.use_gds else 9*GB + + elif config.model_type == 'mixtral': + buffer_count = 10 + buffer_size = 1*GB + else: + buffer_count = 5 + buffer_size = 2*GB + ds_config["zero_optimization"]["offload_param"] = dict( device="nvme", pin_memory=pin_memory, nvme_path=offload_dir, - buffer_count=5, - buffer_size=9 * GB if config.model_type == 'bloom' else 2 * GB, + buffer_count=buffer_count, + buffer_size=buffer_size, ) ds_config["aio"] = { - "block_size": 1048576, - "queue_depth": 8, - "thread_count": 1, + "block_size": 1048576*16, + "queue_depth": 64, + "thread_count": 8, + "use_gds": args.use_gds, "single_submit": False, "overlap_events": True, } @@ -140,6 +152,10 @@ def get_ds_model( model = LlamaForCausalLM.from_pretrained( dummy_weights or model_name, torch_dtype=dtype, ) + elif config.model_type == "mixtral": + model = AutoModelForCausalLM.from_pretrained( + dummy_weights or model_name, torch_dtype=dtype, + ) else: raise ValueError(f"Unexpected model type: {config.model_type}") @@ -192,6 +208,8 @@ def run_generation( model = BloomForCausalLM(config) elif config.model_type == "llama": model = LlamaForCausalLM(config) + elif config.model_type == "mixtral": + model = AutoModelForCausalLM(config) else: raise ValueError(f"Unexpected model type: {config.model_type}") model.save_pretrained( @@ -354,6 +372,7 @@ def remove_model_hooks(module): parser.add_argument("--quant_group_size", type=int, default=64, help="model weight quantization group size") parser.add_argument("--pin_kv_cache", action="store_true", help="Allocate kv cache in pinned memory for offloading.") parser.add_argument("--async_kv_offload", action="store_true", help="Using non_blocking copy for kv cache offloading.") + parser.add_argument("--use_gds", action="store_true", help="Use NVIDIA GPU DirectStorage to transfer between NVMe and GPU.") args = parser.parse_args() 
deepspeed.init_distributed() From f73a6ed635659f03ac583a1d914ea07a2cbeab99 Mon Sep 17 00:00:00 2001 From: Jiahao Li Date: Tue, 17 Sep 2024 21:29:11 +0800 Subject: [PATCH 53/58] Enable overlap_comm for better performance (#846) Co-authored-by: Olatunji Ruwase --- applications/DeepSpeed-Chat/dschat/utils/ds_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/applications/DeepSpeed-Chat/dschat/utils/ds_utils.py b/applications/DeepSpeed-Chat/dschat/utils/ds_utils.py index 9c15e5143..0cf1c28ab 100644 --- a/applications/DeepSpeed-Chat/dschat/utils/ds_utils.py +++ b/applications/DeepSpeed-Chat/dschat/utils/ds_utils.py @@ -33,6 +33,7 @@ def get_train_ds_config(offload, dtype_config = {"enabled": True} zero_opt_dict = { "stage": stage, + "overlap_comm": True, "offload_param": { "device": device }, From 130fb58608f96472120139c1e723bb737d8f0aa5 Mon Sep 17 00:00:00 2001 From: Jagadish Krishnamoorthy Date: Tue, 29 Oct 2024 06:47:43 -0700 Subject: [PATCH 54/58] [cifar ds training]: Set cuda device during initialization of distributed backend. (#931) * Set cuda device during initialization of distributed backend. The commit is needed to avoid GPU 0 being set as the compute stream via torch.cuda.current_stream() during initialization across all GPUs. Signed-off-by: Jagadish Krishnamoorthy * Use device-agnostic accelerator API. Signed-off-by: Jagadish Krishnamoorthy --------- Signed-off-by: Jagadish Krishnamoorthy --- training/cifar/cifar10_deepspeed.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/training/cifar/cifar10_deepspeed.py b/training/cifar/cifar10_deepspeed.py index 521a75cdf..9888544d5 100755 --- a/training/cifar/cifar10_deepspeed.py +++ b/training/cifar/cifar10_deepspeed.py @@ -1,4 +1,5 @@ import argparse +import os import deepspeed import torch @@ -279,6 +280,8 @@ def test(model_engine, testset, local_device, target_dtype, test_batch_size=4): def main(args): # Initialize DeepSpeed distributed backend. deepspeed.init_distributed() + _local_rank = int(os.environ.get("LOCAL_RANK")) + get_accelerator().set_device(_local_rank) ######################################################################## # Step1. Data Preparation. 
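A compact sketch, outside the patches themselves, of the initialization order the cifar change establishes, assuming the deepspeed launcher has exported LOCAL_RANK:

import os
import torch
import deepspeed
from deepspeed.accelerator import get_accelerator

deepspeed.init_distributed()
local_rank = int(os.environ.get("LOCAL_RANK", "0"))
get_accelerator().set_device(local_rank)   # bind this rank to its own device up front
device = torch.device(get_accelerator().device_name(), local_rank)

Binding the device before any engine construction keeps calls such as torch.cuda.current_stream() from resolving to GPU 0 on every rank, which is the failure mode the commit message describes.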
From 5a61193519cb7066138aa79f12eb0c72291390b2 Mon Sep 17 00:00:00 2001 From: SCheekati <88806457+SCheekati@users.noreply.github.com> Date: Tue, 29 Oct 2024 18:46:55 -0400 Subject: [PATCH 55/58] Fixed mistake in readme (#933) Co-authored-by: Olatunji Ruwase --- inference/huggingface/zero_inference/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/huggingface/zero_inference/README.md b/inference/huggingface/zero_inference/README.md index f6dd4850e..acca9404e 100644 --- a/inference/huggingface/zero_inference/README.md +++ b/inference/huggingface/zero_inference/README.md @@ -90,7 +90,7 @@ deepspeed --num_gpus 1 run_model.py --model bigscience/bloom-7b1 --batch-size 8 Here is an example of running `meta-llama/Llama-2-7b-hf` with Zero-Inference using 4-bit model weights and offloading kv cache to CPU: ```sh -deepspeed --num_gpus 1 run_model.py --model meta-llama/Llama-2-7b-hf` --batch-size 8 --prompt-len 512 --gen-len 32 --cpu-offload --quant-bits 4 --kv-offload +deepspeed --num_gpus 1 run_model.py --model meta-llama/Llama-2-7b-hf --batch-size 8 --prompt-len 512 --gen-len 32 --cpu-offload --quant-bits 4 --kv-offload ``` ## Performance Tuning Tips From cab3361abee120620384d8f8c3cb1d52631f0f2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=84=8D=F0=9D=95=A0=F0=9D=95=9D=F0=9D=95=9D=F0=9D=95=A0?= =?UTF-8?q?=F0=9D=95=A8=20=F0=9D=95=84=F0=9D=95=92=F0=9D=95=9F?= Date: Wed, 30 Oct 2024 00:52:28 +0200 Subject: [PATCH 56/58] Replace deprecated transformers.deepspeed module (#872) venv/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations warnings.warn( Signed-off-by: Songlin Jiang Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- applications/DeepSpeed-Chat/dschat/utils/model/model_utils.py | 2 +- applications/DeepSpeed-VisualChat/utils/model/modeling_dsvl.py | 2 +- inference/huggingface/zero_inference/run_model.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/applications/DeepSpeed-Chat/dschat/utils/model/model_utils.py b/applications/DeepSpeed-Chat/dschat/utils/model/model_utils.py index 97d3bff15..050819a22 100644 --- a/applications/DeepSpeed-Chat/dschat/utils/model/model_utils.py +++ b/applications/DeepSpeed-Chat/dschat/utils/model/model_utils.py @@ -10,7 +10,7 @@ AutoModel, ) from huggingface_hub import snapshot_download -from transformers.deepspeed import HfDeepSpeedConfig +from transformers.integrations.deepspeed import HfDeepSpeedConfig from dschat.utils.model.reward_model import RewardModel from dschat.utils.utils import load_state_dict_into_model, print_rank_0 diff --git a/applications/DeepSpeed-VisualChat/utils/model/modeling_dsvl.py b/applications/DeepSpeed-VisualChat/utils/model/modeling_dsvl.py index eb9db9428..1407c1dfc 100755 --- a/applications/DeepSpeed-VisualChat/utils/model/modeling_dsvl.py +++ b/applications/DeepSpeed-VisualChat/utils/model/modeling_dsvl.py @@ -15,7 +15,7 @@ os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) import data.DST as DST # default special tokens from torch.utils.data import DataLoader -from transformers.deepspeed import HfDeepSpeedConfig +from transformers.integrations.deepspeed import HfDeepSpeedConfig import numpy as np from .vis_proj import VisProjection_vit, VisProjection_perceiver diff --git a/inference/huggingface/zero_inference/run_model.py 
b/inference/huggingface/zero_inference/run_model.py index 230d601cb..d0e16eca3 100644 --- a/inference/huggingface/zero_inference/run_model.py +++ b/inference/huggingface/zero_inference/run_model.py @@ -19,7 +19,7 @@ from transformers import (AutoConfig, AutoTokenizer, AutoModelForCausalLM, BloomForCausalLM, OPTForCausalLM, LlamaForCausalLM, ) -from transformers.deepspeed import HfDeepSpeedConfig +from transformers.integrations.deepspeed import HfDeepSpeedConfig from utils import (GB, add_model_hooks, cache_bytes, get_filename, get_quant_config, hidden_bytes, meta_to_cpu, model_bytes, write_benchmark_log) From aa4459f5cba24cd52dd8fdae936740ad94aa49a9 Mon Sep 17 00:00:00 2001 From: Konstantinos Fertakis Date: Tue, 29 Oct 2024 22:57:55 +0000 Subject: [PATCH 57/58] =?UTF-8?q?=CE=95nable=20reward=20model=20offloading?= =?UTF-8?q?=20option=20(#930)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * enable reward model offloading option * fixed code formatting * more formatting fixes * Pre-commit formatting fix --------- Co-authored-by: Olatunji Ruwase Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Logan Adams --- .../DeepSpeed-Chat/dschat/rlhf/rlhf_engine.py | 17 ++++------------- .../training/step3_rlhf_finetuning/main.py | 3 +++ 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/applications/DeepSpeed-Chat/dschat/rlhf/rlhf_engine.py b/applications/DeepSpeed-Chat/dschat/rlhf/rlhf_engine.py index 5b6778cc2..0e67efcf9 100755 --- a/applications/DeepSpeed-Chat/dschat/rlhf/rlhf_engine.py +++ b/applications/DeepSpeed-Chat/dschat/rlhf/rlhf_engine.py @@ -268,23 +268,14 @@ def _init_reward(self, critic_model_name_or_path): # If critic is ZeRO-3 then we use it for everything, otherwise assume we have enough memory zero_stage = 0 - ds_config = get_eval_ds_config(offload=self.args.offload, + ds_config = get_eval_ds_config(offload=self.args.offload_reward_model, dtype=self.args.dtype, stage=zero_stage) - ds_config[ - 'train_micro_batch_size_per_gpu'] = self.args.per_device_training_batch_size - ds_config[ - 'train_batch_size'] = self.args.per_device_training_batch_size * torch.distributed.get_world_size( - ) * self.args.gradient_accumulation_steps - - ds_eval_config = get_eval_ds_config(offload=False, - dtype=self.args.dtype, - stage=zero_stage) # We need to set train batch size and micro batch size here to pass the sanity check of DeepSpeed engine. 
- ds_eval_config[ + ds_config[ 'train_micro_batch_size_per_gpu'] = self.args.per_device_training_batch_size - ds_eval_config[ + ds_config[ 'train_batch_size'] = self.args.per_device_training_batch_size * torch.distributed.get_world_size( ) * self.args.gradient_accumulation_steps @@ -292,7 +283,7 @@ def _init_reward(self, critic_model_name_or_path): reward_model = create_critic_model( model_name_or_path=critic_model_name_or_path, tokenizer=self.tokenizer, - ds_config=ds_eval_config, + ds_config=ds_config, num_padding_at_beginning=self.args.num_padding_at_beginning, rlhf_training=True, dropout=self.args.critic_dropout, diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py index a5be5671b..1378dc4e6 100644 --- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py +++ b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py @@ -246,6 +246,9 @@ def parse_args(): '--offload_reference_model', action='store_true', help='Enable ZeRO Offload techniques for reference model') + parser.add_argument('--offload_reward_model', + action='store_true', + help='Enable ZeRO Offload techniques for reward model') parser.add_argument( '--actor_zero_stage', type=int, From eefb0ef3651f686b1217eb91ed5e7333dd388a77 Mon Sep 17 00:00:00 2001 From: Xingfu Yi <54938415+Xingfu-Yi@users.noreply.github.com> Date: Thu, 31 Oct 2024 00:13:05 +0800 Subject: [PATCH 58/58] Remove the fixed `eot_token` mechanism for SFT (#927) Not all pretrained LLMs use `<|endoftext|>` as the `eot_token`, therefore it's inappropriate to fix it. Co-authored-by: Olatunji Ruwase Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- .../training/step1_supervised_finetuning/main.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py index d9527af54..aa505a25d 100755 --- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py +++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py @@ -191,7 +191,13 @@ def parse_args(): parser.add_argument( "--add_eot_token", action='store_true', - help="Add <|endoftext|> as additional special token to tokenizer") + help="Add `eot_token` as additional special token to tokenizer") + parser.add_argument( + "--eot_token", + type=str, + default="<|endoftext|>", + help="Specify the format of the `eot_token`", + ) ## Print loss parser.add_argument('--print_loss', action='store_true', @@ -234,8 +240,7 @@ def main(): torch.distributed.barrier() # load_hf_tokenizer will get the correct tokenizer and set padding tokens based on the model family - args.end_of_conversation_token = "<|endoftext|>" - additional_special_tokens = args.end_of_conversation_token if args.add_eot_token else None + additional_special_tokens = args.eot_token if args.add_eot_token else None tokenizer = load_hf_tokenizer(args.model_name_or_path, fast_tokenizer=True, add_special_tokens=additional_special_tokens)
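A short sketch, outside the patch itself, of how the new flags might be used once the end-of-turn marker is configurable; the token and model name are assumed examples, and the import path assumes load_hf_tokenizer lives in dschat.utils.utils after the package refactor:

from dschat.utils.utils import load_hf_tokenizer

eot_token = "<|end_of_text|>"          # assumed example token, not taken from the patch
tokenizer = load_hf_tokenizer("meta-llama/Llama-2-7b-hf",
                              fast_tokenizer=True,
                              add_special_tokens=eot_token)

On the command line the equivalent would be passing --add_eot_token --eot_token "<|end_of_text|>" to the step 1 training script.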