From 09cc51bd84466adedb232f5b56ec38ca080823d8 Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Wed, 26 Jun 2024 14:14:31 +0200
Subject: [PATCH] Refactor decoder tests (#641)

* test: remove staging_test fixture

* test(decoder): regroup decoder generation tests

* fix(decoder): save checkpoint after export from local

* test: regroup decoder tests

* fix(tgi): temporarily pin ravif version

* ci: run decoder tests
---
 .github/workflows/test_inf2.yml       |   6 +-
 optimum/neuron/modeling_decoder.py    |  16 +--
 tests/conftest.py                     |  35 -----
 tests/decoder/conftest.py             | 122 ++++++++++++++++++
 tests/decoder/test_decoder_export.py  |  87 +++++++++++++
 .../test_decoder_generation.py}       |  76 +++++++++--
 tests/decoder/test_decoder_hub.py     |  65 ++++++++++
 .../test_fused_logits_warper.py       |   0
 tests/generation/conftest.py          |  37 +-----
 tests/generation/test_export.py       |  40 +-----
 tests/generation/test_hub.py          |  32 +----
 tests/generation/test_tnx_generate.py | 105 ---------------
 text-generation-inference/Dockerfile  |   2 +
 13 files changed, 357 insertions(+), 266 deletions(-)
 create mode 100644 tests/decoder/conftest.py
 create mode 100644 tests/decoder/test_decoder_export.py
 rename tests/{generation/test_tnx_llama.py => decoder/test_decoder_generation.py} (59%)
 create mode 100644 tests/decoder/test_decoder_hub.py
 rename tests/{generation => decoder}/test_fused_logits_warper.py (100%)
 delete mode 100644 tests/generation/test_tnx_generate.py

diff --git a/.github/workflows/test_inf2.yml b/.github/workflows/test_inf2.yml
index 567aa8980..c709a30df 100644
--- a/.github/workflows/test_inf2.yml
+++ b/.github/workflows/test_inf2.yml
@@ -52,7 +52,11 @@ jobs:
         run: |
           source aws_neuron_venv_pytorch/bin/activate
           HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/cli
-      - name: Run generation tests
+      - name: Run decoder tests
+        run: |
+          source aws_neuron_venv_pytorch/bin/activate
+          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/decoder
+      - name: Run other generation tests
         run: |
           source aws_neuron_venv_pytorch/bin/activate
           HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/generation
diff --git a/optimum/neuron/modeling_decoder.py b/optimum/neuron/modeling_decoder.py
index 44d434dab..a2efea9bf 100644
--- a/optimum/neuron/modeling_decoder.py
+++ b/optimum/neuron/modeling_decoder.py
@@ -445,21 +445,19 @@ def forward(self, *args, **kwargs):
     def _save_pretrained(self, save_directory: Union[str, Path]):
         dst_checkpoint_path, dst_compiled_path = self._get_neuron_dirs(save_directory)
 
-        def copy_dir_to_path(src_dir: Union[str, Path, TemporaryDirectory], dst_path: Union[str, Path]):
-            if isinstance(src_dir, TemporaryDirectory):
-                shutil.copytree(src_dir.name, dst_path, dirs_exist_ok=True)
-            elif not os.path.samefile(src_dir, dst_path):
-                os.symlink(dst_path, src_dir)
-
-        # Copy checkpoint directory (it always exists)
-        copy_dir_to_path(self.checkpoint_dir, dst_checkpoint_path)
+        neuron_config = getattr(self.config, "neuron")
+        checkpoint_id = neuron_config.get("checkpoint_id", None)
+        if checkpoint_id is None:
+            # Model was exported from a local path, so we need to save the checkpoint
+            shutil.copytree(self.checkpoint_dir, dst_checkpoint_path, dirs_exist_ok=True)
         self.checkpoint_dir = dst_checkpoint_path
 
+        # Save or create compiled directory
         if self.compiled_dir is None:
             # The compilation artifacts have never been saved, do it now
             self.model.save(dst_compiled_path)
         else:
-            copy_dir_to_path(self.compiled_dir, dst_compiled_path)
+            shutil.copytree(self.compiled_dir, dst_compiled_path)
         self.compiled_dir = dst_compiled_path
 
         self.generation_config.save_pretrained(save_directory)
diff --git a/tests/conftest.py b/tests/conftest.py
index ed3166ea4..a681ed087 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -25,7 +25,6 @@
     set_custom_cache_repo_name_in_hf_home,
     set_neuron_cache_path,
 )
-from optimum.utils.testing_utils import TOKEN, USER
 
 from .utils import OPTIMUM_INTERNAL_TESTING_CACHE_REPO, get_random_string
 
@@ -88,40 +87,6 @@ def inf_diffuser_model(request):
     return request.param
 
 
-@pytest.fixture(scope="module")
-def staging_test():
-    custom_cache_repo_name = "optimum-neuron-cache-testing"
-    custom_cache_repo = f"{USER}/{custom_cache_repo_name}"
-    custom_private_cache_repo = f"{custom_cache_repo}-private"
-
-    orig_token = get_token()
-    orig_custom_cache_repo = load_custom_cache_repo_name_from_hf_home()
-
-    seed = get_random_string(5)
-    custom_cache_repo_with_seed = f"{custom_cache_repo}-{seed}"
-    custom_private_cache_repo_with_seed = f"{custom_private_cache_repo}-{seed}"
-
-    login(token=TOKEN)
-    # We do not set which cache repo to use because there are two, it is up to the test to define that.
-
-    create_repo(custom_cache_repo_with_seed, repo_type="model", exist_ok=True)
-    create_repo(custom_private_cache_repo_with_seed, repo_type="model", exist_ok=True, private=True)
-
-    yield
-
-    delete_repo(custom_cache_repo_with_seed, repo_type="model")
-    delete_repo(custom_private_cache_repo_with_seed, repo_type="model")
-
-    if orig_token is not None:
-        login(token=orig_token)
-    else:
-        logout()
-    if orig_custom_cache_repo is not None:
-        set_custom_cache_repo_name_in_hf_home(orig_custom_cache_repo, check_repo=False)
-    else:
-        delete_custom_cache_repo_name_from_hf_home()
-
-
 def _hub_test(create_local_cache: bool = False):
     orig_token = get_token()
     orig_custom_cache_repo = load_custom_cache_repo_name_from_hf_home()
diff --git a/tests/decoder/conftest.py b/tests/decoder/conftest.py
new file mode 100644
index 000000000..1b02a20de
--- /dev/null
+++ b/tests/decoder/conftest.py
@@ -0,0 +1,122 @@
+import copy
+import logging
+import subprocess
+import sys
+from tempfile import TemporaryDirectory
+
+import huggingface_hub
+import pytest
+from transformers import AutoTokenizer
+
+from optimum.neuron import NeuronModelForCausalLM
+from optimum.neuron.utils import synchronize_hub_cache
+from optimum.neuron.version import __sdk_version__ as sdk_version
+from optimum.neuron.version import __version__ as version
+
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="[%(asctime)s] %(levelname)s [%(filename)s.%(funcName)s:%(lineno)d] %(message)s",
+    stream=sys.stdout,
+)
+logger = logging.getLogger(__file__)
+
+OPTIMUM_CACHE_REPO_ID = "optimum-internal-testing/neuron-testing-cache"
+
+# All model configurations below will be added to the neuron_model_config fixture
+DECODER_MODEL_CONFIGURATIONS = {
+    "gpt2": {
+        "model_id": "gpt2",
+        "export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"},
+    },
+    "llama": {
+        "model_id": "princeton-nlp/Sheared-LLaMA-1.3B",
+        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"},
+    },
+    "mistral": {
+        "model_id": "optimum/mistral-1.1b-testing",
+        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
+    },
+}
+
+
+def _get_hub_neuron_model_id(config_name: str):
+    return f"optimum-internal-testing/neuron-testing-{version}-{sdk_version}-{config_name}"
+
+
+def _export_model(model_id, export_kwargs, neuron_model_path):
+    export_command = ["optimum-cli", "export", "neuron", "-m", model_id, "--task", "text-generation"]
+    for kwarg, value in export_kwargs.items():
+        export_command.append(f"--{kwarg}")
+        export_command.append(str(value))
+    export_command.append(neuron_model_path)
+    logger.info(f"Exporting {model_id} with {export_kwargs}")
+    try:
+        subprocess.run(export_command, check=True)
+    except subprocess.CalledProcessError as e:
+        logger.error(f"Failed to export model: {e}")
+        return
+
+
+@pytest.fixture(scope="session", params=DECODER_MODEL_CONFIGURATIONS.keys())
+def neuron_decoder_config(request):
+    """Expose a pre-trained neuron decoder model
+
+    The fixture first makes sure the following model artifacts are present on the hub:
+    - exported neuron model under optimum-internal-testing/neuron-testing-<version>-<sdk_version>-<config_name>,
+    - cached artifacts under optimum-internal-testing/neuron-testing-cache.
+    If not, it will export the model and push it to the hub.
+
+    It then fetches the model locally and returns a dictionary containing:
+    - a configuration name,
+    - the original model id,
+    - the export parameters,
+    - the neuron model id,
+    - the neuron model local path.
+
+    For each exposed model, the local directory is maintained for the duration of the
+    test session and cleaned up afterwards.
+    The hub model artifacts are never cleaned up and persist across sessions.
+    They must be cleaned up manually when the optimum-neuron version changes.
+
+    """
+    config_name = request.param
+    model_config = copy.deepcopy(DECODER_MODEL_CONFIGURATIONS[request.param])
+    model_id = model_config["model_id"]
+    export_kwargs = model_config["export_kwargs"]
+    neuron_model_id = _get_hub_neuron_model_id(config_name)
+    with TemporaryDirectory() as neuron_model_path:
+        hub = huggingface_hub.HfApi()
+        if hub.repo_exists(neuron_model_id):
+            logger.info(f"Fetching {neuron_model_id} from the HuggingFace hub")
+            hub.snapshot_download(neuron_model_id, local_dir=neuron_model_path)
+        else:
+            _export_model(model_id, export_kwargs, neuron_model_path)
+            tokenizer = AutoTokenizer.from_pretrained(model_id)
+            tokenizer.save_pretrained(neuron_model_path)
+            del tokenizer
+            # Create the test model on the hub
+            hub.create_repo(neuron_model_id, private=True)
+            hub.upload_folder(
+                folder_path=neuron_model_path,
+                repo_id=neuron_model_id,
+                ignore_patterns=[NeuronModelForCausalLM.CHECKPOINT_DIR + "/*"],
+            )
+            # Make sure it is cached
+            synchronize_hub_cache(cache_repo_id=OPTIMUM_CACHE_REPO_ID)
+        # Add dynamic parameters to the model configuration
+        model_config["neuron_model_path"] = neuron_model_path
+        model_config["neuron_model_id"] = neuron_model_id
+        # Also add model configuration name to allow tests to adapt their expectations
+        model_config["name"] = config_name
+        # Yield instead of returning to keep a reference to the temporary directory.
+        # It will go out of scope and be released only once all tests needing the fixture
+        # have been completed.
+        logger.info(f"{config_name} ready for testing ...")
+        yield model_config
+        logger.info(f"Done with {config_name}")
+
+
+@pytest.fixture(scope="module")
+def neuron_decoder_path(neuron_decoder_config):
+    yield neuron_decoder_config["neuron_model_path"]
diff --git a/tests/decoder/test_decoder_export.py b/tests/decoder/test_decoder_export.py
new file mode 100644
index 000000000..4aaed4a20
--- /dev/null
+++ b/tests/decoder/test_decoder_export.py
@@ -0,0 +1,87 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from tempfile import TemporaryDirectory
+
+import pytest
+from transformers import AutoModelForCausalLM
+
+from optimum.neuron import NeuronModelForCausalLM
+from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx
+
+
+DECODER_MODEL_ARCHITECTURES = ["bloom", "gpt2", "llama", "mistral", "mixtral", "opt"]
+DECODER_MODEL_NAMES = {
+    "bloom": "hf-internal-testing/tiny-random-BloomForCausalLM",
+    "gpt2": "hf-internal-testing/tiny-random-gpt2",
+    "llama": "dacorvo/tiny-random-llama",
+    "mistral": "dacorvo/tiny-random-MistralForCausalLM",
+    "mixtral": "dacorvo/Mixtral-tiny",
+    "opt": "hf-internal-testing/tiny-random-OPTForCausalLM",
+}
+
+
+@pytest.fixture(
+    scope="session", params=[DECODER_MODEL_NAMES[model_arch] for model_arch in DECODER_MODEL_ARCHITECTURES]
+)
+def export_decoder_id(request):
+    return request.param
+
+
+def check_neuron_model(neuron_model, batch_size=None, sequence_length=None, num_cores=None, auto_cast_type=None):
+    neuron_config = getattr(neuron_model.config, "neuron", None)
+    assert neuron_config
+    if batch_size:
+        assert neuron_config["batch_size"] == batch_size
+    if sequence_length:
+        assert neuron_config["sequence_length"] == sequence_length
+    if num_cores:
+        assert neuron_config["num_cores"] == num_cores
+    if auto_cast_type:
+        assert neuron_config["auto_cast_type"] == auto_cast_type
+
+
+@pytest.mark.parametrize(
+    "batch_size, sequence_length, num_cores, auto_cast_type",
+    [
+        [1, 100, 2, "fp32"],
+        [1, 100, 2, "fp16"],
+        [2, 100, 2, "fp16"],
+    ],
+)
+@is_inferentia_test
+@requires_neuronx
+@pytest.mark.parametrize("local", [True, False], ids=["local", "from_hub"])
+def test_decoder_export_save_reload(local, export_decoder_id, batch_size, sequence_length, num_cores, auto_cast_type):
+    export_kwargs = {
+        "batch_size": batch_size,
+        "sequence_length": sequence_length,
+        "num_cores": num_cores,
+        "auto_cast_type": auto_cast_type,
+    }
+    with TemporaryDirectory() as model_path:
+        if local:
+            with TemporaryDirectory() as tmpdir:
+                model = AutoModelForCausalLM.from_pretrained(export_decoder_id)
+                model.save_pretrained(tmpdir)
+                model = NeuronModelForCausalLM.from_pretrained(tmpdir, export=True, **export_kwargs)
+                model.save_pretrained(model_path)
+        else:
+            model = NeuronModelForCausalLM.from_pretrained(export_decoder_id, export=True, **export_kwargs)
+            model.save_pretrained(model_path)
+        check_neuron_model(model, **export_kwargs)
+        del model
+        model = NeuronModelForCausalLM.from_pretrained(model_path)
+        check_neuron_model(model, **export_kwargs)
diff --git a/tests/generation/test_tnx_llama.py b/tests/decoder/test_decoder_generation.py
similarity index 59%
rename from tests/generation/test_tnx_llama.py
rename to tests/decoder/test_decoder_generation.py
index 3876e63f2..c2e67707f 100644
--- a/tests/generation/test_tnx_llama.py
+++ b/tests/decoder/test_decoder_generation.py
@@ -18,24 +18,78 @@
 import pytest
 import torch
 from transformers import AutoTokenizer
+from transformers.generation import StoppingCriteria
 
 from optimum.neuron import NeuronModelForCausalLM
 from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx
 
 
 @pytest.fixture(scope="module")
-def neuron_model_config():
-    model_id = "princeton-nlp/Sheared-LLaMA-1.3B"
-    model_kwargs = {"batch_size": 4, "sequence_length": 4096, "auto_cast_type": "f16", "num_cores": 2}
-    model = NeuronModelForCausalLM.from_pretrained(model_id, export=True, **model_kwargs)
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
+def model_and_tokenizer(neuron_decoder_path):
+    model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path)
+    tokenizer = AutoTokenizer.from_pretrained(neuron_decoder_path)
     yield (model, tokenizer)
 
 
+def _test_generation(model, batch_size, input_length, **gen_kwargs):
+    input_ids = torch.ones((batch_size, input_length), dtype=torch.int64)
+    sample_output = model.generate(input_ids, **gen_kwargs)
+    assert sample_output.shape[0] == batch_size
+
+
+@pytest.mark.parametrize(
+    "gen_kwargs",
+    [
+        {"do_sample": True},
+        {"do_sample": True, "temperature": 0.7},
+        {"do_sample": False},
+        {"do_sample": False, "repetition_penalty": 1.2},
+    ],
+    ids=["sample", "sample-with-temp", "greedy", "greedy_no-repeat"],
+)
+@is_inferentia_test
+@requires_neuronx
+def test_decoder_generation_base(model_and_tokenizer, gen_kwargs):
+    model = model_and_tokenizer[0]
+    _test_generation(model, model.batch_size, 10, **gen_kwargs)
+
+
+@is_inferentia_test
+@requires_neuronx
+def test_decoder_generation_input_dimensions(model_and_tokenizer):
+    model, tokenizer = model_and_tokenizer
+    # Using valid input dimensions
+    _test_generation(model, model.batch_size, model.max_length // 2)
+    # Using an incompatible batch_size
+    with pytest.raises(ValueError, match="The specified batch_size"):
+        _test_generation(model, model.batch_size + 1, model.max_length)
+    # Using an incompatible input length
+    with pytest.raises(ValueError, match="The input sequence length"):
+        _test_generation(model, model.batch_size, input_length=model.max_length * 2)
+
+
+@is_inferentia_test
+@requires_neuronx
+def test_decoder_generation_custom_stopping_criteria(model_and_tokenizer):
+    model = model_and_tokenizer[0]
+
+    class CustomStoppingCriteria(StoppingCriteria):
+        def __init__(self):
+            self.called = False
+
+        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+            self.called = True
+            return True
+
+    criteria = CustomStoppingCriteria()
+    model.generate(input_ids=torch.ones([1, 10], dtype=torch.int64), stopping_criteria=[criteria])
+    assert criteria.called, "Custom StoppingCriteria should have been called"
+
+
 @is_inferentia_test
 @requires_neuronx
-def test_generation_llama_padded_inputs(neuron_model_config):
-    model, tokenizer = neuron_model_config
+def test_decoder_generation_padded_inputs(model_and_tokenizer):
+    model, tokenizer = model_and_tokenizer
     prompt = "One of my fondest memory is of my grandmother making homemade bread"
     first_input = tokenizer(prompt)
     first_ids = first_input["input_ids"]
@@ -56,8 +110,8 @@
 
 @is_inferentia_test
 @requires_neuronx
-def test_decoder_generation_multiple_eos_token_ids(neuron_model_config):
-    model, tokenizer = neuron_model_config
+def test_decoder_generation_multiple_eos_token_ids(model_and_tokenizer):
+    model, tokenizer = model_and_tokenizer
     prompt = "Name three fruits:"
     tokens = tokenizer(prompt, return_tensors="pt")
     generation_config = copy.deepcopy(model.generation_config)
@@ -75,8 +129,8 @@ def test_decoder_generation_multiple_eos_token_ids(neuron_model_config):
 
 @is_inferentia_test
 @requires_neuronx
-def test_decoder_generation_stop_strings(neuron_model_config):
-    model, tokenizer = neuron_model_config
+def test_decoder_generation_stop_strings(model_and_tokenizer):
+    model, tokenizer = model_and_tokenizer
     prompt = "Name three fruits:"
     tokens = tokenizer(prompt, return_tensors="pt")
     generation_config = copy.deepcopy(model.generation_config)
diff --git a/tests/decoder/test_decoder_hub.py b/tests/decoder/test_decoder_hub.py
new file mode 100644
index 000000000..566d9659a
--- /dev/null
+++ b/tests/decoder/test_decoder_hub.py
@@ -0,0 +1,65 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import re
+import socket
+from tempfile import TemporaryDirectory
+
+import pytest
+from huggingface_hub import HfApi, get_token
+from transformers import AutoModelForCausalLM
+
+from optimum.neuron import NeuronModelForCausalLM
+from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx
+
+
+@is_inferentia_test
+@requires_neuronx
+@pytest.mark.parametrize("from_local", [False, True], ids=["from_hub", "from_local"])
+def test_decoder_push_to_hub(from_local):
+    model_id = "hf-internal-testing/tiny-random-gpt2"
+    with TemporaryDirectory() as model_path:
+        if from_local:
+            hub_model = AutoModelForCausalLM.from_pretrained(model_id)
+            with TemporaryDirectory() as tmpdir:
+                hub_model.save_pretrained(tmpdir)
+                model = NeuronModelForCausalLM.from_pretrained(tmpdir, export=True)
+                # Save must happen within the context of the tmpdir or checkpoint dir is lost
+                model.save_pretrained(model_path)
+        else:
+            model = NeuronModelForCausalLM.from_pretrained(model_id, export=True)
+            model.save_pretrained(model_path)
+        # The hub model contains the checkpoint only when the model is exported from a local path
+        ignore_patterns = [] if from_local else [model.CHECKPOINT_DIR + "/*"]
+        hostname = socket.gethostname()
+        model_name = f"neuron-testing-{hostname}-decoder-push"
+        model_name += "-from-local" if from_local else "-from-hub"
+        repo_id = f"optimum-internal-testing/{model_name}"
+        model.push_to_hub(model_path, repo_id, use_auth_token=get_token())
+        api = HfApi()
+        try:
+            hub_files_path = api.list_repo_files(repo_id)
+            for path, _, files in os.walk(model_path):
+                for name in files:
+                    local_file_path = os.path.join(path, name)
+                    hub_file_path = os.path.relpath(local_file_path, model_path)
+                    excluded = False
+                    for pattern in ignore_patterns:
+                        if re.compile(pattern).match(hub_file_path) is not None:
+                            excluded = True
+                            break
+                    assert excluded or hub_file_path in hub_files_path
+        finally:
+            api.delete_repo(repo_id)
diff --git a/tests/generation/test_fused_logits_warper.py b/tests/decoder/test_fused_logits_warper.py
similarity index 100%
rename from tests/generation/test_fused_logits_warper.py
rename to tests/decoder/test_fused_logits_warper.py
diff --git a/tests/generation/conftest.py b/tests/generation/conftest.py
index 845df8adf..0d19e865d 100644
--- a/tests/generation/conftest.py
+++ b/tests/generation/conftest.py
@@ -15,22 +15,13 @@
 from tempfile import TemporaryDirectory
 
 import pytest
-from transformers import AutoTokenizer, T5ForConditionalGeneration
+from transformers import T5ForConditionalGeneration
 
-from optimum.neuron import NeuronModelForCausalLM, NeuronModelForSeq2SeqLM
+from optimum.neuron import NeuronModelForSeq2SeqLM
 from optimum.neuron.utils.testing_utils import requires_neuronx
 from optimum.utils.testing_utils import USER
 
 
-DECODER_MODEL_ARCHITECTURES = ["bloom", "gpt2", "llama", "mistral", "mixtral", "opt"]
-DECODER_MODEL_NAMES = {
-    "bloom": "hf-internal-testing/tiny-random-BloomForCausalLM",
-    "gpt2": "hf-internal-testing/tiny-random-gpt2",
-    "llama": "dacorvo/tiny-random-llama",
-    "mistral": "dacorvo/tiny-random-MistralForCausalLM",
-    "mixtral": "dacorvo/Mixtral-tiny",
-    "opt": "hf-internal-testing/tiny-random-OPTForCausalLM",
-}
 TRN_DECODER_MODEL_ARCHITECTURES = ["bloom", "llama", "opt"]
 TRN_DECODER_MODEL_NAMES = {
     "bloom": "bigscience/bloom-560m",
@@ -45,13 +36,6 @@
 }
 
 
-@pytest.fixture(
-    scope="session", params=[DECODER_MODEL_NAMES[model_arch] for model_arch in DECODER_MODEL_ARCHITECTURES]
-)
-def export_decoder_id(request):
-    return request.param
-
-
 @pytest.fixture(
     scope="module", params=[TRN_DECODER_MODEL_NAMES[model_arch] for model_arch in TRN_DECODER_MODEL_ARCHITECTURES]
 )
@@ -69,23 +53,6 @@ def export_seq2seq_model_class(request):
     return request.param
 
 
-@pytest.fixture(scope="session")
-@requires_neuronx
-def neuron_decoder_path(export_decoder_id):
-    model = NeuronModelForCausalLM.from_pretrained(export_decoder_id, export=True, batch_size=2, num_cores=2)
-    model_dir = TemporaryDirectory()
-    model_path = model_dir.name
-    model.save_pretrained(model_path)
-    del model
-    tokenizer = AutoTokenizer.from_pretrained(export_decoder_id)
-    tokenizer.save_pretrained(model_path)
-    del tokenizer
-    # Yield instead of returning to keep a reference to the temporary directory.
-    # It will go out of scope and be released only once all tests needing the fixture
-    # have been completed.
-    yield model_path
-
-
 @pytest.fixture(scope="module")
 @requires_neuronx
 def neuron_seq2seq_beam_path(export_seq2seq_id):
diff --git a/tests/generation/test_export.py b/tests/generation/test_export.py
index 676a5e7ee..7737274ef 100644
--- a/tests/generation/test_export.py
+++ b/tests/generation/test_export.py
@@ -13,51 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from tempfile import TemporaryDirectory
 
 import pytest
-from generation_utils import check_neuron_model
-from transformers import AutoModelForCausalLM
 
-from optimum.neuron import NeuronModelForCausalLM, NeuronModelForSeq2SeqLM
+from optimum.neuron import NeuronModelForSeq2SeqLM
 from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx
 
 
-@pytest.mark.parametrize(
-    "batch_size, sequence_length, num_cores, auto_cast_type",
-    [
-        [1, 100, 2, "fp32"],
-        [1, 100, 2, "fp16"],
-        [2, 100, 2, "fp16"],
-    ],
-)
-@is_inferentia_test
-@requires_neuronx
-@pytest.mark.parametrize("local", [True, False], ids=["local", "from_hub"])
-def test_decoder_export(local, export_decoder_id, batch_size, sequence_length, num_cores, auto_cast_type):
-    export_kwargs = {
-        "batch_size": batch_size,
-        "sequence_length": sequence_length,
-        "num_cores": num_cores,
-        "auto_cast_type": auto_cast_type,
-    }
-    if local:
-        with TemporaryDirectory() as model_path:
-            model = AutoModelForCausalLM.from_pretrained(export_decoder_id)
-            model.save_pretrained(model_path)
-            model = NeuronModelForCausalLM.from_pretrained(model_path, export=True, **export_kwargs)
-    else:
-        model = NeuronModelForCausalLM.from_pretrained(export_decoder_id, export=True, **export_kwargs)
-    check_neuron_model(model, batch_size, sequence_length, num_cores, auto_cast_type)
-
-
-@is_inferentia_test
-@requires_neuronx
-def test_model_from_path(neuron_decoder_path):
-    model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path)
-    check_neuron_model(model)
-
-
 @pytest.mark.parametrize(
     "batch_size, sequence_length, num_beams",
     [
diff --git a/tests/generation/test_hub.py b/tests/generation/test_hub.py
index fd0f127ba..7e372ad9a 100644
--- a/tests/generation/test_hub.py
+++ b/tests/generation/test_hub.py
@@ -14,33 +14,15 @@
 # limitations under the License.
 import os
 import re
-from tempfile import TemporaryDirectory
 
-import pytest
-from generation_utils import check_neuron_model
 from huggingface_hub import HfApi
 from transformers.testing_utils import ENDPOINT_STAGING
 
-from optimum.neuron import NeuronModelForCausalLM, NeuronModelForSeq2SeqLM
+from optimum.neuron import NeuronModelForSeq2SeqLM
 from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx
 from optimum.utils.testing_utils import TOKEN, USER
 
 
-@is_inferentia_test
-@requires_neuronx
-@pytest.mark.parametrize(
-    "model_id, revision",
-    [
-        ["dacorvo/tiny-random-gpt2-neuronx", "1b3456cf877cc42c053ee8464f1067021eccde4b"],
-        ["dacorvo/tiny-random-gpt2-neuronx-no-checkpoint", "78eb2313ab7e149bbc22ff32257db93ba09e3033"],
-    ],
-    ids=["checkpoint", "no-checkpoint"],
-)
-def test_decoder_model_from_hub(model_id, revision):
-    model = NeuronModelForCausalLM.from_pretrained(model_id, revision=revision)
-    check_neuron_model(model, batch_size=16, sequence_length=512, num_cores=2, auto_cast_type="fp32")
-
-
 def _test_push_to_hub(model, model_path, repo_id, ignore_patterns=[]):
     model.push_to_hub(model_path, repo_id, use_auth_token=TOKEN, endpoint=ENDPOINT_STAGING)
     api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN)
@@ -66,18 +48,6 @@ def neuron_push_model_id(model_id):
     return repo_id
 
 
-@is_inferentia_test
-@requires_neuronx
-def test_push_decoder_to_hub():
-    model_id = "hf-internal-testing/tiny-random-gpt2"
-    model = NeuronModelForCausalLM.from_pretrained(model_id, export=True)
-    with TemporaryDirectory() as tmpdir:
-        model.save_pretrained(tmpdir)
-        ignore_patterns = [model.CHECKPOINT_DIR + "/*"]
-        neuron_push_decoder_id = neuron_push_model_id(model_id)
-        _test_push_to_hub(model, tmpdir, neuron_push_decoder_id, ignore_patterns)
-
-
 @is_inferentia_test
 @requires_neuronx
 def test_seq2seq_model_from_hub():
diff --git a/tests/generation/test_tnx_generate.py b/tests/generation/test_tnx_generate.py
deleted file mode 100644
index 94b0f06c8..000000000
--- a/tests/generation/test_tnx_generate.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# coding=utf-8
-# Copyright 2023 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-import torch
-from transformers import AutoTokenizer
-from transformers.generation import StoppingCriteria
-
-from optimum.neuron import NeuronModelForCausalLM
-from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx
-
-
-def _test_generation(model, batch_size, input_length, **gen_kwargs):
-    input_ids = torch.ones((batch_size, input_length), dtype=torch.int64)
-    with torch.inference_mode():
-        sample_output = model.generate(input_ids, **gen_kwargs)
-        assert sample_output.shape[0] == batch_size
-
-
-@pytest.mark.parametrize(
-    "gen_kwargs",
-    [
-        {"do_sample": True},
-        {"do_sample": True, "temperature": 0.7},
-        {"do_sample": False},
-        {"do_sample": False, "repetition_penalty": 1.2},
-    ],
-    ids=["sample", "sample-with-temp", "greedy", "greedy_no-repeat"],
-)
-@is_inferentia_test
-@requires_neuronx
-def test_decoder_generation(neuron_decoder_path, gen_kwargs):
-    model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path)
-    _test_generation(model, model.batch_size, 10, **gen_kwargs)
-
-
-@is_inferentia_test
-@requires_neuronx
-def test_model_generation_input_dimensions(neuron_decoder_path):
-    model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path)
-    AutoTokenizer.from_pretrained(neuron_decoder_path)
-    # Using valid input dimensions
-    _test_generation(model, model.batch_size, model.max_length // 2)
-    # Using an incompatible batch_size
-    with pytest.raises(ValueError, match="The specified batch_size"):
-        _test_generation(model, model.batch_size + 1, model.max_length)
-    # Using an incompatible input length
-    with pytest.raises(ValueError, match="The input sequence length"):
-        _test_generation(model, model.batch_size, input_length=model.max_length * 2)
-
-
-@is_inferentia_test
-@requires_neuronx
-def test_decoder_generation_custom_stopping_criteria(neuron_decoder_path):
-    model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path)
-
-    class CustomStoppingCriteria(StoppingCriteria):
-        def __init__(self):
-            self.called = False
-
-        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
-            self.called = True
-            return True
-
-    criteria = CustomStoppingCriteria()
-    model.generate(input_ids=torch.ones([1, 10], dtype=torch.int64), stopping_criteria=[criteria])
-    assert criteria.called, "Custom StoppingCriteria should have been called"
-
-
-@is_inferentia_test
-@requires_neuronx
-def test_decoder_generation_padded_inputs(neuron_decoder_path):
-    model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path)
-    assert model.batch_size >= 2
-    tokenizer = AutoTokenizer.from_pretrained(neuron_decoder_path)
-    prompt = (
-        "It was a bright cold day in April, and the clocks were striking thirteen."
- " Winston Smith, his chin nuzzled into his breast in an effort to escape the" - " vile wind, slipped quickly through the glass doors of Victory Mansions," - ) - first_input = tokenizer(prompt) - first_ids = first_input["input_ids"] - first_mask = first_input["attention_mask"] - max_padding = 12 - input_len = len(first_ids) - for i in range(max_padding): - second_ids = [tokenizer.eos_token_id] * i + first_ids[: input_len - i] - second_mask = [0] * i + [1] * (input_len - i) - input_ids = torch.tensor([first_ids, second_ids], dtype=torch.int64) - attention_mask = torch.tensor([first_mask, second_mask], dtype=torch.int64) - outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, do_sample=False) - # Verify we did not generate any unknown token - assert torch.all(outputs[:, -1] != 0) diff --git a/text-generation-inference/Dockerfile b/text-generation-inference/Dockerfile index 0846ce1a4..2941fdd6e 100644 --- a/text-generation-inference/Dockerfile +++ b/text-generation-inference/Dockerfile @@ -20,6 +20,8 @@ COPY --from=tgi /tgi/proto proto COPY --from=tgi /tgi/benchmark benchmark COPY --from=tgi /tgi/router router COPY --from=tgi /tgi/launcher launcher +# Remove the next line when bumping rust version +RUN cargo update ravif --precise 0.11.6 RUN cargo chef prepare --recipe-path recipe.json FROM chef AS builder