TGI: update to controller version 1.4.0 & bug fixes (#470)
* feat(tgi): update controller version to 1.4.0
* fix(tgi): correctly handle single token inputs
* feat(tgi): support seed generation parameter
* fix(tgi): return correct finish reason
* fix(tgi): return only pending requests in next batch
  On decode, finished requests were also included.
* fix(decoder): do not modify generation_config parameter
* fix(tgi): avoid repeated token in continuous batching
  The last generated token of paused slots was recreated and sent back instead of generating a new one.
* fix(tgi): update max_new_tokens in continuous batching
  max_new_tokens was not updated for pending requests, even though their generated tokens are now seen as input tokens. This was effectively as if the number of generated tokens had been reset.
* test(tgi): add generator python tests
* test(tgi): add docker tests
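The max_new_tokens fix is easiest to see in miniature: when a paused slot re-enters prefill, its previously generated tokens become part of the input, so the remaining generation budget has to shrink by the amount already produced. A minimal sketch of that bookkeeping (hypothetical names, not the actual server code):

```python
# Hypothetical slot bookkeeping, illustrating only the max_new_tokens fix
# described in the commit message above.
class Slot:
    def __init__(self, max_new_tokens: int):
        self.max_new_tokens = max_new_tokens  # remaining generation budget
        self.generated_tokens = 0

    def generate(self):
        self.generated_tokens += 1

    def resume(self):
        # On resume, previously generated tokens are re-submitted as input
        # tokens. Without this adjustment the budget would effectively be
        # reset, letting the request generate its full quota again.
        self.max_new_tokens -= self.generated_tokens
        self.generated_tokens = 0
```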
Showing 10 changed files with 543 additions and 27 deletions.
155 changes: 155 additions & 0 deletions
text-generation-inference/integration-tests/conftest.py
```python
import asyncio
import contextlib
import os
import random
import shlex
import subprocess
import sys
import time
from tempfile import TemporaryDirectory
from typing import List

import docker
import pytest
from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
from docker.errors import NotFound
from text_generation import AsyncClient
from text_generation.types import Response


DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "neuronx-tgi:latest")
HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN", None)
DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data")


class LauncherHandle:
    def __init__(self, port: int):
        self.client = AsyncClient(f"http://localhost:{port}")

    def _inner_health(self):
        raise NotImplementedError

    async def health(self, timeout: int = 60):
        assert timeout > 0
        for _ in range(timeout):
            if not self._inner_health():
                raise RuntimeError("Launcher crashed")

            try:
                await self.client.generate("test")
                return
            except (ClientConnectorError, ClientOSError, ServerDisconnectedError):
                time.sleep(1)
        raise RuntimeError("Health check failed")


class ContainerLauncherHandle(LauncherHandle):
    def __init__(self, docker_client, container_name, port: int):
        super(ContainerLauncherHandle, self).__init__(port)
        self.docker_client = docker_client
        self.container_name = container_name

    def _inner_health(self) -> bool:
        container = self.docker_client.containers.get(self.container_name)
        return container.status in ["running", "created"]


class ProcessLauncherHandle(LauncherHandle):
    def __init__(self, process, port: int):
        super(ProcessLauncherHandle, self).__init__(port)
        self.process = process

    def _inner_health(self) -> bool:
        return self.process.poll() is None


@pytest.fixture(scope="module")
def event_loop():
    loop = asyncio.get_event_loop()
    yield loop
    loop.close()


@pytest.fixture(scope="module")
def data_volume():
    tmpdir = TemporaryDirectory()
    yield tmpdir.name
    # Cleanup the temporary directory using sudo as it contains root files created by the container
    subprocess.run(shlex.split(f"sudo rm -rf {tmpdir.name}"))


@pytest.fixture(scope="module")
def launcher(event_loop, data_volume):
    @contextlib.contextmanager
    def docker_launcher(
        model_id: str,
        trust_remote_code: bool = False,
    ):
        port = random.randint(8000, 10_000)

        args = ["--model-id", model_id, "--env"]

        if trust_remote_code:
            args.append("--trust-remote-code")

        client = docker.from_env()

        container_name = f"tgi-tests-{model_id.split('/')[-1]}"

        try:
            container = client.containers.get(container_name)
            container.stop()
            container.wait()
        except NotFound:
            pass

        env = {"LOG_LEVEL": "info,text_generation_router=debug"}

        if HUGGING_FACE_HUB_TOKEN is not None:
            env["HUGGING_FACE_HUB_TOKEN"] = HUGGING_FACE_HUB_TOKEN

        for var in ["HF_BATCH_SIZE", "HF_SEQUENCE_LENGTH", "HF_AUTOCAST_TYPE", "HF_NUM_CORES"]:
            if var in os.environ:
                env[var] = os.environ[var]

        volumes = [f"{data_volume}:/data"]

        container = client.containers.run(
            DOCKER_IMAGE,
            command=args,
            name=container_name,
            environment=env,
            auto_remove=False,
            detach=True,
            devices=["/dev/neuron0"],
            volumes=volumes,
            ports={"80/tcp": port},
            shm_size="1G",
        )

        yield ContainerLauncherHandle(client, container.name, port)

        try:
            container.stop()
            container.wait()
        except NotFound:
            pass

        container_output = container.logs().decode("utf-8")
        print(container_output, file=sys.stderr)

        container.remove()

    return docker_launcher


@pytest.fixture(scope="module")
def generate_load():
    async def generate_load_inner(client: AsyncClient, prompt: str, max_new_tokens: int, n: int) -> List[Response]:
        futures = [
            client.generate(prompt, max_new_tokens=max_new_tokens, decoder_input_details=True) for _ in range(n)
        ]

        return await asyncio.gather(*futures)

    return generate_load_inner
```
2 changes: 2 additions & 0 deletions
text-generation-inference/integration-tests/pytest.ini
```ini
[pytest]
asyncio_mode = auto
```
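With `asyncio_mode = auto`, pytest-asyncio treats every `async def` test function as an asyncio test, so the explicit `@pytest.mark.asyncio` markers used in the test module below are optional belt-and-braces. A minimal illustration (hypothetical test, not part of this diff):

```python
# Under asyncio_mode = auto, this coroutine is collected and awaited by
# pytest-asyncio even though it carries no @pytest.mark.asyncio marker.
async def test_example():
    assert 1 + 1 == 2
```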
18 changes: 18 additions & 0 deletions
text-generation-inference/integration-tests/requirements.txt
```
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
text-generation >= 0.6.0
pytest >= 7.4.0
pytest-asyncio >= 0.21.1
docker >= 6.1.3
Levenshtein
```
99 changes: 99 additions & 0 deletions
text-generation-inference/integration-tests/test_gpt2.py
```python
import os

import huggingface_hub
import Levenshtein
import pytest


MODEL_ID = "gpt2"
NEURON_MODEL_ID = "aws-neuron/gpt2-neuronx-bs4-seqlen1024"
BATCH_SIZE = 4
SEQUENCE_LENGTH = 1024
NUM_CORES = 2


@pytest.fixture(scope="module", params=["hub-neuron", "hub", "local-neuron"])
def model_name_or_path(request, data_volume):
    if request.param == "hub":
        os.environ["HF_BATCH_SIZE"] = str(BATCH_SIZE)
        os.environ["HF_SEQUENCE_LENGTH"] = str(SEQUENCE_LENGTH)
        os.environ["HF_NUM_CORES"] = str(NUM_CORES)
        yield MODEL_ID
    elif request.param == "hub-neuron":
        yield NEURON_MODEL_ID
    else:
        model_dir = f"gpt2-neuron-{BATCH_SIZE}x{SEQUENCE_LENGTH}x{NUM_CORES}"
        local_path = os.path.join(data_volume, model_dir)
        huggingface_hub.snapshot_download(NEURON_MODEL_ID, local_dir=local_path)
        # Return the path of the model inside the mounted volume
        yield os.path.join("/data", model_dir)


@pytest.fixture(scope="module")
def tgi_service(launcher, model_name_or_path):
    with launcher(model_name_or_path) as tgi_service:
        yield tgi_service


@pytest.fixture(scope="module")
async def tgi_client(tgi_service):
    await tgi_service.health(300)
    return tgi_service.client


@pytest.mark.asyncio
async def test_model_single_request(tgi_client):
    # Greedy bounded without input
    response = await tgi_client.generate(
        "What is Deep Learning?",
        max_new_tokens=17,
        decoder_input_details=True,
    )
    assert response.details.generated_tokens == 17
    assert response.generated_text == "\n\nDeep learning is a new field of research that has been around for a while"

    # Greedy bounded with input
    response = await tgi_client.generate(
        "What is Deep Learning?",
        max_new_tokens=17,
        return_full_text=True,
        decoder_input_details=True,
    )
    assert response.details.generated_tokens == 17
    assert (
        response.generated_text
        == "What is Deep Learning?\n\nDeep learning is a new field of research that has been around for a while"
    )

    # Sampling
    response = await tgi_client.generate(
        "What is Deep Learning?",
        do_sample=True,
        top_k=50,
        top_p=0.9,
        repetition_penalty=1.2,
        max_new_tokens=1000,
        seed=42,
        decoder_input_details=True,
    )
    assert "The purpose of the current post is" in response.generated_text


@pytest.mark.asyncio
async def test_model_multiple_requests(tgi_client, generate_load):
    num_requests = 4
    responses = await generate_load(
        tgi_client,
        "What is Deep Learning?",
        max_new_tokens=17,
        n=num_requests,
    )

    assert len(responses) == 4
    expected = "\n\nDeep learning is a new field of research that has been around for a while"
    for r in responses:
        assert r.details.generated_tokens == 17
        # Compute the similarity with the expectation using the levenshtein distance
        # We should not have more than two substitutions or additions
        assert Levenshtein.distance(r.generated_text, expected) < 3
```
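For reference, `Levenshtein.distance` counts single-character edits, so the `< 3` bound above tolerates at most two insertions, deletions, or substitutions between each sampled continuation and the expected text. A quick illustration (not part of this diff):

```python
import Levenshtein

# One substitution ("while" -> "whole") gives a distance of 1,
# which the assertion in the test above would still accept.
assert Levenshtein.distance("around for a while", "around for a whole") == 1
```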