TGI: update to controller version 1.4.0 & bug fixes (#470)
* feat(tgi): update controller version to 1.4.0

* fix(tgi): correctly handle single token inputs

* feat(tgi): support seed generation parameter

* fix(tgi): return correct finish reason

* fix(tgi): return only pending requests in next batch

During decode, finished requests were also being included in the next batch.

* fix(decoder): do not modify generation_config parameter

* fix(tgi): avoid repeated token in continuous batching

For paused slots, the last generated token was re-created and sent back
to the client instead of a new token being generated.

* fix(tgi): update max_new_tokens in continuous batching

max_new_tokens was not updated for pending requests, even though their
previously generated tokens are now counted as input tokens. The effect
was that each pending request's generation budget was reset (see the
sketch after this change list).

* test(tgi): add generator python tests

* test(tgi): add docker tests
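
The two continuous-batching fixes above both concern how a paused slot behaves once it is resumed after a prefill for newly arrived requests. The sketch below is purely conceptual (the Slot class, its fields and its methods are hypothetical and do not mirror the actual TGI Neuron generator code): on resume, the slot's remaining budget must account for the tokens it has already generated, and the next token streamed to the client must come from a fresh decode rather than from the cached last token.

from dataclasses import dataclass, field
from typing import List


@dataclass
class Slot:
    """Hypothetical bookkeeping for one request in a continuous batch."""

    max_new_tokens: int
    tokens: List[int] = field(default_factory=list)  # tokens generated so far
    paused: bool = False

    def generate(self, fresh_token: int) -> int:
        # A freshly decoded token is recorded and streamed to the client.
        # Re-sending self.tokens[-1] here would duplicate the previous token.
        self.tokens.append(fresh_token)
        return fresh_token

    def pause(self) -> None:
        # A new request arrived: this slot waits while the prefill runs.
        self.paused = True

    def resume(self) -> int:
        # The remaining budget shrinks with every generated token; resetting
        # it to max_new_tokens here would let the request run past its limit.
        self.paused = False
        return self.max_new_tokens - len(self.tokens)


slot = Slot(max_new_tokens=16)
slot.generate(101)            # first decode, before the slot is paused
slot.pause()
assert slot.resume() == 15    # budget reflects the token already generated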
dacorvo authored Feb 8, 2024
1 parent 1b7d07d commit ab582ce
Showing 10 changed files with 543 additions and 27 deletions.
19 changes: 16 additions & 3 deletions Makefile
@@ -40,12 +40,21 @@ PACKAGE_FILES = $(PACKAGE_PYTHON_FILES) \
$(PACKAGE_DIST) $(PACKAGE_WHEEL): $(PACKAGE_FILES)
python -m build

TGI_VERSION ?= 1.4.0

neuronx-tgi: $(PACKAGE_DIST)
docker build --rm -f text-generation-inference/Dockerfile --build-arg VERSION=$(VERSION) -t neuronx-tgi:$(VERSION) .
docker build --rm -f text-generation-inference/Dockerfile \
--build-arg VERSION=$(VERSION) \
--build-arg TGI_VERSION=$(TGI_VERSION) \
-t neuronx-tgi:$(VERSION) .
docker tag neuronx-tgi:$(VERSION) neuronx-tgi:latest

neuronx-tgi-sagemaker: $(PACKAGE_DIST)
docker build --rm -f text-generation-inference/Dockerfile --target sagemaker --build-arg VERSION=$(VERSION) -t neuronx-tgi:$(VERSION) .
docker build --rm -f text-generation-inference/Dockerfile \
--build-arg VERSION=$(VERSION) \
--build-arg TGI_VERSION=$(TGI_VERSION) \
--target sagemaker \
-t neuronx-tgi:$(VERSION) .

# Creates example scripts from Transformers
transformers_examples:
@@ -81,10 +90,14 @@ test_installs:
tgi_server:
python -m pip install -r text-generation-inference/server/build-requirements.txt
make -C text-generation-inference/server clean
VERSION=${VERSION} make -C text-generation-inference/server gen-server
VERSION=${VERSION} TGI_VERSION=${TGI_VERSION} make -C text-generation-inference/server gen-server

tgi_test: tgi_server
python -m pip install .[neuronx] pytest
find text-generation-inference -name "text_generation_server-$(VERSION)-py3-none-any.whl" \
-exec python -m pip install --force-reinstall {} \;
python -m pytest -s text-generation-inference/tests

tgi_docker_test: neuronx-tgi
python -m pip install -r text-generation-inference/integration-tests/requirements.txt
python -m pytest -s text-generation-inference/integration-tests
11 changes: 10 additions & 1 deletion optimum/neuron/generation/token_selector.py
@@ -1,3 +1,4 @@
import copy
import logging
from typing import Optional

@@ -43,13 +44,16 @@ def __init__(
eos_token_id: int,
pad_token_id: int,
logits_warper: Optional[LogitsProcessorList] = None,
seed: Optional[int] = 0,
):
self.mode = mode
self.logits_processor = logits_processor
self.stopping_criteria = stopping_criteria
self.eos_token_id = eos_token_id
self.pad_token_id = pad_token_id
self.logits_warper = logits_warper
self.generator = torch.Generator()
self.generator.manual_seed(seed)

@classmethod
def create(
@@ -59,6 +63,7 @@ def create(
model: GenerationMixin,
max_seq_length: int,
stopping_criteria: Optional[StoppingCriteriaList] = None,
seed: Optional[int] = 0,
) -> "TokenSelector":
r"""Creates the `TokenSelector` for a specific generation configuration.
@@ -74,10 +79,13 @@ def create(
stopping_criteria (`Optional[transformers.generation.StoppingCriteriaList], defaults to `None`):
Custom stopping criteria that complement the default stopping criteria built from arguments and a
generation config.
seed(`Optional[int]`):
The optional seed for sampling. Defaults to zero.
Return:
`torch.LongTensor`: A `torch.LongTensor` containing the selected tokens.
"""
generation_config.validate()
generation_config = copy.deepcopy(generation_config)

unsupported_generation_flags = [
"output_attentions",
@@ -145,6 +153,7 @@ def create(
logits_warper=logits_warper,
eos_token_id=eos_token_id,
pad_token_id=generation_config.pad_token_id,
seed=seed,
)

def select(self, input_ids: torch.LongTensor, logits: torch.Tensor) -> torch.LongTensor:
@@ -171,7 +180,7 @@ def _sample(self, scores: torch.Tensor) -> torch.LongTensor:

# sample
probs = torch.nn.functional.softmax(scores, dim=-1)
next_tokens = torch.multinomial(probs, num_samples=1)
next_tokens = torch.multinomial(probs, num_samples=1, generator=self.generator)
# Convert the filtered tokens to actual vocabulary tokens
next_tokens = torch.gather(next_token_indices, 1, next_tokens)
return next_tokens.squeeze(1)
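
The seeded torch.Generator introduced above is what makes per-request sampling reproducible: feeding the same generator state to torch.multinomial yields the same draws for the same scores. A minimal standalone illustration follows (not the TokenSelector itself; the shapes and seed value are arbitrary):

import torch


def sample_next_token(logits: torch.Tensor, seed: int) -> torch.Tensor:
    """Draw one token id per sequence from softmax(logits), reproducibly."""
    generator = torch.Generator()
    generator.manual_seed(seed)
    probs = torch.nn.functional.softmax(logits, dim=-1)
    # The seeded generator makes the multinomial draw deterministic.
    return torch.multinomial(probs, num_samples=1, generator=generator)


logits = torch.randn(2, 50257)  # [batch, vocab] dummy scores
assert torch.equal(sample_next_token(logits, 42), sample_next_token(logits, 42))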
6 changes: 4 additions & 2 deletions text-generation-inference/Dockerfile
@@ -1,7 +1,9 @@
# Fetch and extract the TGI sources
# Fetch and extract the TGI sources (TGI_VERSION is mandatory)
FROM alpine AS tgi
ARG TGI_VERSION
RUN test -n ${TGI_VERSION:?}
RUN mkdir -p /tgi
ADD https://github.com/huggingface/text-generation-inference/archive/refs/tags/v1.0.2.tar.gz /tgi/sources.tar.gz
ADD https://github.com/huggingface/text-generation-inference/archive/refs/tags/v${TGI_VERSION}.tar.gz /tgi/sources.tar.gz
RUN tar -C /tgi -xf /tgi/sources.tar.gz --strip-components=1

# Build cargo components (adapted from TGI original Dockerfile)
155 changes: 155 additions & 0 deletions text-generation-inference/integration-tests/conftest.py
@@ -0,0 +1,155 @@
import asyncio
import contextlib
import os
import random
import shlex
import subprocess
import sys
import time
from tempfile import TemporaryDirectory
from typing import List

import docker
import pytest
from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
from docker.errors import NotFound
from text_generation import AsyncClient
from text_generation.types import Response


DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "neuronx-tgi:latest")
HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN", None)
DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data")


class LauncherHandle:
def __init__(self, port: int):
self.client = AsyncClient(f"http://localhost:{port}")

def _inner_health(self):
raise NotImplementedError

async def health(self, timeout: int = 60):
assert timeout > 0
for _ in range(timeout):
if not self._inner_health():
raise RuntimeError("Launcher crashed")

try:
await self.client.generate("test")
return
except (ClientConnectorError, ClientOSError, ServerDisconnectedError):
time.sleep(1)
raise RuntimeError("Health check failed")


class ContainerLauncherHandle(LauncherHandle):
def __init__(self, docker_client, container_name, port: int):
super(ContainerLauncherHandle, self).__init__(port)
self.docker_client = docker_client
self.container_name = container_name

def _inner_health(self) -> bool:
container = self.docker_client.containers.get(self.container_name)
return container.status in ["running", "created"]


class ProcessLauncherHandle(LauncherHandle):
def __init__(self, process, port: int):
super(ProcessLauncherHandle, self).__init__(port)
self.process = process

def _inner_health(self) -> bool:
return self.process.poll() is None


@pytest.fixture(scope="module")
def event_loop():
loop = asyncio.get_event_loop()
yield loop
loop.close()


@pytest.fixture(scope="module")
def data_volume():
tmpdir = TemporaryDirectory()
yield tmpdir.name
# Cleanup the temporary directory using sudo as it contains root files created by the container
subprocess.run(shlex.split(f"sudo rm -rf {tmpdir.name}"))


@pytest.fixture(scope="module")
def launcher(event_loop, data_volume):
@contextlib.contextmanager
def docker_launcher(
model_id: str,
trust_remote_code: bool = False,
):
port = random.randint(8000, 10_000)

args = ["--model-id", model_id, "--env"]

if trust_remote_code:
args.append("--trust-remote-code")

client = docker.from_env()

container_name = f"tgi-tests-{model_id.split('/')[-1]}"

try:
container = client.containers.get(container_name)
container.stop()
container.wait()
except NotFound:
pass

env = {"LOG_LEVEL": "info,text_generation_router=debug"}

if HUGGING_FACE_HUB_TOKEN is not None:
env["HUGGING_FACE_HUB_TOKEN"] = HUGGING_FACE_HUB_TOKEN

for var in ["HF_BATCH_SIZE", "HF_SEQUENCE_LENGTH", "HF_AUTOCAST_TYPE", "HF_NUM_CORES"]:
if var in os.environ:
env[var] = os.environ[var]

volumes = [f"{data_volume}:/data"]

container = client.containers.run(
DOCKER_IMAGE,
command=args,
name=container_name,
environment=env,
auto_remove=False,
detach=True,
devices=["/dev/neuron0"],
volumes=volumes,
ports={"80/tcp": port},
shm_size="1G",
)

yield ContainerLauncherHandle(client, container.name, port)

try:
container.stop()
container.wait()
except NotFound:
pass

container_output = container.logs().decode("utf-8")
print(container_output, file=sys.stderr)

container.remove()

return docker_launcher


@pytest.fixture(scope="module")
def generate_load():
async def generate_load_inner(client: AsyncClient, prompt: str, max_new_tokens: int, n: int) -> List[Response]:
futures = [
client.generate(prompt, max_new_tokens=max_new_tokens, decoder_input_details=True) for _ in range(n)
]

return await asyncio.gather(*futures)

return generate_load_inner
2 changes: 2 additions & 0 deletions text-generation-inference/integration-tests/pytest.ini
@@ -0,0 +1,2 @@
[pytest]
asyncio_mode = auto
18 changes: 18 additions & 0 deletions text-generation-inference/integration-tests/requirements.txt
@@ -0,0 +1,18 @@
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
text-generation >= 0.6.0
pytest >= 7.4.0
pytest-asyncio >= 0.21.1
docker >= 6.1.3
Levenshtein
99 changes: 99 additions & 0 deletions text-generation-inference/integration-tests/test_gpt2.py
@@ -0,0 +1,99 @@
import os

import huggingface_hub
import Levenshtein
import pytest


MODEL_ID = "gpt2"
NEURON_MODEL_ID = "aws-neuron/gpt2-neuronx-bs4-seqlen1024"
BATCH_SIZE = 4
SEQUENCE_LENGTH = 1024
NUM_CORES = 2


@pytest.fixture(scope="module", params=["hub-neuron", "hub", "local-neuron"])
def model_name_or_path(request, data_volume):
if request.param == "hub":
os.environ["HF_BATCH_SIZE"] = str(BATCH_SIZE)
os.environ["HF_SEQUENCE_LENGTH"] = str(SEQUENCE_LENGTH)
os.environ["HF_NUM_CORES"] = str(NUM_CORES)
yield MODEL_ID
elif request.param == "hub-neuron":
yield NEURON_MODEL_ID
else:
model_dir = f"gpt2-neuron-{BATCH_SIZE}x{SEQUENCE_LENGTH}x{NUM_CORES}"
local_path = os.path.join(data_volume, model_dir)
huggingface_hub.snapshot_download(NEURON_MODEL_ID, local_dir=local_path)
# Return the path of the model inside the mounted volume
yield os.path.join("/data", model_dir)


@pytest.fixture(scope="module")
def tgi_service(launcher, model_name_or_path):
with launcher(model_name_or_path) as tgi_service:
yield tgi_service


@pytest.fixture(scope="module")
async def tgi_client(tgi_service):
await tgi_service.health(300)
return tgi_service.client


@pytest.mark.asyncio
async def test_model_single_request(tgi_client):

# Greedy bounded without input
response = await tgi_client.generate(
"What is Deep Learning?",
max_new_tokens=17,
decoder_input_details=True,
)
assert response.details.generated_tokens == 17
assert response.generated_text == "\n\nDeep learning is a new field of research that has been around for a while"

# Greedy bounded with input
response = await tgi_client.generate(
"What is Deep Learning?",
max_new_tokens=17,
return_full_text=True,
decoder_input_details=True,
)
assert response.details.generated_tokens == 17
assert (
response.generated_text
== "What is Deep Learning?\n\nDeep learning is a new field of research that has been around for a while"
)

# Sampling
response = await tgi_client.generate(
"What is Deep Learning?",
do_sample=True,
top_k=50,
top_p=0.9,
repetition_penalty=1.2,
max_new_tokens=1000,
seed=42,
decoder_input_details=True,
)
assert "The purpose of the current post is" in response.generated_text


@pytest.mark.asyncio
async def test_model_multiple_requests(tgi_client, generate_load):
num_requests = 4
responses = await generate_load(
tgi_client,
"What is Deep Learning?",
max_new_tokens=17,
n=num_requests,
)

assert len(responses) == 4
expected = "\n\nDeep learning is a new field of research that has been around for a while"
for r in responses:
assert r.details.generated_tokens == 17
# Compute the similarity with the expectation using the levenshtein distance
# We should not have more than two substitutions or additions
assert Levenshtein.distance(r.generated_text, expected) < 3
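
Outside of pytest, the same service can be exercised directly with the text_generation client against a running neuronx-tgi container; the port below is an assumption (the integration tests pick a random one). The seed argument exercises the generation parameter this commit adds support for:

import asyncio

from text_generation import AsyncClient


async def main() -> None:
    # Assumes a neuronx-tgi container is already serving on this port.
    client = AsyncClient("http://localhost:8080")
    response = await client.generate(
        "What is Deep Learning?",
        do_sample=True,
        top_k=50,
        top_p=0.9,
        max_new_tokens=64,
        seed=42,  # reproducible sampling, now honoured by the Neuron backend
    )
    print(response.generated_text)


asyncio.run(main())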
(The remaining changed files are not shown.)
