diff --git a/packages/llm/transformers/Dockerfile b/packages/llm/transformers/Dockerfile
index 4fa57fa61..7ecbfa3b3 100644
--- a/packages/llm/transformers/Dockerfile
+++ b/packages/llm/transformers/Dockerfile
@@ -3,14 +3,24 @@
 # config: config.py
 # group: llm
 # depends: [pytorch, torchvision, huggingface_hub]
-# test: test.py
+# test: [test.py, huggingface-benchmark.py]
+# docs: docs.md
 # notes: bitsandbytes dependency added on JetPack5 for 4-bit/8-bit quantization
 #---
 ARG BASE_IMAGE
 FROM ${BASE_IMAGE}
 
 # this is actually a simple install, however the tests are useful
-RUN pip3 install --no-cache-dir --verbose transformers
+RUN pip3 install --no-cache-dir --verbose transformers accelerate
 
 # this now gets set in the huggingface-hub package
 # ENV TRANSFORMERS_CACHE=/data/models/huggingface
+
+# add benchmark script
+COPY huggingface-benchmark.py /usr/local/bin
+
+# for benchmark timing
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends time \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
\ No newline at end of file
diff --git a/packages/llm/transformers/README.md b/packages/llm/transformers/README.md
index 7e91dc2fa..892c88c67 100644
--- a/packages/llm/transformers/README.md
+++ b/packages/llm/transformers/README.md
@@ -2,6 +2,38 @@
 
 > [`CONTAINERS`](#user-content-containers) [`IMAGES`](#user-content-images) [`RUN`](#user-content-run) [`BUILD`](#user-content-build)
 
+
+The HuggingFace [Transformers](https://huggingface.co/docs/transformers/index) library supports a wide variety of NLP and vision models with a convenient API that many of the other LLM packages have adopted.
+
+### Text Generation Benchmark
+
+Substitute the [text-generation model](https://huggingface.co/models?pipeline_tag=text-generation&sort=trending) from [HuggingFace Hub](https://huggingface.co/models?search=gptq) that you want to run (it should be a CausalLM model like GPT, Llama, etc.)
+
+```bash
+./run.sh $(./autotag exllama) huggingface-benchmark.py --model=gpt2
+```
+> If the model repository is private or requires authentication, add `--env HUGGINGFACE_TOKEN=<YOUR-TOKEN>`
+
+By default, the performance is measured for generating 128 new output tokens (this can be set with `--tokens=N`)
+
+#### Precision / Quantization
+
+You can change the precision used and enable quantization with the `--precision` argument (options are: `fp32` `fp16` `fp4` `int8`)
+
+The default is `fp16` - on JetPack 5, the [`bitsandbytes`](/packages/llm/bitsandbytes) package is included in the container to enable 4-bit/8-bit quantization through the Transformers API. It's expected that 4-bit/8-bit quantization is slower through Transformers than FP16 (while consuming less memory). Other libraries like [`exllama`](/packages/llm/exllama), [`awq`](/packages/llm/awq), and [`AutoGPTQ`](/packages/llm/auto-gptq) have custom CUDA kernels and more efficient quantized performance.
+
+#### Llama2
+
+* First request access from https://ai.meta.com/llama/
+* Then create a HuggingFace account, and request access to one of the Llama2 models there like https://huggingface.co/meta-llama/Llama-2-7b-hf (doing this will get you access to all the Llama2 models)
+* Get a User Access Token from https://huggingface.co/settings/tokens
+* In your terminal, run `export HUGGINGFACE_TOKEN=<YOUR-TOKEN>`
+
+```bash
+./run.sh --env HUGGINGFACE_TOKEN=$HUGGINGFACE_TOKEN $(./autotag exllama) \
+  huggingface-benchmark.py --model=meta-llama/Llama-2-7b-hf
+```
+
CONTAINERS
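
For reference, the `--precision` flag in the benchmark above boils down to a handful of `from_pretrained()` keyword arguments. The sketch below mirrors that logic outside the container; it assumes a CUDA-capable device with `transformers`, `accelerate`, and `bitsandbytes` installed, and uses `distilgpt2` purely as an example model.

```python
#!/usr/bin/env python3
# Sketch: how the --precision options map onto Transformers' from_pretrained() kwargs
# (mirrors the logic in huggingface-benchmark.py; int8/fp4 need bitsandbytes + accelerate)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

precision = 'fp16'        # one of: fp32, fp16, fp4, int8
model_name = 'distilgpt2' # example model, substitute any CausalLM checkpoint

kwargs = {}
if precision == 'int8':
    kwargs['load_in_8bit'] = True      # 8-bit weights via bitsandbytes
elif precision == 'fp4':
    kwargs['load_in_4bit'] = True      # 4-bit weights via bitsandbytes
elif precision == 'fp16':
    kwargs['torch_dtype'] = torch.float16
elif precision == 'fp32':
    kwargs['torch_dtype'] = torch.float32

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs)

if precision in ('fp32', 'fp16'):
    model = model.to('cuda')           # int8/fp4 loading already places the weights

inputs = tokenizer('Once upon a time,', return_tensors='pt').to(model.device)
outputs = model.generate(**inputs, do_sample=False, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

`load_in_8bit`/`load_in_4bit` hand device placement to `accelerate`, which is why the benchmark script only calls `.to(device)` on the fp32/fp16 paths.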
diff --git a/packages/llm/transformers/docs.md b/packages/llm/transformers/docs.md
new file mode 100644
index 000000000..8e0d4b40a
--- /dev/null
+++ b/packages/llm/transformers/docs.md
@@ -0,0 +1,31 @@
+
+The HuggingFace [Transformers](https://huggingface.co/docs/transformers/index) library supports a wide variety of NLP and vision models with a convenient API that many of the other LLM packages have adopted.
+
+### Text Generation Benchmark
+
+Substitute the [text-generation model](https://huggingface.co/models?pipeline_tag=text-generation&sort=trending) from [HuggingFace Hub](https://huggingface.co/models?search=gptq) that you want to run (it should be a CausalLM model like GPT, Llama, etc.)
+
+```bash
+./run.sh $(./autotag exllama) huggingface-benchmark.py --model=gpt2
+```
+> If the model repository is private or requires authentication, add `--env HUGGINGFACE_TOKEN=<YOUR-TOKEN>`
+
+By default, the performance is measured for generating 128 new output tokens (this can be set with `--tokens=N`)
+
+#### Precision / Quantization
+
+You can change the precision used and enable quantization with the `--precision` argument (options are: `fp32` `fp16` `fp4` `int8`)
+
+The default is `fp16` - on JetPack 5, the [`bitsandbytes`](/packages/llm/bitsandbytes) package is included in the container to enable 4-bit/8-bit quantization through the Transformers API. It's expected that 4-bit/8-bit quantization is slower through Transformers than FP16 (while consuming less memory). Other libraries like [`exllama`](/packages/llm/exllama), [`awq`](/packages/llm/awq), and [`AutoGPTQ`](/packages/llm/auto-gptq) have custom CUDA kernels and more efficient quantized performance.
+
+#### Llama2
+
+* First request access from https://ai.meta.com/llama/
+* Then create a HuggingFace account, and request access to one of the Llama2 models there like https://huggingface.co/meta-llama/Llama-2-7b-hf (doing this will get you access to all the Llama2 models)
+* Get a User Access Token from https://huggingface.co/settings/tokens
+* In your terminal, run `export HUGGINGFACE_TOKEN=<YOUR-TOKEN>`
+
+```bash
+./run.sh --env HUGGINGFACE_TOKEN=$HUGGINGFACE_TOKEN $(./autotag exllama) \
+  huggingface-benchmark.py --model=meta-llama/Llama-2-7b-hf
+```
diff --git a/packages/llm/transformers/huggingface-benchmark.py b/packages/llm/transformers/huggingface-benchmark.py
new file mode 100755
index 000000000..078b79fe8
--- /dev/null
+++ b/packages/llm/transformers/huggingface-benchmark.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+# benchmark a text-generation model (CausalLM) with huggingface transformers library
+import psutil
+
+mem_free = psutil.virtual_memory().available
+
+import os
+import time
+import datetime
+import argparse
+import resource
+import socket
+import pprint
+
+import torch
+import huggingface_hub
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument('--model', type=str, default='distilgpt2')
+parser.add_argument('--prompt', type=str, default='Once upon a time,')
+parser.add_argument('--precision', type=str, default='fp16', choices=['fp32', 'fp16', 'fp4', 'int8'])
+parser.add_argument('--tokens', type=int, nargs='+', default=[20], help='number of output tokens to generate (not including the input prompt)')
+parser.add_argument('--token', type=str, default=os.environ.get('HUGGINGFACE_TOKEN', ''), help="HuggingFace account login token from https://huggingface.co/docs/hub/security-tokens (defaults to $HUGGINGFACE_TOKEN)")
+parser.add_argument('--runs', type=int, default=5, help='the number of benchmark timing iterations')
+parser.add_argument('--warmup', type=int, default=2, help='the number of warmup iterations')
+parser.add_argument('--save', type=str, default='', help='CSV file to save benchmarking results to')
+
+args = parser.parse_args()
+print(args)
+
+# select compute device
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+print(f'Running on device {device}')
+
+# log into huggingface hub
+if args.token:
+    print("Logging into HuggingFace Hub...")
+    huggingface_hub.login(token=args.token)
+
+# detect the type of model it is
+model_info = huggingface_hub.model_info(args.model)
+model_type = model_info.transformersInfo['auto_model']
+
+if model_type != 'AutoModelForCausalLM':
+    raise ValueError(f"text-generation benchmark only supports CausalLM models (GPT, Llama, etc.) - {args.model} is {model_type}")
+
+# end the prompt with a newline
+#args.prompt += '\n'
+
+# create tokenizer
+tokenizer = AutoTokenizer.from_pretrained(args.model)
+input_ids = tokenizer(args.prompt, return_tensors="pt").input_ids.to(device)
+
+# setup precision args
+kwargs = {}
+
+if args.precision == 'int8':
+    kwargs['load_in_8bit'] = True
+    #kwargs['int8_threshold'] = 0   # https://github.com/TimDettmers/bitsandbytes/issues/6#issuecomment-1225990890
+elif args.precision == 'fp4':
+    kwargs['load_in_4bit'] = True
+elif args.precision == 'fp16':
+    kwargs['torch_dtype'] = torch.float16
+elif args.precision == 'fp32':
+    kwargs['torch_dtype'] = torch.float32
+
+# load model
+print(f'Loading model {args.model} ({args.precision})')
+
+model = AutoModelForCausalLM.from_pretrained(args.model, **kwargs)
+
+if args.precision == 'fp32' or args.precision == 'fp16':
+    model = model.to(device)   # int8/int4 already sets the device
+
+# run inference
+for num_tokens in args.tokens:
+    print(f"Generating {num_tokens} tokens with {args.model} {args.precision} on prompt: {args.prompt}")
+
+    time_avg = 0
+
+    for run in range(args.runs + args.warmup):
+        time_begin = time.perf_counter()
+        generated_ids = model.generate(input_ids, do_sample=False, min_new_tokens=num_tokens, max_new_tokens=num_tokens)   # greedy generation of a fixed number of new tokens
+        time_elapsed = (time.perf_counter() - time_begin)
+
+        print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
+
+        if run >= args.warmup:
+            time_avg += time_elapsed
+
+        print(f"\n{'WARMUP' if run < args.warmup else 'RUN'} {run} = {time_elapsed:.4f} seconds, {num_tokens/time_elapsed:.1f} tokens/sec ({args.precision})")
+
+    # compute statistics
+    time_avg /= args.runs
+    tokens_sec = num_tokens / time_avg
+    memory_usage = (mem_free - psutil.virtual_memory().available) / (1024**2)
+
+    print(f"AVG = {time_avg:.4f} seconds, {tokens_sec:.1f} tokens/sec  memory={memory_usage:.2f} MB  (--model={args.model} --precision={args.precision} --tokens={num_tokens})\n")
+
+    if args.save:
+        if not os.path.isfile(args.save):  # csv header
+            with open(args.save, 'w') as file:
+                file.write(f"timestamp, hostname, model, precision, tokens, tokens/sec, latency, memory\n")
+        with open(args.save, 'a') as file:
+            file.write(f"{datetime.datetime.now().strftime('%Y%m%d %H:%M:%S')}, {socket.gethostname()}, ")
+            file.write(f"{args.model}, {args.precision}, {num_tokens}, {tokens_sec}, {time_avg}, {memory_usage}\n")
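
The `--save` option appends one row per result using the header shown above (`timestamp, hostname, model, precision, tokens, tokens/sec, latency, memory`). A minimal way to aggregate several runs might look like the following sketch; `results.csv` is just an assumed filename.

```python
#!/usr/bin/env python3
# Sketch: summarize the CSV produced by huggingface-benchmark.py --save=results.csv
# (column names match the header the script writes; 'results.csv' is an example path)
import csv
from collections import defaultdict

rates = defaultdict(list)

with open('results.csv', newline='') as f:
    # skipinitialspace handles the ", "-separated values the script writes
    for row in csv.DictReader(f, skipinitialspace=True):
        key = (row['model'], row['precision'], row['tokens'])
        rates[key].append(float(row['tokens/sec']))

for (model, precision, tokens), values in rates.items():
    print(f"{model} {precision} tokens={tokens}: "
          f"{sum(values)/len(values):.1f} tokens/sec over {len(values)} entries")
```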
diff --git a/packages/llm/transformers/test.py b/packages/llm/transformers/test.py
index f225c23a3..04acf9e0c 100644
--- a/packages/llm/transformers/test.py
+++ b/packages/llm/transformers/test.py
@@ -1,93 +1,3 @@
-#!/usr/bin/env python3
-# benchmark a text-generation model with huggingface transformers library
-import os
-import time
-import datetime
-import argparse
-import resource
-import socket
-import torch
 import transformers
 print('transformers version:', transformers.__version__)
-
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-parser = argparse.ArgumentParser()
-
-parser.add_argument('--model', type=str, default='distilgpt2')
-parser.add_argument('--prompt', type=str, default='California is in which country?')
-parser.add_argument('--precision', type=str, default='fp16', choices=['fp32', 'fp16', 'fp4', 'int8'])
-parser.add_argument('--tokens', type=int, nargs='+', default=[20], help='number of output tokens to generate, including the input prompt')
-parser.add_argument('--runs', type=int, default=5, help='the number of benchmark timing iterations')
-parser.add_argument('--warmup', type=int, default=1, help='the number of warmup iterations')
-parser.add_argument('--save', type=str, default='', help='CSV file to save benchmarking results to')
-
-args = parser.parse_args()
-print(args)
-
-# select compute device
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-print(f'Running on device {device}')
-
-# end the prompt with a newline
-args.prompt += '\n'
-
-# create tokenizer
-tokenizer = AutoTokenizer.from_pretrained(args.model)
-input_ids = tokenizer(args.prompt, return_tensors="pt").input_ids.to(device)
-
-# setup precision args
-kwargs = {}
-
-if args.precision == 'int8':
-    kwargs['load_in_8bit'] = True
-    #kwargs['int8_threshold'] = 0   # https://github.com/TimDettmers/bitsandbytes/issues/6#issuecomment-1225990890
-elif args.precision == 'fp4':
-    kwargs['load_in_4bit'] = True
-elif args.precision == 'fp16':
-    kwargs['torch_dtype'] = torch.float16
-elif args.precision == 'fp32':
-    kwargs['torch_dtype'] = torch.float32
-
-# load model
-print(f'Loading model {args.model}')
-
-model = AutoModelForCausalLM.from_pretrained(args.model, **kwargs)
-
-if args.precision == 'fp32' or args.precision == 'fp16':
-    model = model.to(device)   # int8/int4 already sets the device
-
-# run inference
-for num_tokens in args.tokens:
-    print(f"Generating {num_tokens} tokens with {args.model} {args.precision} on prompt: {args.prompt}")
-
-    time_avg = 0
-
-    for run in range(args.runs + args.warmup):
-        time_begin = time.perf_counter()
-        generated_ids = model.generate(input_ids, do_sample=False, min_length=num_tokens, max_length=num_tokens)   # greedy generation of fixed # of tokens   #max_new_tokens=args.max_new_tokens
-        time_elapsed = (time.perf_counter() - time_begin)
-
-        print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
-
-        if run >= args.warmup:
-            time_avg += time_elapsed
-
-        print(f"\n{'WARMUP' if run < args.warmup else 'RUN'} {run} = {time_elapsed:.4f} seconds, {num_tokens/time_elapsed:.1f} tokens/sec ({args.precision})")
-
-    # compute statistics
-    time_avg /= args.runs
-    tokens_sec = num_tokens / time_avg
-    memory_usage = (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss) / 1024   # https://stackoverflow.com/a/7669482
-    memory_info_gpu = torch.cuda.mem_get_info()
-
-    print(f"AVG = {time_avg:.4f} seconds, {tokens_sec:.1f} tokens/sec  memory={memory_usage:.2f} MB  (--model={args.model} --precision={args.precision} --tokens={num_tokens})\n")
-
-    if args.save:
-        if not os.path.isfile(args.save):  # csv header
-            with open(args.save, 'w') as file:
-                file.write(f"timestamp, hostname, model, precision, tokens, tokens/sec, latency, memory\n")
-        with open(args.save, 'a') as file:
-            file.write(f"{datetime.datetime.now().strftime('%Y%m%d %H:%M:%S')}, {socket.gethostname()}, ")
-            file.write(f"{args.model}, {args.precision}, {num_tokens}, {tokens_sec}, {time_avg}, {memory_usage}\n")
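
With the benchmark split out into `huggingface-benchmark.py`, the trimmed `test.py` only verifies that `transformers` imports and reports its version. If a heavier smoke test were ever wanted, one option is a short `pipeline()` round trip; this sketch is not part of the diff, and `distilgpt2` (the benchmark's default model) is used only as a small example that downloads quickly.

```python
#!/usr/bin/env python3
# Sketch: an optional, slightly richer smoke test than the trimmed test.py,
# exercising an actual text-generation pass (downloads distilgpt2 on first run)
import transformers
from transformers import pipeline

print('transformers version:', transformers.__version__)

generator = pipeline('text-generation', model='distilgpt2')
output = generator('Once upon a time,', max_new_tokens=20, do_sample=False)
print(output[0]['generated_text'])
```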