forked from dusty-nv/jetson-containers
Commit: added huggingface-benchmark.py to transformers

Showing 5 changed files with 184 additions and 92 deletions.
@@ -0,0 +1,31 @@
The HuggingFace [Transformers](https://huggingface.co/docs/transformers/index) library supports a wide variety of NLP and vision models with a convenient API that many of the other LLM packages have adopted.

### Text Generation Benchmark

Substitute the [text-generation model](https://huggingface.co/models?pipeline_tag=text-generation&sort=trending) from [HuggingFace Hub](https://huggingface.co/models?search=gptq) that you want to run (it should be a CausalLM model like GPT, Llama, etc.)

```bash
./run.sh $(./autotag exllama) huggingface-benchmark.py --model=gpt2
```

> If the model repository is private or requires authentication, add `--env HUGGINGFACE_TOKEN=<YOUR-ACCESS-TOKEN>`

By default, the performance is measured for generating 128 new output tokens (this can be set with `--tokens=N`).
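The `--tokens` flag also accepts multiple values, so several output lengths can be swept in one run; a quick sketch (the token counts here are arbitrary):

```bash
# measure generation at several output lengths in a single run
./run.sh $(./autotag exllama) huggingface-benchmark.py --model=gpt2 --tokens 64 128 256
```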
#### Precision / Quantization

You can change the precision used and enable quantization with the `--precision` argument (options are: `fp32` `fp16` `fp4` `int8`)

The default is `fp16` - on JetPack 5, the [`bitsandbytes`](/packages/llm/bitsandbytes) package is included in the container to enable 4-bit/8-bit quantization through the Transformers API. It's expected that 4-bit/8-bit quantization is slower through Transformers than FP16 (while consuming less memory). Other libraries like [`exllama`](/packages/llm/exllama), [`awq`](/packages/llm/awq), and [`AutoGPTQ`](/packages/llm/auto-gptq) have custom CUDA kernels and more efficient quantized performance.
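For example, to benchmark the same GPT-2 model with 8-bit weights instead of the default FP16 (a sketch using only the flags documented above):

```bash
# benchmark with 8-bit quantization via bitsandbytes
./run.sh $(./autotag exllama) huggingface-benchmark.py --model=gpt2 --precision=int8
```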
#### Llama2

* First, request access from https://ai.meta.com/llama/
* Then create a HuggingFace account and request access to one of the Llama2 models there, like https://huggingface.co/meta-llama/Llama-2-7b-hf (doing this will grant you access to all of the Llama2 models)
* Get a User Access Token from https://huggingface.co/settings/tokens
* In your terminal, run `export HUGGINGFACE_TOKEN=<COPY-TOKEN-HERE>`

```bash
./run.sh --env HUGGINGFACE_TOKEN=$HUGGINGFACE_TOKEN $(./autotag exllama) \
  huggingface-benchmark.py --model=meta-llama/Llama-2-7b-hf
```
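For the larger Llama2 checkpoints, it may help to combine this with the `--precision` flag from the previous section to reduce memory usage; a hedged sketch reusing the same flags:

```bash
# run the Llama2 benchmark with 8-bit weights to lower memory consumption
./run.sh --env HUGGINGFACE_TOKEN=$HUGGINGFACE_TOKEN $(./autotag exllama) \
  huggingface-benchmark.py --model=meta-llama/Llama-2-7b-hf --precision=int8
```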
huggingface-benchmark.py
@@ -0,0 +1,109 @@
```python
#!/usr/bin/env python3
# benchmark a text-generation model (CausalLM) with the huggingface transformers library
import psutil

# sample available memory before the heavy imports load, so model memory usage can be measured later
mem_free = psutil.virtual_memory().available

import os
import time
import datetime
import argparse
import resource
import socket
import pprint

import torch
import huggingface_hub

from transformers import AutoModelForCausalLM, AutoTokenizer

parser = argparse.ArgumentParser()

parser.add_argument('--model', type=str, default='distilgpt2')
parser.add_argument('--prompt', type=str, default='Once upon a time,')
parser.add_argument('--precision', type=str, default='fp16', choices=['fp32', 'fp16', 'fp4', 'int8'])
parser.add_argument('--tokens', type=int, nargs='+', default=[20], help='number of output tokens to generate (not including the input prompt)')
parser.add_argument('--token', type=str, default=os.environ.get('HUGGINGFACE_TOKEN', ''), help="HuggingFace account login token from https://huggingface.co/docs/hub/security-tokens (defaults to $HUGGINGFACE_TOKEN)")
parser.add_argument('--runs', type=int, default=5, help='the number of benchmark timing iterations')
parser.add_argument('--warmup', type=int, default=2, help='the number of warmup iterations')
parser.add_argument('--save', type=str, default='', help='CSV file to save benchmarking results to')

args = parser.parse_args()
print(args)

# select compute device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f'Running on device {device}')

# log into huggingface hub
if args.token:
    print("Logging into HuggingFace Hub...")
    huggingface_hub.login(token=args.token)

# detect the type of model it is
model_info = huggingface_hub.model_info(args.model)
model_type = model_info.transformersInfo['auto_model']

if model_type != 'AutoModelForCausalLM':
    raise ValueError(f"text-generation benchmark only supports CausalLM models (GPT, Llama, etc.) - {args.model} is {model_type}")

# (optionally end the prompt with a newline)
#args.prompt += '\n'

# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(args.model)
input_ids = tokenizer(args.prompt, return_tensors="pt").input_ids.to(device)

# setup precision args
kwargs = {}

if args.precision == 'int8':
    kwargs['load_in_8bit'] = True
    #kwargs['int8_threshold'] = 0  # https://github.com/TimDettmers/bitsandbytes/issues/6#issuecomment-1225990890
elif args.precision == 'fp4':
    kwargs['load_in_4bit'] = True
elif args.precision == 'fp16':
    kwargs['torch_dtype'] = torch.float16
elif args.precision == 'fp32':
    kwargs['torch_dtype'] = torch.float32

# load model
print(f'Loading model {args.model} ({args.precision})')

model = AutoModelForCausalLM.from_pretrained(args.model, **kwargs)

if args.precision == 'fp32' or args.precision == 'fp16':
    model = model.to(device)  # int8/int4 already sets the device

# run inference
for num_tokens in args.tokens:
    print(f"Generating {num_tokens} tokens with {args.model} {args.precision} on prompt: {args.prompt}")

    time_avg = 0

    for run in range(args.runs + args.warmup):
        time_begin = time.perf_counter()
        generated_ids = model.generate(input_ids, do_sample=False, min_new_tokens=num_tokens, max_new_tokens=num_tokens)  # greedy generation of a fixed number of new tokens
        time_elapsed = (time.perf_counter() - time_begin)

        print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

        if run >= args.warmup:
            time_avg += time_elapsed

        print(f"\n{'WARMUP' if run < args.warmup else 'RUN'} {run} = {time_elapsed:.4f} seconds, {num_tokens/time_elapsed:.1f} tokens/sec ({args.precision})")

    # compute statistics
    time_avg /= args.runs
    tokens_sec = num_tokens / time_avg
    memory_usage = (mem_free - psutil.virtual_memory().available) / (1024**2)

    print(f"AVG = {time_avg:.4f} seconds, {tokens_sec:.1f} tokens/sec memory={memory_usage:.2f} MB (--model={args.model} --precision={args.precision} --tokens={num_tokens})\n")

    if args.save:
        if not os.path.isfile(args.save):  # csv header
            with open(args.save, 'w') as file:
                file.write(f"timestamp, hostname, model, precision, tokens, tokens/sec, latency, memory\n")
        with open(args.save, 'a') as file:
            file.write(f"{datetime.datetime.now().strftime('%Y%m%d %H:%M:%S')}, {socket.gethostname()}, ")
            file.write(f"{args.model}, {args.precision}, {num_tokens}, {tokens_sec}, {time_avg}, {memory_usage}\n")
```
@@ -1,93 +1,3 @@
```python
#!/usr/bin/env python3
# benchmark a text-generation model with huggingface transformers library
import os
import time
import datetime
import argparse
import resource
import socket
import torch

import transformers
print('transformers version:', transformers.__version__)

from transformers import AutoModelForCausalLM, AutoTokenizer

parser = argparse.ArgumentParser()

parser.add_argument('--model', type=str, default='distilgpt2')
parser.add_argument('--prompt', type=str, default='California is in which country?')
parser.add_argument('--precision', type=str, default='fp16', choices=['fp32', 'fp16', 'fp4', 'int8'])
parser.add_argument('--tokens', type=int, nargs='+', default=[20], help='number of output tokens to generate, including the input prompt')
parser.add_argument('--runs', type=int, default=5, help='the number of benchmark timing iterations')
parser.add_argument('--warmup', type=int, default=1, help='the number of warmup iterations')
parser.add_argument('--save', type=str, default='', help='CSV file to save benchmarking results to')

args = parser.parse_args()
print(args)

# select compute device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f'Running on device {device}')

# end the prompt with a newline
args.prompt += '\n'

# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(args.model)
input_ids = tokenizer(args.prompt, return_tensors="pt").input_ids.to(device)

# setup precision args
kwargs = {}

if args.precision == 'int8':
    kwargs['load_in_8bit'] = True
    #kwargs['int8_threshold'] = 0  # https://github.com/TimDettmers/bitsandbytes/issues/6#issuecomment-1225990890
elif args.precision == 'fp4':
    kwargs['load_in_4bit'] = True
elif args.precision == 'fp16':
    kwargs['torch_dtype'] = torch.float16
elif args.precision == 'fp32':
    kwargs['torch_dtype'] = torch.float32

# load model
print(f'Loading model {args.model}')

model = AutoModelForCausalLM.from_pretrained(args.model, **kwargs)

if args.precision == 'fp32' or args.precision == 'fp16':
    model = model.to(device)  # int8/int4 already sets the device

# run inference
for num_tokens in args.tokens:
    print(f"Generating {num_tokens} tokens with {args.model} {args.precision} on prompt: {args.prompt}")

    time_avg = 0

    for run in range(args.runs + args.warmup):
        time_begin = time.perf_counter()
        generated_ids = model.generate(input_ids, do_sample=False, min_length=num_tokens, max_length=num_tokens)  # greedy generation of a fixed total number of tokens
        time_elapsed = (time.perf_counter() - time_begin)

        print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

        if run >= args.warmup:
            time_avg += time_elapsed

        print(f"\n{'WARMUP' if run < args.warmup else 'RUN'} {run} = {time_elapsed:.4f} seconds, {num_tokens/time_elapsed:.1f} tokens/sec ({args.precision})")

    # compute statistics
    time_avg /= args.runs
    tokens_sec = num_tokens / time_avg
    memory_usage = (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss) / 1024  # https://stackoverflow.com/a/7669482
    memory_info_gpu = torch.cuda.mem_get_info()

    print(f"AVG = {time_avg:.4f} seconds, {tokens_sec:.1f} tokens/sec memory={memory_usage:.2f} MB (--model={args.model} --precision={args.precision} --tokens={num_tokens})\n")

    if args.save:
        if not os.path.isfile(args.save):  # csv header
            with open(args.save, 'w') as file:
                file.write(f"timestamp, hostname, model, precision, tokens, tokens/sec, latency, memory\n")
        with open(args.save, 'a') as file:
            file.write(f"{datetime.datetime.now().strftime('%Y%m%d %H:%M:%S')}, {socket.gethostname()}, ")
            file.write(f"{args.model}, {args.precision}, {num_tokens}, {tokens_sec}, {time_avg}, {memory_usage}\n")
```