
Commit

added huggingface-benchmark.py to transformers
dusty-nv committed Aug 7, 2023
1 parent da34583 commit ba4bd5c
Showing 5 changed files with 184 additions and 92 deletions.
14 changes: 12 additions & 2 deletions packages/llm/transformers/Dockerfile
@@ -3,14 +3,24 @@
# config: config.py
# group: llm
# depends: [pytorch, torchvision, huggingface_hub]
# test: test.py
# test: [test.py, huggingface-benchmark.py]
# docs: docs.md
# notes: bitsandbytes dependency added on JetPack5 for 4-bit/8-bit quantization
#---
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

# this is actually a simple install, however the tests are useful
RUN pip3 install --no-cache-dir --verbose transformers
RUN pip3 install --no-cache-dir --verbose transformers accelerate

# this now gets set in the huggingface-hub package
# ENV TRANSFORMERS_CACHE=/data/models/huggingface

# add benchmark script
COPY huggingface-benchmark.py /usr/local/bin

# for benchmark timing
RUN apt-get update && \
apt-get install -y --no-install-recommends time \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
32 changes: 32 additions & 0 deletions packages/llm/transformers/README.md
@@ -2,6 +2,38 @@

> [`CONTAINERS`](#user-content-containers) [`IMAGES`](#user-content-images) [`RUN`](#user-content-run) [`BUILD`](#user-content-build)

The HuggingFace [Transformers](https://huggingface.co/docs/transformers/index) library supports a wide variety of NLP and vision models with a convenient API that many of the other LLM packages have adopted.
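
To get a feel for that API (a minimal sketch, not one of this container's test scripts), a text-generation pipeline can be run in a few lines; the model and prompt below are just the benchmark script's defaults used as placeholders:

```python
#!/usr/bin/env python3
# minimal transformers pipeline example (run inside the container)
from transformers import pipeline

pipe = pipeline('text-generation', model='distilgpt2', device=0)  # device=0 selects the first GPU
print(pipe('Once upon a time,', max_new_tokens=20)[0]['generated_text'])
```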

### Text Generation Benchmark

Substitute the [text-generation model](https://huggingface.co/models?pipeline_tag=text-generation&sort=trending) from [HuggingFace Hub](https://huggingface.co/models?search=gptq) that you want to run (it should be a CausalLM model like GPT, Llama, etc.):

```bash
./run.sh $(./autotag exllama) huggingface-benchmark.py --model=gpt2
```
> If the model repository is private or requires authentication, add `--env HUGGINGFACE_TOKEN=<YOUR-ACCESS-TOKEN>`

By default, the performance is measured for generating 128 new output tokens (this can be set with `--tokens=N`).

#### Precision / Quantization

You can change the precision and enable quantization with the `--precision` argument (options are `fp32`, `fp16`, `fp4`, and `int8`).

The default is `fp16`. On JetPack 5, the [`bitsandbytes`](/packages/llm/bitsandbytes) package is included in the container to enable 4-bit/8-bit quantization through the Transformers API. Expect 4-bit/8-bit quantization through Transformers to be slower than FP16 (while consuming less memory); other libraries like [`exllama`](/packages/llm/exllama), [`awq`](/packages/llm/awq), and [`AutoGPTQ`](/packages/llm/auto-gptq) have custom CUDA kernels and more efficient quantized performance.
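
For example, to benchmark `gpt2` with 8-bit quantization over 128 output tokens (the same container invocation shown above, just with extra flags):

```bash
./run.sh $(./autotag exllama) huggingface-benchmark.py \
    --model=gpt2 --precision=int8 --tokens=128
```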

#### Llama2

* First request access from https://ai.meta.com/llama/
* Then create a HuggingFace account, and request access to one of the Llama2 models there like https://huggingface.co/meta-llama/Llama-2-7b-hf (doing this will get you access to all the Llama2 models)
* Get a User Access Token from https://huggingface.co/settings/tokens
* In your terminal, run `export HUGGINGFACE_TOKEN=<COPY-TOKEN-HERE>`

```bash
./run.sh --env HUGGINGFACE_TOKEN=$HUGGINGFACE_TOKEN $(./autotag exllama) \
huggingface-benchmark.py --model=meta-llama/Llama-2-7b-hf
```

<details open>
<summary><b><a id="containers">CONTAINERS</a></b></summary>
<br>
31 changes: 31 additions & 0 deletions packages/llm/transformers/docs.md
@@ -0,0 +1,31 @@

The HuggingFace [Transformers](https://huggingface.co/docs/transformers/index) library supports a wide variety of NLP and vision models with a convenient API that many of the other LLM packages have adopted.

### Text Generation Benchmark

Substitute the [text-generation model](https://huggingface.co/models?pipeline_tag=text-generation&sort=trending) from [HuggingFace Hub](https://huggingface.co/models?search=gptq) that you want to run (it should be a CausalLM model like GPT, Llama, etc.):

```bash
./run.sh $(./autotag exllama) huggingface-benchmark.py --model=gpt2
```
> If the model repository is private or requires authentication, add `--env HUGGINGFACE_TOKEN=<YOUR-ACCESS-TOKEN>`

By default, the performance is measured for generating 128 new output tokens (this can be set with `--tokens=N`).
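
Since `--tokens` accepts multiple values, several output lengths can be benchmarked in a single run, for example:

```bash
./run.sh $(./autotag exllama) huggingface-benchmark.py --model=gpt2 --tokens 64 128 256
```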

#### Precision / Quantization

You can change the precision and enable quantization with the `--precision` argument (options are `fp32`, `fp16`, `fp4`, and `int8`).

The default is `fp16`. On JetPack 5, the [`bitsandbytes`](/packages/llm/bitsandbytes) package is included in the container to enable 4-bit/8-bit quantization through the Transformers API. Expect 4-bit/8-bit quantization through Transformers to be slower than FP16 (while consuming less memory); other libraries like [`exllama`](/packages/llm/exllama), [`awq`](/packages/llm/awq), and [`AutoGPTQ`](/packages/llm/auto-gptq) have custom CUDA kernels and more efficient quantized performance.

#### Llama2

* First request access from https://ai.meta.com/llama/
* Then create a HuggingFace account, and request access to one of the Llama2 models there like https://huggingface.co/meta-llama/Llama-2-7b-hf (doing this will get you access to all the Llama2 models)
* Get a User Access Token from https://huggingface.co/settings/tokens
* In your terminal, run `export HUGGINGFACE_TOKEN=<COPY-TOKEN-HERE>`

```bash
./run.sh --env HUGGINGFACE_TOKEN=$HUGGINGFACE_TOKEN $(./autotag exllama) \
huggingface-benchmark.py --model=meta-llama/Llama-2-7b-hf
```
109 changes: 109 additions & 0 deletions packages/llm/transformers/huggingface-benchmark.py
@@ -0,0 +1,109 @@
#!/usr/bin/env python3
# benchmark a text-generation model (CausalLM) with huggingface transformers library
import psutil

mem_free = psutil.virtual_memory().available

import os
import time
import datetime
import argparse
import resource
import socket
import pprint

import torch
import huggingface_hub

from transformers import AutoModelForCausalLM, AutoTokenizer

parser = argparse.ArgumentParser()

parser.add_argument('--model', type=str, default='distilgpt2')
parser.add_argument('--prompt', type=str, default='Once upon a time,')
parser.add_argument('--precision', type=str, default='fp16', choices=['fp32', 'fp16', 'fp4', 'int8'])
parser.add_argument('--tokens', type=int, nargs='+', default=[20], help='number of output tokens to generate (not including the input prompt)')
parser.add_argument('--token', type=str, default=os.environ.get('HUGGINGFACE_TOKEN', ''), help="HuggingFace account login token from https://huggingface.co/docs/hub/security-tokens (defaults to $HUGGINGFACE_TOKEN)")
parser.add_argument('--runs', type=int, default=5, help='the number of benchmark timing iterations')
parser.add_argument('--warmup', type=int, default=2, help='the number of warmup iterations')
parser.add_argument('--save', type=str, default='', help='CSV file to save benchmarking results to')

args = parser.parse_args()
print(args)

# select compute device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f'Running on device {device}')

# log into huggingface hub
if args.token:
    print("Logging into HuggingFace Hub...")
    huggingface_hub.login(token=args.token)

# detect the type of model it is
model_info = huggingface_hub.model_info(args.model)
model_type = model_info.transformersInfo['auto_model']

if model_type != 'AutoModelForCausalLM':
    raise ValueError(f"text-generation benchmark only supports CausalLM models (GPT, Llama, etc.) - {args.model} is {model_type}")

# end the prompt with a newline
#args.prompt += '\n'

# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(args.model)
input_ids = tokenizer(args.prompt, return_tensors="pt").input_ids.to(device)

# setup precision args
kwargs = {}

if args.precision == 'int8':
    kwargs['load_in_8bit'] = True
    #kwargs['int8_threshold'] = 0  # https://github.com/TimDettmers/bitsandbytes/issues/6#issuecomment-1225990890
elif args.precision == 'fp4':
    kwargs['load_in_4bit'] = True
elif args.precision == 'fp16':
    kwargs['torch_dtype'] = torch.float16
elif args.precision == 'fp32':
    kwargs['torch_dtype'] = torch.float32

# load model
print(f'Loading model {args.model} ({args.precision})')

model = AutoModelForCausalLM.from_pretrained(args.model, **kwargs)

if args.precision == 'fp32' or args.precision == 'fp16':
    model = model.to(device)   # int8/int4 already sets the device

# run inference
for num_tokens in args.tokens:
    print(f"Generating {num_tokens} tokens with {args.model} {args.precision} on prompt: {args.prompt}")

    time_avg = 0

    for run in range(args.runs + args.warmup):
        time_begin = time.perf_counter()
        generated_ids = model.generate(input_ids, do_sample=False, min_new_tokens=num_tokens, max_new_tokens=num_tokens)  # greedy generation of a fixed number of new tokens
        time_elapsed = (time.perf_counter() - time_begin)

        print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

        if run >= args.warmup:
            time_avg += time_elapsed

        print(f"\n{'WARMUP' if run < args.warmup else 'RUN'} {run} = {time_elapsed:.4f} seconds, {num_tokens/time_elapsed:.1f} tokens/sec ({args.precision})")

    # compute statistics
    time_avg /= args.runs
    tokens_sec = num_tokens / time_avg
    memory_usage = (mem_free - psutil.virtual_memory().available) / (1024**2)

    print(f"AVG = {time_avg:.4f} seconds, {tokens_sec:.1f} tokens/sec memory={memory_usage:.2f} MB (--model={args.model} --precision={args.precision} --tokens={num_tokens})\n")

    if args.save:
        if not os.path.isfile(args.save):  # write the CSV header on first use
            with open(args.save, 'w') as file:
                file.write(f"timestamp, hostname, model, precision, tokens, tokens/sec, latency, memory\n")
        with open(args.save, 'a') as file:
            file.write(f"{datetime.datetime.now().strftime('%Y%m%d %H:%M:%S')}, {socket.gethostname()}, ")
            file.write(f"{args.model}, {args.precision}, {num_tokens}, {tokens_sec}, {time_avg}, {memory_usage}\n")
90 changes: 0 additions & 90 deletions packages/llm/transformers/test.py
@@ -1,93 +1,3 @@
#!/usr/bin/env python3
# benchmark a text-generation model with huggingface transformers library
import os
import time
import datetime
import argparse
import resource
import socket
import torch

import transformers
print('transformers version:', transformers.__version__)

from transformers import AutoModelForCausalLM, AutoTokenizer

parser = argparse.ArgumentParser()

parser.add_argument('--model', type=str, default='distilgpt2')
parser.add_argument('--prompt', type=str, default='California is in which country?')
parser.add_argument('--precision', type=str, default='fp16', choices=['fp32', 'fp16', 'fp4', 'int8'])
parser.add_argument('--tokens', type=int, nargs='+', default=[20], help='number of output tokens to generate, including the input prompt')
parser.add_argument('--runs', type=int, default=5, help='the number of benchmark timing iterations')
parser.add_argument('--warmup', type=int, default=1, help='the number of warmup iterations')
parser.add_argument('--save', type=str, default='', help='CSV file to save benchmarking results to')

args = parser.parse_args()
print(args)

# select compute device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f'Running on device {device}')

# end the prompt with a newline
args.prompt += '\n'

# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(args.model)
input_ids = tokenizer(args.prompt, return_tensors="pt").input_ids.to(device)

# setup precision args
kwargs = {}

if args.precision == 'int8':
    kwargs['load_in_8bit'] = True
    #kwargs['int8_threshold'] = 0  # https://github.com/TimDettmers/bitsandbytes/issues/6#issuecomment-1225990890
elif args.precision == 'fp4':
    kwargs['load_in_4bit'] = True
elif args.precision == 'fp16':
    kwargs['torch_dtype'] = torch.float16
elif args.precision == 'fp32':
    kwargs['torch_dtype'] = torch.float32

# load model
print(f'Loading model {args.model}')

model = AutoModelForCausalLM.from_pretrained(args.model, **kwargs)

if args.precision == 'fp32' or args.precision == 'fp16':
    model = model.to(device)   # int8/int4 already sets the device

# run inference
for num_tokens in args.tokens:
    print(f"Generating {num_tokens} tokens with {args.model} {args.precision} on prompt: {args.prompt}")

    time_avg = 0

    for run in range(args.runs + args.warmup):
        time_begin = time.perf_counter()
        generated_ids = model.generate(input_ids, do_sample=False, min_length=num_tokens, max_length=num_tokens)  # greedy generation of fixed # of tokens
        time_elapsed = (time.perf_counter() - time_begin)

        print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

        if run >= args.warmup:
            time_avg += time_elapsed

        print(f"\n{'WARMUP' if run < args.warmup else 'RUN'} {run} = {time_elapsed:.4f} seconds, {num_tokens/time_elapsed:.1f} tokens/sec ({args.precision})")

    # compute statistics
    time_avg /= args.runs
    tokens_sec = num_tokens / time_avg
    memory_usage = (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss) / 1024  # https://stackoverflow.com/a/7669482
    memory_info_gpu = torch.cuda.mem_get_info()

    print(f"AVG = {time_avg:.4f} seconds, {tokens_sec:.1f} tokens/sec memory={memory_usage:.2f} MB (--model={args.model} --precision={args.precision} --tokens={num_tokens})\n")

    if args.save:
        if not os.path.isfile(args.save):  # csv header
            with open(args.save, 'w') as file:
                file.write(f"timestamp, hostname, model, precision, tokens, tokens/sec, latency, memory\n")
        with open(args.save, 'a') as file:
            file.write(f"{datetime.datetime.now().strftime('%Y%m%d %H:%M:%S')}, {socket.gethostname()}, ")
            file.write(f"{args.model}, {args.precision}, {num_tokens}, {tokens_sec}, {time_avg}, {memory_usage}\n")
