diff --git a/.gitignore b/.gitignore
index dec8915..e6214f3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,9 @@ __pycache__
*.pth
test.py
test.ipynb
-experiment
\ No newline at end of file
+experiment
+analysis
+output
+rebuttal/
+*quant_cuda_kernel_*
+demo*
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
deleted file mode 100644
index 261eeb9..0000000
--- a/LICENSE
+++ /dev/null
@@ -1,201 +0,0 @@
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
diff --git a/README.md b/README.md
index efe2188..fbb5885 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,23 @@
-# OWQ: Lessons learned from activation outliers for weight quantization in large language models
+# [AAAI 2024 (Oral)] OWQ: Outlier-Aware Weight Quantization for Efficient Fine-Tuning and Inference of Large Language Models
-This is the code for the paper [OWQ: Lessons learned from activation outliers for weight quantization in large language models](https://arxiv.org/abs/2306.02272). OWQ preserves few weak columns as FP16, while quantizing other weights to 3/4-bits. OWQ achieves substantial quality improvements with only negligible storage and computation overhead, effectively preserving the benefits of low-precision acceleration.
+
+
+
+This is the code for the paper [OWQ: Outlier-Aware Weight Quantization for Efficient Fine-Tuning and Inference of Large Language Models](https://arxiv.org/abs/2306.02272). OWQ preserves a few weak columns as FP16 while quantizing the other weights to 3/4 bits. OWQ achieves substantial quality improvements with only negligible storage and computation overhead, effectively preserving the benefits of low-precision acceleration.
+
+
+
+
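For intuition, here is a minimal sketch of the idea (our illustration with hypothetical names, not the repository's actual kernels): most weight columns are dequantized from their 3/4-bit codes, while the few preserved weak columns stay in FP16.

```python
import torch

def owq_linear(x, w_int, scale, zero, out_cols, w_out_fp16):
    # Dequantize the low-bit weight codes (per-output-channel scale/zero-point).
    w = (w_int.float() - zero) * scale
    # Restore the few "weak" columns that were kept in FP16.
    w[:, out_cols] = w_out_fp16
    return x @ w.t()

# Tiny demo: 8 output channels, 16 input channels, 2 FP16 columns.
w_int = torch.randint(0, 8, (8, 16))                 # 3-bit codes in [0, 7]
scale, zero = torch.rand(8, 1) * 0.1, torch.full((8, 1), 3.5)
out_cols = torch.tensor([3, 11])
y = owq_linear(torch.randn(4, 16), w_int, scale, zero, out_cols, torch.randn(8, 2))
```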
-## Updates (2024-01-22)
+## Updates (2024-01-29)
* Integrated all models (OPT, LLaMA, BLOOM, Falcon) into `main.py` file. You can easily add custom or open-accessible huggingface models to `model_config.json` if you want.
* Support 4bit matrix - FP16 vector product CUDA kernel.
* Support BFloat16.
## Features
* Implementation of the OWQ algorithm: `owq/recon.py`, `main.py`
-* 3/4-bit weight quantization of LLMs (OPT, LLaMA1,2 families and etc..): `main.py`
+* 3/4-bit weight quantization of LLMs (OPT, LLaMA-1/2 families, etc.): `main.py`
* Evaluating the perplexity of quantized models: `main.py`
* Evaluating the zero-shot accuracy of quantized models: `zeroshot.py`
* Supports 3/4-bit packed weight save / load (~1/5 and ~1/4 the file size of the FP16 checkpoint, respectively)
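The packed file-size claim above is simple bit arithmetic; a quick sanity check (illustrative only, ignoring scales, zero-points, and the preserved FP16 columns):

```python
fp16_bits = 16
for wbits in (3, 4):
    print(f"{wbits}-bit packed weights ≈ {wbits / fp16_bits:.3f}x FP16 size")
# 3-bit -> 0.188x (~1/5 once quantization metadata is included)
# 4-bit -> 0.250x (= 1/4)
```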
@@ -21,7 +28,7 @@ This is the code for the paper [OWQ: Lessons learned from activation outliers fo
* [Install](#install)
* [Usage](#usage)
* [Zero-shot](#zero-shot)
-* [3-bit CUDA kernel](#3-bit-cuda-kernels)
+* [3/4-bit CUDA kernels](#34-bit-cuda-kernels)
## Install
We highly recommend using a docker image that supports CUDA. If you use anaconda instead, you need to set up CUDA for kernel use.
@@ -67,43 +74,43 @@ We have tested 3/4-bit CUDA kernel on the NVIDIA A100, A6000 and RTX3090 GPU.
### Running OWQ & measuring the perplexity (PPL)
-Here we use OPT-1.3b model as an example. You can replace the model argument `opt-1.3b` among `opt-125m`, `opt-350m`, `opt-2.7b`, `opt-6.7b`, `opt-13b`, `opt-66b` or other models (e.g. `meta-llama/Llama-2-7b-hf`).
+Here we use the llama-7b model (huggyllama/llama-7b) as an example. You can replace the model argument `llama-7b` with `llama-13b`, `llama-30b`, or `llama-65b`, or with other model families (e.g. `meta-llama/Llama-2-7b-hf`, `facebook/opt-6.7b`, `lmsys/vicuna-33b-v1.3`, etc.).
* OWQ using 3.01-bit (3-bit quantization + few FP16 weight columns)
```
-python main.py facebook/opt-1.3b c4 --wbits 3 --target_bit 3.01
+python main.py huggyllama/llama-7b c4 --wbits 3 --target_bit 3.01
```
* OWQ using 4.01-bit (4-bit quantization + few FP16 weight columns)
```
-python main.py facebook/opt-1.3b c4 --wbits 4 --target_bit 4.01
+python main.py huggyllama/llama-7b c4 --wbits 4 --target_bit 4.01
```
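The fractional part of `--target_bit` is the budget for FP16 columns. As a rough accounting (our sketch; `main.py` additionally rescales this budget and splits it across layer types): keeping a fraction `f` of input columns in FP16 instead of `w` bits adds about `f * (16 - w)` bits per weight, so `f ≈ (target_bit - w) / (16 - w)`.

```python
def fp16_column_fraction(target_bit, wbits):
    # Each FP16 column costs (16 - wbits) extra bits per weight on average.
    return (target_bit - wbits) / (16 - wbits)

print(fp16_column_fraction(3.01, 3))  # ~0.00077 -> roughly 0.08% of columns
print(fp16_column_fraction(4.01, 4))  # ~0.00083
```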
Below are examples for the other options (FP16, RTN, GPTQ).
```
# Measuring the ppl of the full precision (FP16) model
-python main.py facebook/opt-1.3b c4 --wbits 16
+python main.py huggyllama/llama-7b c4 --wbits 16
# 4-bit Round-to-Nearest (RTN) quantization
-python main.py facebook/opt-1.3b c4 --wbits 4 --nearest
+python main.py huggyllama/llama-7b c4 --wbits 4 --nearest
# GPTQ with 3-bit quantization
-python main.py facebook/opt-1.3b c4 --wbits 3 --tuning minmax
+python main.py huggyllama/llama-7b c4 --wbits 3 --tuning minmax
```
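For reference, round-to-nearest (RTN) is the simplest baseline compared against above; a minimal per-channel asymmetric RTN sketch (our illustration, not the repository's `Quantizer`):

```python
import torch

def rtn_quantize(w, bits):
    # Per-output-channel asymmetric min/max quantization, then dequantization.
    maxq = 2 ** bits - 1
    wmin = torch.minimum(w.min(dim=1, keepdim=True).values, torch.zeros(1))
    wmax = torch.maximum(w.max(dim=1, keepdim=True).values, torch.zeros(1))
    scale = (wmax - wmin).clamp(min=1e-8) / maxq
    zero = torch.round(-wmin / scale)
    q = torch.clamp(torch.round(w / scale) + zero, 0, maxq)
    return (q - zero) * scale

w = torch.randn(8, 16)
err = (rtn_quantize(w, 4) - w).pow(2).mean()  # RTN reconstruction error
```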
### Zero-shot
-Here we give an example of measuring zero-shot accuracy on `lambada_openai` and `piqa` tasks using opt-125m model.
+Here we give an example of measuring zero-shot accuracy on the `hellaswag` task using the llama-7b model.
You need to generate a quantized model checkpoint before measuring zero-shot accuracy.
```
# making checkpoint file of OWQ reconstruction
-python main.py facebook/opt-125m c4 --wbits 3 --target_bit 3.05 --no-eval --save opt-125m_3_05.pth --packing
+python main.py huggyllama/llama-7b c4 --wbits 3 --target_bit 3.01 --no-eval --save llama-7b_3_01.pth --packing
-# measuring zero-shot accuracy (single-gpu)
-CUDA_VISIBLE_DEVICES=0 python zeroshot.py --model hf-causal-owq --model_args pretrained=facebook/opt-125m,load=opt-125m_3_05.pth --batch_size 4 --tasks lambada_openai --no_cache
+# measuring zero-shot accuracy (using a single GPU)
+CUDA_VISIBLE_DEVICES=0 python zeroshot.py --model hf-causal-owq --model_args pretrained=huggyllama/llama-7b,load=llama-7b_3_01.pth --batch_size 4 --tasks hellaswag --no_cache
# multi-gpu
-CUDA_VISIBLE_DEVICES=0,1 python zeroshot.py --model hf-causal-owq --model_args pretrained=facebook/opt-125m,load=opt-125m_3_05.pth,use_accelerate=True --batch_size 4 --tasks lambada_openai --no_cache
+CUDA_VISIBLE_DEVICES=0,1 python zeroshot.py --model hf-causal-owq --model_args pretrained=huggyllama/llama-7b,load=llama-7b_3_01.pth,use_accelerate=True --batch_size 4 --tasks hellaswag --no_cache
```
-### Easy OWQ + Measuring PPL, Zeroshot sample
+### Easy OPT OWQ + PPL / zero-shot evaluation sample
```
bash scripts/opt_end_to_end_evaluation.sh 0 opt-1.3b
```
@@ -111,7 +118,7 @@ bash scripts/opt_end_to_end_evaluation.sh 0 opt-1.3b
## Demo
Please refer to the README in the `demo` directory.
-## 3-bit CUDA Kernels
+## 3/4-bit CUDA Kernels
### Benchmark kernel performance
```
@@ -120,9 +127,9 @@ cd owq/kernel/
python test_kernel.py
```
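The kernels consume bit-packed weights. A minimal NumPy sketch of packing 3-bit codes into 32-bit words (an illustrative layout only; the CUDA kernel's actual memory layout may differ):

```python
import numpy as np

def pack3(codes):
    # 32 three-bit codes occupy 96 bits, i.e. three uint32 words.
    assert codes.size % 32 == 0
    bits = np.unpackbits(codes.astype(np.uint8)[:, None], axis=1, bitorder='little')[:, :3]
    return np.packbits(bits.reshape(-1), bitorder='little').view(np.uint32)

codes = np.random.randint(0, 8, 64)   # 64 codes * 3 bits = 6 uint32 words
packed = pack3(codes)
assert packed.size == 6
```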
-### Benchmark language generation with 3/4-bit packed model (opt, llama)
+### Benchmark language generation with 3/4-bit packed model (opt, llama, etc.)
```
-# Example of OPT-65b language generation (single token)
+# Example of OPT-66b language generation (single token)
# Save compressed model
python main.py facebook/opt-66b c4 --wbits 3 --target_bit 3.01 --no-eval --save opt-66b_3_01.pth --packing
@@ -157,4 +164,4 @@ If you find our code or OWQ useful for your research, please consider citing:
journal={arXiv preprint arXiv:2306.02272},
year={2023}
}
-```
\ No newline at end of file
+```
diff --git a/bloom.py b/bloom.py
deleted file mode 100644
index c0b3d42..0000000
--- a/bloom.py
+++ /dev/null
@@ -1,445 +0,0 @@
-import time
-
-import torch
-import torch.nn as nn
-
-import transformers
-
-from owq.recon import GPTQ_OWQ
-from owq.quant import *
-from owq.utils.misc import find_layers, check_arguments
-from owq.utils.datautils import *
-
-import argparse
-import random
-import os
-import numpy as np
-from tqdm import tqdm
-
-layer_list = ['qkv','dense','fc1','fc2']
-n_out_dict = {'self_attention.query_key_value':0,
- 'self_attention.dense':0,
- 'mlp.dense_h_to_4h':0,
- 'mlp.dense_4h_to_h':0}
-
-def get_bloom(model):
- import torch
- def skip(*args, **kwargs):
- pass
- torch.nn.init.kaiming_uniform_ = skip
- torch.nn.init.uniform_ = skip
- torch.nn.init.normal_ = skip
- from transformers import BloomForCausalLM
- model = BloomForCausalLM.from_pretrained(model, torch_dtype='auto')
- model.seqlen = 2048
- return model
-
-@torch.no_grad()
-def bloom_sequential(model, dataloader, dev, means=None, stds=None):
- print('Starting ...')
-
- use_cache = model.config.use_cache
- model.config.use_cache = False
- layers = model.transformer.h
-
- model.transformer.word_embeddings = model.transformer.word_embeddings.to(dev)
- model.transformer.word_embeddings_layernorm = model.transformer.word_embeddings_layernorm.to(dev)
- layers[0] = layers[0].to(dev)
-
- dtype = next(iter(model.parameters())).dtype
- inps = torch.zeros(
- (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
- )
- cache = {'i': 0, 'attention_mask': None, 'alibi': None}
-
- class Catcher(nn.Module):
- def __init__(self, module):
- super().__init__()
- self.module = module
- def forward(self, inp, **kwargs):
- inps[cache['i']] = inp
- cache['i'] += 1
- cache['attention_mask'] = kwargs['attention_mask']
- cache['alibi'] = kwargs['alibi']
- raise ValueError
- layers[0] = Catcher(layers[0])
- for batch in dataloader:
- try:
- model(batch[0].to(dev))
- except ValueError:
- pass
- layers[0] = layers[0].module
-
- layers[0] = layers[0].cpu()
- model.transformer.word_embeddings = model.transformer.word_embeddings.cpu()
- model.transformer.word_embeddings_layernorm = model.transformer.word_embeddings_layernorm.cpu()
- torch.cuda.empty_cache()
-
- outs = torch.zeros_like(inps)
- attention_mask = cache['attention_mask']
- alibi = cache['alibi']
-
- print('Ready.')
-
- if args.target_bit is not None:
- args.layers = layer_list if args.layers is None else args.layers
- n_mp_layers = len(args.layers)
- if 'qkv' in args.layers:
- n_mp_layers += 2 # q k v
-
- r = (12 / (16 - args.wbits)) * (args.target_bit - args.wbits)
- # r = (args.target_bit - args.wbits) * 16 / 12
- r /= n_mp_layers
-
- layer = find_layers(layers[0])
-
- for i in range(len(args.layers)):
- if args.layers[i] == 'qkv':
- name = 'self_attention.query_key_value'
- n_out_dict[name] = round(layer[name].weight.data.shape[1] * r) * 3
- elif args.layers[i] == 'dense':
- name = 'self_attention.dense'
- n_out_dict[name] = round(layer[name].weight.data.shape[1] * r)
- elif args.layers[i] == 'fc1':
- name = 'mlp.dense_h_to_4h'
- n_out_dict[name] = round(layer[name].weight.data.shape[1] * r / 4)
- elif args.layers[i] == 'fc2':
- name = 'mlp.dense_4h_to_h'
- n_out_dict[name] = round(layer[name].weight.data.shape[1] * r / 4)
-
- quantizers = {}
- for i in range(len(layers)):
- layer = layers[i].to(dev)
- block_layers = find_layers(layer)
-
- if args.true_sequential:
- sequential = [
- ['self_attention.query_key_value'], ['self_attention.dense'],
- ['mlp.dense_h_to_4h'], ['mlp.dense_4h_to_h']
- ]
- else:
- sequential = [list(block_layers.keys())]
-
- for names in sequential:
- subset = {n: block_layers[n] for n in names}
-
- gptq = {}
- for name in subset:
- gptq[name] = GPTQ_OWQ(subset[name], n_out=n_out_dict[name])
- gptq[name].quantizer = Quantizer()
- gptq[name].quantizer.configure(
- args.wbits, perchannel=True, sym=False, mse=(args.tuning == 'mse')
- )
- gptq[name].quantizer.n_out = n_out_dict[name]
-
- def add_batch(name):
- def tmp(_, inp, out):
- gptq[name].add_batch(inp[0].data, out.data)
- return tmp
- handles = []
- for name in subset:
- handles.append(subset[name].register_forward_hook(add_batch(name)))
- for j in range(args.nsamples):
- layer(inps[j].unsqueeze(0), attention_mask=attention_mask, alibi=alibi)
- for h in handles:
- h.remove()
-
- for name in names:
- if name.endswith('query_key_value') and args.target_bit is not None:
- name = 'self_attention.query_key_value'
- layer_qkv = subset[name]
- W_q, W_k, W_v = torch.chunk(layer_qkv.weight.data, 3, dim=0)
- W_attn_dict = {'self_attention.query':W_q, 'self_attention.key':W_k, 'self_attention.value':W_v}
- for name1 in W_attn_dict:
- W = W_attn_dict[name1]
- subset[name1] = nn.Linear(W.shape[1], W.shape[0], device=W.device, dtype=W.dtype)
- subset[name1].weight.data = W.clone()
- gptq[name1] = GPTQ_OWQ(subset[name], n_out=n_out_dict[name])
- gptq[name1].quantizer = Quantizer()
- gptq[name1].quantizer.configure(
- args.wbits, perchannel=True, sym=False, mse=(args.tuning == 'mse')
- )
- gptq[name1].quantizer.n_out = n_out_dict[name] // 3
- gptq[name1].H = gptq[name].H.clone()
-
- del subset[name]
- del W_q, W_k, W_v
- del gptq[name]
- torch.cuda.empty_cache()
- break
-
- for name in subset:
- if not args.no_frob_norm:
- W = subset[name].weight.data.clone().to(torch.float)
- temp_quantizer = Quantizer()
- temp_quantizer.configure(args.wbits, perchannel=True, sym=False, mse=(args.tuning == 'mse'))
- temp_quantizer.find_params(W, weight=True, num=40)
- W_quant = temp_quantizer.quantize(W)
- frob_norm_error = (W - W_quant).pow(2).sum(dim=0)
- else:
- frob_norm_error = None
- out_ids = gptq[name].hessian_sorting(actorder=args.act_order, frob_norm=frob_norm_error)
- gptq[name].quantizer.out_ids = out_ids.cpu()
-
- if not args.no_frob_norm:
- del W
- del W_quant
- del temp_quantizer
- torch.cuda.empty_cache()
-
- for name in subset:
- print(f"Quantizing model.decoder.layers.{i}.{name}")
- gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order)
- gptq[name].free()
-
- for name in names:
- if name.endswith('query_key_value') and args.target_bit is not None:
- W_qkv = [subset[n].weight.data.clone() for n in W_attn_dict]
- layer_qkv.weight.data = torch.concat(W_qkv,dim=0)
- del W_qkv
- break
-
- for j in range(args.nsamples):
- outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, alibi=alibi)[0]
-
- layers[i] = layer.cpu()
- del layer
- del gptq
- torch.cuda.empty_cache()
-
- inps, outs = outs, inps
-
- model.config.use_cache = use_cache
-
-@torch.no_grad()
-def bloom_eval(model, testenc, dev):
- print('Evaluation...')
-
- testenc = testenc.input_ids
- nsamples = testenc.numel() // model.seqlen
-
- use_cache = model.config.use_cache
- model.config.use_cache = False
- layers = model.transformer.h
-
- model.transformer.word_embeddings = model.transformer.word_embeddings.to(dev)
- model.transformer.word_embeddings_layernorm = model.transformer.word_embeddings_layernorm.to(dev)
- layers[0] = layers[0].to(dev)
-
- dtype = next(iter(model.parameters())).dtype
- inps = torch.zeros(
- (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
- )
- cache = {'i': 0, 'attention_mask': None, 'alibi': None}
-
- class Catcher(nn.Module):
- def __init__(self, module):
- super().__init__()
- self.module = module
- def forward(self, inp, **kwargs):
- inps[cache['i']] = inp
- cache['i'] += 1
- cache['attention_mask'] = kwargs['attention_mask']
- cache['alibi'] = kwargs['alibi']
- raise ValueError
- layers[0] = Catcher(layers[0])
- for i in range(nsamples):
- batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev)
- try:
- model(batch)
- except ValueError:
- pass
- layers[0] = layers[0].module
-
- layers[0] = layers[0].cpu()
- model.transformer.word_embeddings = model.transformer.word_embeddings.cpu()
- model.transformer.word_embeddings_layernorm = model.transformer.word_embeddings_layernorm.cpu()
- torch.cuda.empty_cache()
-
- outs = torch.zeros_like(inps)
- attention_mask = cache['attention_mask']
- alibi = cache['alibi']
-
- for i in tqdm(range(len(layers))):
- layer = layers[i].to(dev)
-
- if args.nearest:
- subset = find_layers(layer)
- for name in subset:
- quantizer = Quantizer()
- quantizer.configure(
- args.wbits, perchannel=True, sym=False, mse=False
- )
- W = subset[name].weight.data
- quantizer.find_params(W, weight=True)
- subset[name].weight.data = quantize(
- W, quantizer.scale, quantizer.zero, quantizer.maxq
- ).to(next(iter(layer.parameters())).dtype)
-
- for j in range(nsamples):
- outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, alibi=alibi)[0]
- layers[i] = layer.cpu()
- del layer
- torch.cuda.empty_cache()
- inps, outs = outs, inps
-
- model.transformer.ln_f = model.transformer.ln_f.to(dev)
- model.lm_head = model.lm_head.to(dev)
-
- testenc = testenc.to(dev)
- nlls = []
- for i in range(nsamples):
- hidden_states = inps[i].unsqueeze(0)
- hidden_states = model.transformer.ln_f(hidden_states)
- lm_logits = model.lm_head(hidden_states)
- shift_logits = lm_logits[:, :-1, :].contiguous()
- shift_labels = testenc[
- :, (i * model.seqlen):((i + 1) * model.seqlen)
- ][:, 1:]
- loss_fct = nn.CrossEntropyLoss()
- loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
- neg_log_likelihood = loss.float() * model.seqlen
- nlls.append(neg_log_likelihood)
- ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
- print(ppl.item())
-
- model.config.use_cache = use_cache
- return ppl.item()
-
-if __name__ == '__main__':
-
- parser = argparse.ArgumentParser()
-
- parser.add_argument(
- 'model', type=str,
- help='BLOOM model to load; pass `bigscience/bloom-X`.'
- )
- parser.add_argument(
- 'dataset', type=str, choices=['wikitext2', 'ptb', 'c4'],
- help='Where to extract calibration data from.'
- )
- parser.add_argument(
- '--nsamples', type=int, default=128,
- help='Number of calibration data samples.'
- )
- parser.add_argument(
- '--wbits', type=int, default=16, choices=[2, 3, 4, 16],
- help='The number of bits to use for weight quantization; use 16 for evaluating base model.'
- )
- parser.add_argument(
- '--target_bit', type=float, default=None,
- help='Effctive target bits for OWQ.'
- )
- parser.add_argument(
- '--tuning', type=str, default='mse', choices=['mse', 'minmax'],
- help='Method for quantization parameter tuning.'
- )
- parser.add_argument(
- '--no_frob_norm', action='store_true',
- help='Whether to use Frobenius norm for OWQ.'
- )
- parser.add_argument(
- '--percdamp', type=float, default=.01,
- help='Percent of the average Hessian diagonal to use for dampening.'
- )
- parser.add_argument(
- '--layers', nargs='+', type=str, default=None, choices=layer_list,
- help='Layers to apply OWQ.'
- )
- parser.add_argument(
- '--seed', type=int, default=0,
- help='Seed for sampling the calibration data.'
- )
- parser.add_argument(
- '--nearest', action='store_true',
- help='Whether to run the round-to-nearest quantization.'
- )
- parser.add_argument(
- '--groupsize', type=int, default=-1,
- help='Groupsize for fine-grained quantization; default uses full row.'
- )
-
- parser.add_argument(
- '--no-eval', action='store_true',
- help='Whether to evaluate model on WikiText-2, PTB and C4'
- )
- parser.add_argument(
- '--save', type=str, default='',
- help='Save quantized checkpoint under this name.'
- )
- parser.add_argument(
- '--load', type=str, default='',
- help='Load fake or 3bit quantized checkpoint.'
- )
- parser.add_argument(
- '--logfile', type=str, default='',
- help='Logging file name'
- )
-
- parser.add_argument(
- '--old-eval', action='store_true',
- help='Whether to use the old version of PTB and C4 evaluation.'
- )
- parser.add_argument(
- '--act-order', action='store_true',
- help='Whether to apply the activation order GPTQ heuristic'
- )
- parser.add_argument(
- '--true-sequential', action='store_true',
- help='Whether to run in true sequential model.'
- )
-
- args = parser.parse_args()
- check_arguments(args)
- device = torch.device('cuda:0')
-
- def seed_all(seed):
- random.seed(seed)
- os.environ['PYTHONHASHSEED'] = str(seed)
- np.random.seed(seed)
- torch.manual_seed(seed)
- seed_all(args.seed)
-
- model = get_bloom(args.model)
- model.eval()
- t = 0
- if args.load:
- print(f"Loading {args.load} ....")
- model.load_state_dict(torch.load(args.load))
- print("Done.")
- else:
- dataloader = get_loaders(
- args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen, train=True
- )
- if args.wbits < 16 and not args.nearest:
- tick = time.time()
- quantizers = bloom_sequential(model, dataloader, device)
- t = round((time.time() - tick),1)
- print(f"Running Time : {t}")
-
- t1 = time.time()
- ppl_scores = []
- if not args.no_eval:
- if args.old_eval:
- ppl_tasks = ['wikitext2', 'ptb', 'c4']
- else:
- ppl_tasks = ['wikitext2','ptb-new', 'c4-new']
- for dataset in ppl_tasks:
- testloader = get_loaders(
- dataset, seed=args.seed, model=args.model, seqlen=model.seqlen, train=False
- )
- print(dataset)
- ppl_score = bloom_eval(model, testloader, device)
- ppl_scores.append((dataset,ppl_score))
- t2 = time.time() - t1
-
- if args.logfile:
- with open(f'{args.logfile}','a') as fp:
- add_str = f"\nlayers : {args.layers}" + f"| target_bit : {args.target_bit}\n" if args.target_bit is not None else '\n'
- fp.write(f"model : {args.model} | owq time : {round(t/60,1)}m / eval time : {round(t2/60,1)}m | seed : {args.seed} {add_str}")
- for i in range(len(ppl_scores)):
- fp.write(f"{ppl_scores[i][1]} ")
- fp.write(f"\n\n")
-
- if args.save:
- torch.save(model.state_dict(), args.save)
diff --git a/demo/README.md b/demo/README.md
index c80cb45..f91c21a 100644
--- a/demo/README.md
+++ b/demo/README.md
@@ -26,7 +26,7 @@ python demo_2model.py lmsys/vicuna-7b-v1.3 lmsys/vicuna-33b-v1.3 --load2 {quanti
```
Then you will get an accessible link to the demo page. Please enjoy!
-Note that **Quantized Vicuna-33B model using our OWQ method gives comparable or better chat quality, with similar memory usage comparing to FP vicuna-7B model.**
+Note that **the Vicuna-33B model quantized with our OWQ method gives comparable or better chat quality, with memory usage similar to the FP16 Vicuna-7B model.**
### LLaMA-2 70B + OWQ 3.01 bit
@@ -40,7 +40,7 @@ python demo_llama2_70b.py meta-llama/Llama-2-70b-chat-hf --load {quantized-llama
python demo_llama2_70b.py meta-llama/Llama-2-70b-chat-hf --load {quantized-llama-2-70b-weight-location} --gpus 0,1
```
-Please Note that we can run powerful chatbot model based on **LLaMA-2 70B** model just using **2x consumer GPUs (RTX 3090)**.
+Please note that we can run a powerful chatbot based on the **LLaMA-2 70B** model using just **2x consumer GPUs (RTX 3090)**.
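The memory claim follows from simple arithmetic (a rough estimate, counting weights only and ignoring activations and the KV cache):

```python
def weight_gib(n_params, bits):
    return n_params * bits / 8 / 1024**3

print(weight_gib(70e9, 16))    # ~130 GiB: FP16 LLaMA-2 70B, far beyond 2 GPUs
print(weight_gib(70e9, 3.01))  # ~24.5 GiB: fits in 2x RTX 3090 (48 GB total)
```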
diff --git a/demo/demo_2model.py b/demo/demo_2model.py
index d4f1619..ace16bf 100644
--- a/demo/demo_2model.py
+++ b/demo/demo_2model.py
@@ -17,9 +17,6 @@
def main(args):
assert len(args.gpus.split(',')) == 2, "Two GPU devices are required. Please enter them separated by commas"
- os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
- os.environ["CUDA_VISIBLE_DEVICES"]=args.gpus
-
global id1, id2
id1, id2 = args.gpus.split(',')
fmodel_name = args.fmodel.split('/')[-1].upper()
diff --git a/demo/demo_llama2_70b.py b/demo/demo_llama2_70b.py
index 63f56dc..dfc0df6 100644
--- a/demo/demo_llama2_70b.py
+++ b/demo/demo_llama2_70b.py
@@ -85,8 +85,6 @@ def main(args):
multigpu = True if len(gpus_list) > 1 else False
if multigpu:
- os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
- os.environ["CUDA_VISIBLE_DEVICES"]=args.gpus
id1, id2 = gpus_list
dev1 = torch.device(f'cuda:{id1}')
diff --git a/images/owq_figure.png b/images/owq_figure.png
new file mode 100644
index 0000000..6ced283
Binary files /dev/null and b/images/owq_figure.png differ
diff --git a/images/owq_llama.png b/images/owq_llama.png
new file mode 100644
index 0000000..d989829
Binary files /dev/null and b/images/owq_llama.png differ
diff --git a/llama.py b/llama.py
deleted file mode 100644
index 3fd5bf0..0000000
--- a/llama.py
+++ /dev/null
@@ -1,573 +0,0 @@
-import time
-
-import torch
-import torch.nn as nn
-
-import transformers
-
-from owq.recon import GPTQ_OWQ
-from owq.quant import *
-from owq.utils.misc import find_layers, check_arguments
-from owq.utils.datautils import *
-
-import argparse
-import random
-import os
-import numpy as np
-from tqdm import tqdm
-
-layer_list = ['k','v','q','o','up','gate','down']
-n_out_dict = {'self_attn.k_proj':0,
- 'self_attn.v_proj':0,
- 'self_attn.q_proj':0,
- 'self_attn.o_proj':0,
- 'mlp.up_proj':0,
- 'mlp.gate_proj':0,
- 'mlp.down_proj':0 }
-
-def get_llama(model):
- import torch
- def skip(*args, **kwargs):
- pass
- torch.nn.init.kaiming_uniform_ = skip
- torch.nn.init.uniform_ = skip
- torch.nn.init.normal_ = skip
- from transformers import LlamaForCausalLM
- model = LlamaForCausalLM.from_pretrained(model, torch_dtype='auto')
- model.seqlen = 2048
- return model
-
-@torch.no_grad()
-def llama_sequential(model, dataloader, dev):
- print('Starting ...')
-
- use_cache = model.config.use_cache
- model.config.use_cache = False
- layers = model.model.layers
-
- model.model.embed_tokens = model.model.embed_tokens.to(dev)
- model.model.norm = model.model.norm.to(dev)
- layers[0] = layers[0].to(dev)
-
- dtype = next(iter(model.parameters())).dtype
- inps = torch.zeros(
- (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
- )
-
- cache = {'i': 0, 'attention_mask': None}
-
- class Catcher(nn.Module):
- def __init__(self, module):
- super().__init__()
- self.module = module
- def forward(self, inp, **kwargs):
- inps[cache['i']] = inp
- cache['i'] += 1
- cache['attention_mask'] = kwargs['attention_mask']
- cache['position_ids'] = kwargs['position_ids']
- raise ValueError
-
- layers[0] = Catcher(layers[0])
- for batch in dataloader:
- try:
- model(batch[0].to(dev))
- except ValueError:
- pass
- layers[0] = layers[0].module
- layers[0] = layers[0].cpu()
- model.model.embed_tokens = model.model.embed_tokens.cpu()
- model.model.norm = model.model.norm.cpu()
- torch.cuda.empty_cache()
-
- outs = torch.zeros_like(inps)
- attention_mask = cache['attention_mask']
- position_ids = cache['position_ids']
-
- print('Ready.')
-
- if args.target_bit is not None:
- args.layers = layer_list if args.layers is None else args.layers
- n_mp_layers = len(args.layers)
-
- r = (12 / (16 - args.wbits)) * (args.target_bit - args.wbits)
- # r = (args.target_bit - args.wbits) * 16 / 12
- r /= n_mp_layers
-
- layer = find_layers(layers[0])
-
- for i in range(len(args.layers)):
- if args.layers[i] in ('k','v','q','o'):
- name = 'self_attn.' + args.layers[i] + '_proj'
- n_out_dict[name] = round(layer[name].weight.data.shape[1] * r)
- else:
- name = 'mlp.' + args.layers[i] + '_proj'
- n_out_dict[name] = round(layer[name].weight.data.shape[1] * r * 3 / 8)
-
- quantizers = {}
- for i in range(len(layers)):
- layer = layers[i].to(dev)
- block_layers = find_layers(layer)
-
- if args.true_sequential:
- sequential = [
- ['self_attn.k_proj', 'self_attn.v_proj', 'self_attn.q_proj'],
- ['self_attn.o_proj'],
- ['mlp.up_proj', 'mlp.gate_proj'],
- ['mlp.down_proj']
- ]
- else:
- sequential = [list(block_layers.keys())]
-
- for names in sequential:
- subset = {n: block_layers[n] for n in names}
-
- gptq = {}
- for name in subset:
- gptq[name] = GPTQ_OWQ(subset[name], n_out=n_out_dict[name])
- gptq[name].quantizer = Quantizer()
- gptq[name].quantizer.configure(
- args.wbits, perchannel=True, sym=False, mse=(args.tuning == 'mse')
- )
- gptq[name].quantizer.n_out = n_out_dict[name]
-
- def add_batch(name):
- def tmp(_, inp, out):
- gptq[name].add_batch(inp[0].data, out.data)
- return tmp
- handles = []
- for name in subset:
- handles.append(subset[name].register_forward_hook(add_batch(name)))
- for j in range(args.nsamples):
- layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0]
- for h in handles:
- h.remove()
-
- for name in subset:
- if not args.no_frob_norm:
- W = subset[name].weight.data.clone().to(torch.float)
- temp_quantizer = Quantizer()
- temp_quantizer.configure(args.wbits, perchannel=True, sym=False, mse=(args.tuning == 'mse'))
- temp_quantizer.find_params(W, weight=True, num=40)
- W_quant = temp_quantizer.quantize(W)
- frob_norm_error = (W - W_quant).pow(2).sum(dim=0)
- else:
- frob_norm_error = None
- out_ids = gptq[name].hessian_sorting(actorder=args.act_order, frob_norm=frob_norm_error)
- gptq[name].quantizer.out_ids = out_ids.cpu()
-
- if not args.no_frob_norm:
- del W
- del W_quant
- del temp_quantizer
- torch.cuda.empty_cache()
-
- for name in subset:
- print(f"Quantizing model.decoder.layers.{i}.{name}")
- gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order)
- quantizers['model.layers.%d.%s' % (i, name)] = gptq[name].quantizer.cpu()
- gptq[name].free()
-
- for j in range(args.nsamples):
- outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0]
- outs = torch.nan_to_num(outs)
-
- layers[i] = layer.cpu()
- del layer
- del gptq
- torch.cuda.empty_cache()
-
- inps, outs = outs, inps
-
- model.config.use_cache = use_cache
-
- return quantizers
-
-@torch.no_grad()
-def llama_eval(model, testenc, dev):
- print('Evaluating ...')
-
- testenc = testenc.input_ids
- nsamples = testenc.numel() // model.seqlen
-
- use_cache = model.config.use_cache
- model.config.use_cache = False
- layers = model.model.layers
-
- model.model.embed_tokens = model.model.embed_tokens.to(dev)
- layers[0] = layers[0].to(dev)
-
- dtype = next(iter(model.parameters())).dtype
- inps = torch.zeros(
- (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
- )
- cache = {'i': 0, 'attention_mask': None}
-
- class Catcher(nn.Module):
- def __init__(self, module):
- super().__init__()
- self.module = module
- def forward(self, inp, **kwargs):
- inps[cache['i']] = inp
- cache['i'] += 1
- cache['attention_mask'] = kwargs['attention_mask']
- cache['position_ids'] = kwargs['position_ids']
- raise ValueError
- layers[0] = Catcher(layers[0])
- for i in range(nsamples):
- batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev)
- try:
- model(batch)
- except ValueError:
- pass
- layers[0] = layers[0].module
-
- layers[0] = layers[0].cpu()
- model.model.embed_tokens = model.model.embed_tokens.cpu()
- torch.cuda.empty_cache()
-
- outs = torch.zeros_like(inps)
- attention_mask = cache['attention_mask']
- position_ids = cache['position_ids']
-
- for i in tqdm(range(len(layers))):
- layer = layers[i].to(dev)
-
- if args.nearest:
- subset = find_layers(layer)
- for name in subset:
- quantizer = Quantizer()
- quantizer.configure(
- args.wbits, perchannel=True, sym=False, mse=False
- )
- W = subset[name].weight.data
- quantizer.find_params(W, weight=True)
- subset[name].weight.data = quantize(
- W, quantizer.scale, quantizer.zero, quantizer.maxq
- ).to(next(iter(layer.parameters())).dtype)
-
- for j in range(nsamples):
- outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0]
- outs = torch.nan_to_num(outs)
-
- layers[i] = layer.cpu()
- del layer
- torch.cuda.empty_cache()
- inps, outs = outs, inps
-
- if model.model.norm is not None:
- model.model.norm = model.model.norm.to(dev)
- model.lm_head = model.lm_head.to(dev)
-
- testenc = testenc.to(dev)
- nlls = []
- for i in range(nsamples):
- hidden_states = inps[i].unsqueeze(0)
- if model.model.norm is not None:
- hidden_states = model.model.norm(hidden_states)
- lm_logits = model.lm_head(hidden_states)
- shift_logits = lm_logits[:, :-1, :].contiguous()
- shift_labels = testenc[
- :, (i * model.seqlen):((i + 1) * model.seqlen)
- ][:, 1:]
- loss_fct = nn.CrossEntropyLoss()
- loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
- neg_log_likelihood = loss.float() * model.seqlen
- nlls.append(neg_log_likelihood)
- ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
- print(ppl.item())
-
- model.config.use_cache = use_cache
- return ppl.item()
-
-def load_quant3(model, checkpoint, faster=False):
- from transformers import LlamaConfig, LlamaForCausalLM
- config = LlamaConfig.from_pretrained(model)
- def noop(*args, **kwargs):
- pass
- torch.nn.init.kaiming_uniform_ = noop
- torch.nn.init.uniform_ = noop
- torch.nn.init.normal_ = noop
-
- torch.set_default_dtype(torch.half)
- transformers.modeling_utils._init_weights = False
- torch.set_default_dtype(torch.half)
- model = LlamaForCausalLM(config)
- torch.set_default_dtype(torch.float)
- model = model.eval()
- layers = find_layers(model)
- for name in ['lm_head']:
- if name in layers:
- del layers[name]
-
- ckpt = torch.load(checkpoint)
- n_out_dict = ckpt['n_out_dict']
-
- make_quant3(model, n_out_dict, faster=faster)
-
- model.load_state_dict(ckpt['model_state_dict'])
- model.seqlen = model.config.max_position_embeddings
-
- return model
-
-def llama_multigpu(model, gpus):
- model.model.embed_tokens = model.model.embed_tokens.to(gpus[0])
- model.model.norm = model.model.norm.to(gpus[-1])
- import copy
- import math
- model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1])
-
- cache = {'mask': None, 'pos_ids': None}
-
- class MoveModule(nn.Module):
- def __init__(self, module):
- super().__init__()
- self.module = module
- self.dev = next(iter(self.module.parameters())).device
- def forward(self, *inp, **kwargs):
- inp = list(inp)
- if inp[0].device != self.dev:
- inp[0] = inp[0].to(self.dev)
- if cache['mask'] is None or cache['mask'].device != self.dev:
- cache['mask'] = kwargs['attention_mask'].to(self.dev)
- if cache['pos_ids'] is None or cache['pos_ids'].device != self.dev:
- cache['pos_ids'] = kwargs['position_ids'].to(self.dev)
- kwargs['attention_mask'] = cache['mask']
- kwargs['position_ids'] = cache['pos_ids']
- tmp = self.module(*inp, **kwargs)
- return tmp
-
- layers = model.model.layers
- pergpu = math.ceil(len(layers) / len(gpus))
- for i in range(len(layers)):
- layers[i] = MoveModule(layers[i].to(gpus[i // pergpu]))
-
- model.gpus = gpus
-
-def benchmark(model, input_ids):
- dev = torch.device('cuda:0')
- input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else dev)
- torch.cuda.synchronize()
-
- cache = {'past': None}
- def clear_past(i):
- def tmp(layer, inp, out):
- if cache['past']:
- cache['past'][i] = None
- return tmp
- for i, layer in enumerate(model.model.layers):
- layer.register_forward_hook(clear_past(i))
-
- print('Benchmarking ...')
-
- loss = nn.CrossEntropyLoss()
- tot = 0.
-
- def sync():
- if hasattr(model, 'gpus'):
- for gpu in model.gpus:
- torch.cuda.synchronize(gpu)
- else:
- torch.cuda.synchronize()
- with torch.no_grad():
- attention_mask = torch.ones((1, input_ids.numel()), device=dev)
- position_ids = torch.arange(0,input_ids.numel(), device=dev)
- times = []
- for i in range(input_ids.numel()):
- print(i)
- tick = time.time()
- out = model(input_ids[:, i].reshape(1,-1),past_key_values=cache['past'],
- attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1)),
- position_ids=position_ids[i])
- sync()
- times.append(time.time() - tick)
- if i != input_ids.numel() - 1:
- tot += loss(out.logits[0].to(dev), input_ids[:, (i + 1)].to(dev)).float()
- cache['past'] = list(out.past_key_values)
- del out
- sync()
-
- print('Median:', np.median(times))
- print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item())
-
-if __name__ == '__main__':
-
- parser = argparse.ArgumentParser()
-
- parser.add_argument(
- 'model', type=str,
- help='LlaMa model to load; /path/to/llama_hf'
- )
- parser.add_argument(
- 'dataset', type=str, choices=['wikitext2', 'ptb', 'c4'],
- help='Where to extract calibration data from.'
- )
- parser.add_argument(
- '--nsamples', type=int, default=128,
- help='Number of calibration data samples.'
- )
- parser.add_argument(
- '--wbits', type=int, default=16, choices=[2, 3, 4, 16],
- help='The number of bits to use for weight quantization; use 16 for evaluating base model.'
- )
- parser.add_argument(
- '--target_bit', type=float, default=None,
- help='Effctive target bits for OWQ.'
- )
- parser.add_argument(
- '--tuning', type=str, default='mse', choices=['mse', 'minmax'],
- help='Method for quantization parameter tuning.'
- )
- parser.add_argument(
- '--no_frob_norm', action='store_true',
- help='Whether to use Frobenius norm for OWQ.'
- )
- parser.add_argument(
- '--percdamp', type=float, default=.01,
- help='Percent of the average Hessian diagonal to use for dampening.'
- )
- parser.add_argument(
- '--layers', nargs='+', type=str, default=None, choices=layer_list,
- help='Layers to apply OWQ.'
- )
- parser.add_argument(
- '--seed', type=int, default=0,
- help='Seed for sampling the calibration data.'
- )
- parser.add_argument(
- '--nearest', action='store_true',
- help='Whether to run the round-to-nearest quantization.'
- )
- parser.add_argument(
- '--groupsize', type=int, default=-1,
- help='Groupsize for fine-grained quantization; default uses full row.'
- )
-
- parser.add_argument(
- '--no-eval', action='store_true',
- help='Whether to evaluate model on WikiText-2, PTB and C4'
- )
- parser.add_argument(
- '--save', type=str, default='',
- help='Save quantized checkpoint under this name.'
- )
- parser.add_argument(
- '--load', type=str, default='',
- help='Load fake or 3bit quantized checkpoint.'
- )
- parser.add_argument(
- '--logfile', type=str, default='',
- help='Logging file name'
- )
- parser.add_argument(
- '--packing', action='store_true',
- help='Whether to save 3bit quantized model.'
- )
- parser.add_argument(
- '--faster-kernel', action='store_true',
- help='Whether to save and load 3bit quantized model using the faster kernel for benchmarking.'
- )
- parser.add_argument(
- '--benchmark', type=int, default=0,
- help='Number of tokens to use for benchmarking.'
- )
-
- parser.add_argument(
- '--old-eval', action='store_true',
- help='Whether to use the old version of PTB and C4 evaluation.'
- )
- parser.add_argument(
- '--act-order', action='store_true',
- help='Whether to apply the activation order GPTQ heuristic'
- )
- parser.add_argument(
- '--true-sequential', action='store_true',
- help='Whether to run in true sequential model.'
- )
-
- args = parser.parse_args()
- check_arguments(args)
- device = torch.device('cuda:0')
-
- def seed_all(seed):
- random.seed(seed)
- os.environ['PYTHONHASHSEED'] = str(seed)
- np.random.seed(seed)
- torch.manual_seed(seed)
- seed_all(args.seed)
-
- t = 0
- if args.load:
- print(f"Loading {args.load} ....")
- if args.packing:
- model = load_quant3(args.model, args.load, args.faster_kernel)
- else:
- model = get_llama(args.model)
- model.load_state_dict(torch.load(args.load))
- model.eval()
- print("Done.")
- else:
- model = get_llama(args.model)
- model.eval()
-
- if args.wbits < 16 and not args.nearest:
- dataloader = get_loaders(
- args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen, train=True
- )
- tick = time.time()
- quantizers = llama_sequential(model, dataloader, device)
- t = round((time.time() - tick),1)
- print(f"Running Time : {t}")
-
- if args.benchmark:
- dataloader = get_loaders(
- args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen, train=False
- )
- gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
- if len(gpus) > 1:
- llama_multigpu(model, gpus)
- else:
- model = model.to(device)
- if args.benchmark:
- input_ids = dataloader.input_ids[:, :args.benchmark]
- benchmark(model, input_ids)
- exit()
-
- t1 = time.time()
- ppl_scores = []
- if not args.no_eval:
- if args.old_eval:
- ppl_tasks = ['wikitext2', 'ptb', 'c4']
- else:
- ppl_tasks = ['wikitext2','ptb-new', 'c4-new']
- for dataset in ppl_tasks:
- testloader = get_loaders(
- dataset, seed=args.seed, model=args.model, seqlen=model.seqlen, train=False
- )
- print(dataset)
- ppl_score = llama_eval(model, testloader, device)
- ppl_scores.append((dataset,ppl_score))
- t2 = time.time() - t1
-
- if args.logfile:
- with open(f'{args.logfile}','a') as fp:
- add_str = f"| layers : {args.layers}" + f"| target_bit : {args.target_bit}\n" if args.target_bit is not None else '\n'
- fp.write(f"model : {args.model} | owq time : {round(t/60,1)}m / eval time : {round(t2/60,1)}m | seed : {args.seed} {add_str}")
- for i in range(len(ppl_scores)):
- fp.write(f"{ppl_scores[i][1]} ")
- fp.write(f"\n")
-
- if args.save:
- torch.save(model.state_dict(), args.save)
- print(f"fake quantized model is saved to {args.save}")
- if args.packing and args.wbits == 3:
- temp = args.save.split('/')
- temp[-1] = 'pack3_' + f"{'faster_' if args.faster_kernel else ''}" + temp[-1]
- ckpt_path = '/'.join(temp)
- n_out_dict = {n: n_out_saver(quantizers[n].n_out) for n in quantizers}
- lm_pack3(model, quantizers, faster=args.faster_kernel)
- torch.save({
- 'model_state_dict' : model.state_dict(),
- 'n_out_dict' : n_out_dict}, ckpt_path)
- print(f"3bit quantized model is saved to {ckpt_path}")
- else:
- print("Only 3bits quantized model is supported")
\ No newline at end of file
diff --git a/main.py b/main.py
index cb4bd84..034bf4e 100644
--- a/main.py
+++ b/main.py
@@ -15,7 +15,6 @@
@torch.no_grad()
def layerwise_quantize(model, dataloader, dev, args):
- # assert args.no_frob_norm == True
meta = args.meta
print('Starting ...')
@@ -78,7 +77,7 @@ def forward(self, inp, **kwargs):
# r = (args.target_bit - args.wbits) * 16 / 12
r /= n_owq_layers
- layer = find_layers(layers[0], layers=[nn.Linear])
+ layer = find_layers(layers[0])
for l in owq_layers:
# for even number of n_out
@@ -92,7 +91,7 @@ def forward(self, inp, **kwargs):
quantizers = {}
for i in range(len(layers)):
layer = layers[i].to(dev)
- block_layers = find_layers(layer, layers=[nn.Linear])
+ block_layers = find_layers(layer)
if args.true_sequential:
sequential = meta['sequential']
@@ -226,7 +225,7 @@ def forward(self, inp, **kwargs):
layer = layers[i].to(dev)
if args.nearest:
- subset = find_layers(layer, layers=args.meta['linears'])
+ subset = find_layers(layer)
for name in subset:
quantizer = Quantizer(args.wbits, perchannel=True, sym=args.sym, mse=False)
W = subset[name].weight.data
@@ -290,7 +289,7 @@ def forward(self, *inp, **kwargs):
if inp[0].device != self.dev:
inp[0] = inp[0].to(self.dev)
for key in meta['inp_kwargs']:
- if kwargs[key].device != self.dev:
+            if kwargs[key] is not None and kwargs[key].device != self.dev:
kwargs[key] = kwargs[key].to(self.dev)
tmp = self.module(*inp, **kwargs)
return tmp
@@ -495,7 +494,7 @@ def sync():
# benchmark
if args.benchmark:
dataloader = get_loaders(
- args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=args.seqlen, train=True
+ args.dataset, nsamples=1, seed=args.seed, model=args.model, seqlen=args.seqlen, train=True
)
gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
if len(gpus) > 1:
@@ -514,7 +513,7 @@ def sync():
t1 = time.time()
ppl_scores = []
if not args.no_eval:
- ppl_tasks = ['wikitext2','ptb-new', 'c4-new']
+ ppl_tasks = ['wikitext2','ptb', 'c4']
for dataset in ppl_tasks:
testloader = get_loaders(
dataset, seed=args.seed, model=args.model, seqlen=args.seqlen, train=False
diff --git a/model_config.json b/model_config.json
index 35ec950..260854b 100644
--- a/model_config.json
+++ b/model_config.json
@@ -74,7 +74,7 @@
},
"falcon":{
"map_layer":{"qkv":"self_attention.query_key_value","dense":"self_attention.dense","fc1":"mlp.dense_h_to_4h","fc2":"mlp.dense_4h_to_h"},
- "ratios":{"self_attention.query_key_value":3,"self_attention.dense":1,"mlp.dense_h_to_4h":0.25,"mlp.dense_4h_to_h":0.25},
+ "ratios":{"self_attention.query_key_value":1,"self_attention.dense":1,"mlp.dense_h_to_4h":0.25,"mlp.dense_4h_to_h":0.25},
"sequential":[
["self_attention.query_key_value"],
["self_attention.dense"],
diff --git a/opt.py b/opt.py
deleted file mode 100644
index 396e1e9..0000000
--- a/opt.py
+++ /dev/null
@@ -1,586 +0,0 @@
-import time
-
-import torch
-import torch.nn as nn
-
-import transformers
-
-from owq.recon import GPTQ_OWQ
-from owq.quant import *
-from owq.utils.misc import find_layers, check_arguments
-from owq.utils.datautils import *
-
-import argparse
-import random
-import os
-import numpy as np
-from tqdm import tqdm
-
-layer_list = ['k','v','q','out','fc1','fc2']
-n_out_dict = {'self_attn.k_proj':0,
- 'self_attn.v_proj':0,
- 'self_attn.q_proj':0,
- 'self_attn.out_proj':0,
- 'fc1':0, 'fc2':0 }
-
-def get_opt(model):
- import torch
- def skip(*args, **kwargs):
- pass
- torch.nn.init.kaiming_uniform_ = skip
- torch.nn.init.uniform_ = skip
- torch.nn.init.normal_ = skip
- from transformers import OPTForCausalLM
- model = OPTForCausalLM.from_pretrained(model, torch_dtype='auto')
- model.seqlen = model.config.max_position_embeddings
- return model
-
-@torch.no_grad()
-def opt_sequential(model, dataloader, dev):
- print('Starting ...')
-
- use_cache = model.config.use_cache
- model.config.use_cache = False
- layers = model.model.decoder.layers
-
- model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev)
- model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
- if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
- model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
- if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
- model.model.decoder.project_in = model.model.decoder.project_in.to(dev)
- layers[0] = layers[0].to(dev)
-
- dtype = next(iter(model.parameters())).dtype
- inps = torch.zeros(
- (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
- )
-
- cache = {'i': 0, 'attention_mask': None}
-
- class Catcher(nn.Module):
- def __init__(self, module):
- super().__init__()
- self.module = module
- def forward(self, inp, **kwargs):
- inps[cache['i']] = inp
- cache['i'] += 1
- cache['attention_mask'] = kwargs['attention_mask']
- raise ValueError
-
- layers[0] = Catcher(layers[0])
- for batch in dataloader:
- try:
- model(batch[0].to(dev))
- except ValueError:
- pass
- layers[0] = layers[0].module
- layers[0] = layers[0].cpu()
- model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
- model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
- if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
- model.model.decoder.project_out = model.model.decoder.project_out.cpu()
- if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
- model.model.decoder.project_in = model.model.decoder.project_in.cpu()
- torch.cuda.empty_cache()
-
- outs = torch.zeros_like(inps)
- attention_mask = cache['attention_mask']
-
- print('Ready.')
-
- if args.target_bit is not None:
- args.layers = layer_list if args.layers is None else args.layers
- n_mp_layers = len(args.layers)
-
- r = (12 / (16 - args.wbits)) * (args.target_bit - args.wbits)
- # r = (args.target_bit - args.wbits) * 16 / 12
- r /= n_mp_layers
-
- layer = find_layers(layers[0])
-
- for i in range(len(args.layers)):
- if args.layers[i] in ('k','v','q','out'):
- name = 'self_attn.' + args.layers[i] + '_proj'
- n_out_dict[name] = round(layer[name].weight.data.shape[1] * r)
- else:
- name = args.layers[i]
- n_out_dict[name] = round(layer[name].weight.data.shape[1] * r / 4)
-
- quantizers = {}
- for i in range(len(layers)):
- layer = layers[i].to(dev)
- block_layers = find_layers(layer)
-
- if args.true_sequential:
- sequential = [
- ['self_attn.k_proj', 'self_attn.v_proj', 'self_attn.q_proj'],
- ['self_attn.out_proj'],
- ['fc1'],
- ['fc2']
- ]
- else:
- sequential = [list(block_layers.keys())]
-
- for names in sequential:
- subset = {n: block_layers[n] for n in names}
-
- gptq = {}
- for name in subset:
- gptq[name] = GPTQ_OWQ(subset[name], n_out=n_out_dict[name])
- gptq[name].quantizer = Quantizer()
- gptq[name].quantizer.configure(
- args.wbits, perchannel=True, sym=False, mse=(args.tuning == 'mse')
- )
- gptq[name].quantizer.n_out = n_out_dict[name]
-
- def add_batch(name):
- def tmp(_, inp, out):
- gptq[name].add_batch(inp[0].data, out.data)
- return tmp
- handles = []
- for name in subset:
- handles.append(subset[name].register_forward_hook(add_batch(name)))
- for j in range(args.nsamples):
- layer(inps[j].unsqueeze(0), attention_mask=attention_mask)
- for h in handles:
- h.remove()
-
- for name in subset:
- if not args.no_frob_norm:
- W = subset[name].weight.data.clone().to(torch.float)
- temp_quantizer = Quantizer()
- temp_quantizer.configure(args.wbits, perchannel=True, sym=False, mse=(args.tuning == 'mse'))
- temp_quantizer.find_params(W, weight=True, num=40)
- W_quant = temp_quantizer.quantize(W)
- frob_norm_error = (W - W_quant).pow(2).sum(dim=0)
- else:
-                    frob_norm_error = None  # fall back to pure Hessian ranking
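-                # Rank input channels by Hessian sensitivity (optionally scaled
-                # by the per-channel quantization error above) and reserve the
-                # top n_out channels as full-precision outliers.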
- out_ids = gptq[name].hessian_sorting(actorder=args.act_order, frob_norm=frob_norm_error)
- gptq[name].quantizer.out_ids = out_ids.cpu()
-
- if not args.no_frob_norm:
- del W
- del W_quant
- del temp_quantizer
- torch.cuda.empty_cache()
-
- for name in subset:
- print(f"Quantizing model.decoder.layers.{i}.{name}")
- gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order)
- quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer.cpu()
- gptq[name].free()
-
- for j in range(args.nsamples):
- outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
-
- layers[i] = layer.cpu()
- del layer
- del gptq
- torch.cuda.empty_cache()
-
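-        # Outputs of this quantized layer feed the next layer as inputs.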
- inps, outs = outs, inps
-
- model.config.use_cache = use_cache
-
- return quantizers
-
-@torch.no_grad()
-def opt_eval(model, testenc, dev):
- print('Evaluating ...')
-
- testenc = testenc.input_ids
- nsamples = testenc.numel() // model.seqlen
-
- use_cache = model.config.use_cache
- model.config.use_cache = False
- layers = model.model.decoder.layers
-
- model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev)
- model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
- if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
- model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
- if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
- model.model.decoder.project_in = model.model.decoder.project_in.to(dev)
- layers[0] = layers[0].to(dev)
-
- dtype = next(iter(model.parameters())).dtype
- inps = torch.zeros(
- (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
- )
- cache = {'i': 0, 'attention_mask': None}
-
- class Catcher(nn.Module):
- def __init__(self, module):
- super().__init__()
- self.module = module
- def forward(self, inp, **kwargs):
- inps[cache['i']] = inp
- cache['i'] += 1
- cache['attention_mask'] = kwargs['attention_mask']
- raise ValueError
- layers[0] = Catcher(layers[0])
- for i in range(nsamples):
- batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev)
- try:
- model(batch)
- except ValueError:
- pass
- layers[0] = layers[0].module
-
- layers[0] = layers[0].cpu()
- model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
- model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
- if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
- model.model.decoder.project_out = model.model.decoder.project_out.cpu()
- if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
- model.model.decoder.project_in = model.model.decoder.project_in.cpu()
- torch.cuda.empty_cache()
-
- outs = torch.zeros_like(inps)
- attention_mask = cache['attention_mask']
-
- for i in tqdm(range(len(layers))):
- layer = layers[i].to(dev)
-
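-        # --nearest: round-to-nearest baseline, quantizing weights directly
-        # without any GPTQ/OWQ error compensation.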
- if args.nearest:
- subset = find_layers(layer)
- for name in subset:
- quantizer = Quantizer()
- quantizer.configure(
- args.wbits, perchannel=True, sym=False, mse=False
- )
- W = subset[name].weight.data
- quantizer.find_params(W, weight=True)
- subset[name].weight.data = quantize(
- W, quantizer.scale, quantizer.zero, quantizer.maxq
- ).to(next(iter(layer.parameters())).dtype)
-
- for j in range(nsamples):
- outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
- layers[i] = layer.cpu()
- del layer
- torch.cuda.empty_cache()
- inps, outs = outs, inps
-
- if model.model.decoder.final_layer_norm is not None:
- model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(dev)
- if model.model.decoder.project_out is not None:
- model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
- model.lm_head = model.lm_head.to(dev)
-
- testenc = testenc.to(dev)
- nlls = []
- for i in range(nsamples):
- hidden_states = inps[i].unsqueeze(0)
- if model.model.decoder.final_layer_norm is not None:
- hidden_states = model.model.decoder.final_layer_norm(hidden_states)
- if model.model.decoder.project_out is not None:
- hidden_states = model.model.decoder.project_out(hidden_states)
- lm_logits = model.lm_head(hidden_states)
- shift_logits = lm_logits[:, :-1, :].contiguous()
- shift_labels = testenc[
- :, (i * model.seqlen):((i + 1) * model.seqlen)
- ][:, 1:]
- loss_fct = nn.CrossEntropyLoss()
- loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
- neg_log_likelihood = loss.float() * model.seqlen
- nlls.append(neg_log_likelihood)
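-    # Perplexity = exp(total NLL / number of evaluated tokens).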
- ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
- print(ppl.item())
-
- model.config.use_cache = use_cache
- return ppl.item()
-
-def load_quant3(model, checkpoint, faster=False):
- from transformers import OPTConfig, OPTForCausalLM
- config = OPTConfig.from_pretrained(model)
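-    # Build an uninitialized fp16 model skeleton; the quantized weights are
-    # loaded from the checkpoint below.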
- def noop(*args, **kwargs):
- pass
- torch.nn.init.kaiming_uniform_ = noop
- torch.nn.init.uniform_ = noop
- torch.nn.init.normal_ = noop
-
- torch.set_default_dtype(torch.half)
- transformers.modeling_utils._init_weights = False
- model = OPTForCausalLM(config)
- torch.set_default_dtype(torch.float)
- model = model.eval()
- layers = find_layers(model)
- for name in ['model.decoder.project_out', 'model.decoder.project_in', 'lm_head']:
- if name in layers:
- del layers[name]
-
- ckpt = torch.load(checkpoint)
- n_out_dict = ckpt['n_out_dict']
-
- make_quant3(model, n_out_dict, faster=faster)
-
- model.load_state_dict(ckpt['model_state_dict'])
- model.seqlen = model.config.max_position_embeddings
-
- return model
-
-def opt_multigpu(model, gpus):
- model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(gpus[0])
- model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(gpus[0])
- if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
- model.model.decoder.project_in = model.model.decoder.project_in.to(gpus[0])
- if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
- model.model.decoder.project_out = model.model.decoder.project_out.to(gpus[-1])
- if hasattr(model.model.decoder, 'final_layer_norm') and model.model.decoder.final_layer_norm:
- model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(gpus[-1])
- import copy
- import math
- model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1])
-
- cache = {'mask': None}
-
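-    # MoveModule wraps each decoder layer and moves its inputs (plus the shared
-    # attention mask) onto that layer's GPU: a simple pipeline-parallel split.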
- class MoveModule(nn.Module):
- def __init__(self, module):
- super().__init__()
- self.module = module
- self.dev = next(iter(self.module.parameters())).device
- def forward(self, *inp, **kwargs):
- inp = list(inp)
- if inp[0].device != self.dev:
- inp[0] = inp[0].to(self.dev)
- if cache['mask'] is None or cache['mask'].device != self.dev:
- cache['mask'] = kwargs['attention_mask'].to(self.dev)
- kwargs['attention_mask'] = cache['mask']
- tmp = self.module(*inp, **kwargs)
- return tmp
-
- layers = model.model.decoder.layers
- pergpu = math.ceil(len(layers) / len(gpus))
- for i in range(len(layers)):
- layers[i] = MoveModule(layers[i].to(gpus[i // pergpu]))
-
- model.gpus = gpus
-
-def benchmark(model, input_ids):
- dev = torch.device('cuda:0')
- input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else dev)
- torch.cuda.synchronize()
-
- cache = {'past': None}
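-    # Drop each layer's cached key/values right after the layer has consumed
-    # them, so the per-token benchmark does not accumulate KV memory.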
- def clear_past(i):
- def tmp(layer, inp, out):
- if cache['past']:
- cache['past'][i] = None
- return tmp
- for i, layer in enumerate(model.model.decoder.layers):
- layer.register_forward_hook(clear_past(i))
-
- print('Benchmarking ...')
-
- loss = nn.CrossEntropyLoss()
- tot = 0.
-
- def sync():
- if hasattr(model, 'gpus'):
- for gpu in model.gpus:
- torch.cuda.synchronize(gpu)
- else:
- torch.cuda.synchronize()
- with torch.no_grad():
- attention_mask = torch.ones((1, input_ids.numel()), device=dev)
- times = []
- for i in range(input_ids.numel()):
- tick = time.time()
-            out = model(
-                input_ids[:, i].reshape(1, -1),
-                past_key_values=cache['past'],
-                attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1))
-            )
- sync()
- times.append(time.time() - tick)
- if i != input_ids.numel() - 1:
- tot += loss(out.logits[0].to(dev), input_ids[:, (i + 1)].to(dev)).float()
- cache['past'] = list(out.past_key_values)
- del out
- sync()
-
- print('Median:', np.median(times))
- print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item())
-
-if __name__ == '__main__':
-
- parser = argparse.ArgumentParser()
-
- parser.add_argument(
- 'model', type=str,
- help='OPT model to load; pass `facebook/opt-X`.'
- )
- parser.add_argument(
- 'dataset', type=str, choices=['wikitext2', 'ptb', 'c4'],
- help='Where to extract calibration data from.'
- )
- parser.add_argument(
- '--nsamples', type=int, default=128,
- help='Number of calibration data samples.'
- )
- parser.add_argument(
- '--wbits', type=int, default=16, choices=[2, 3, 4, 16],
-        help='The number of bits to use for weight quantization; use 16 for evaluating the base model.'
- )
- parser.add_argument(
- '--target_bit', type=float, default=None,
-        help='Effective target bit-width for OWQ.'
- )
- parser.add_argument(
- '--tuning', type=str, default='mse', choices=['mse', 'minmax'],
- help='Method for quantization parameter tuning.'
- )
- parser.add_argument(
- '--no_frob_norm', action='store_true',
-        help='Disable the Frobenius-norm criterion for selecting OWQ outlier columns.'
- )
- parser.add_argument(
- '--percdamp', type=float, default=.01,
- help='Percent of the average Hessian diagonal to use for dampening.'
- )
- parser.add_argument(
- '--layers', nargs='+', type=str, default=None, choices=layer_list,
-        help='Layers to apply OWQ to.'
- )
- parser.add_argument(
- '--seed', type=int, default=0,
- help='Seed for sampling the calibration data.'
- )
- parser.add_argument(
- '--nearest', action='store_true',
-        help='Whether to run round-to-nearest (RTN) quantization.'
- )
- parser.add_argument(
- '--groupsize', type=int, default=-1,
-        help='Group size for fine-grained quantization; the default (-1) uses the full row.'
- )
-
- parser.add_argument(
- '--no-eval', action='store_true',
-        help='Skip perplexity evaluation on WikiText-2, PTB, and C4.'
- )
- parser.add_argument(
- '--save', type=str, default='',
- help='Save quantized checkpoint under this name.'
- )
- parser.add_argument(
- '--load', type=str, default='',
-        help='Load a fake-quantized or 3-bit packed checkpoint.'
- )
- parser.add_argument(
- '--logfile', type=str, default='',
-        help='Log file name.'
- )
- parser.add_argument(
- '--packing', action='store_true',
-        help='Whether to pack and save the 3-bit quantized model.'
- )
- parser.add_argument(
- '--faster-kernel', action='store_true',
-        help='Whether to save/load the 3-bit quantized model with the faster kernel for benchmarking.'
- )
- parser.add_argument(
- '--benchmark', type=int, default=0,
- help='Number of tokens to use for benchmarking.'
- )
-
- parser.add_argument(
- '--old-eval', action='store_true',
- help='Whether to use the old version of PTB and C4 evaluation.'
- )
- parser.add_argument(
- '--act-order', action='store_true',
-        help='Whether to apply the activation-order GPTQ heuristic.'
- )
- parser.add_argument(
- '--true-sequential', action='store_true',
-        help='Whether to run in true sequential mode.'
- )
-
- args = parser.parse_args()
- check_arguments(args)
-
- device = torch.device('cuda:0')
-
- def seed_all(seed):
- random.seed(seed)
- os.environ['PYTHONHASHSEED'] = str(seed)
- np.random.seed(seed)
- torch.manual_seed(seed)
- seed_all(args.seed)
-
- t = 0
- if args.load:
- print(f"Loading {args.load} ....")
- if args.packing:
- model = load_quant3(args.model, args.load, args.faster_kernel)
- else:
- model = get_opt(args.model)
- model.load_state_dict(torch.load(args.load))
- model.eval()
- print("Done.")
- else:
- model = get_opt(args.model)
- model.eval()
-
- if args.wbits < 16 and not args.nearest:
- dataloader = get_loaders(
- args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen, train=True
- )
- tick = time.time()
- quantizers = opt_sequential(model, dataloader, device)
- t = round((time.time() - tick),1)
- print(f"Running Time : {t}")
-
- if args.benchmark:
- dataloader = get_loaders(
- args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen, train=False
- )
- gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
- if len(gpus) > 1:
- opt_multigpu(model, gpus)
- else:
- model = model.to(device)
- if args.benchmark:
- input_ids = dataloader.input_ids[:, :args.benchmark]
- benchmark(model, input_ids)
- exit()
-
- t1 = time.time()
- ppl_scores = []
- if not args.no_eval:
- if args.old_eval:
- ppl_tasks = ['wikitext2', 'ptb', 'c4']
- else:
- ppl_tasks = ['wikitext2','ptb-new', 'c4-new']
- for dataset in ppl_tasks:
- testloader = get_loaders(
- dataset, seed=args.seed, model=args.model, seqlen=model.seqlen, train=False
- )
- print(dataset)
- ppl_score = opt_eval(model, testloader, device)
- ppl_scores.append((dataset,ppl_score))
- t2 = time.time() - t1
-
- if args.logfile:
-        with open(args.logfile, 'a') as fp:
-            add_str = (f"\nlayers : {args.layers} | target_bit : {args.target_bit}\n"
-                       if args.target_bit is not None else '\n')
- fp.write(f"model : {args.model} | owq time : {round(t/60,1)}m / eval time : {round(t2/60,1)}m | seed : {args.seed} {add_str}")
-            for _, score in ppl_scores:
-                fp.write(f"{score} ")
-            fp.write("\n\n")
-
- if args.save:
- torch.save(model.state_dict(), args.save)
- print(f"fake quantized model is saved to {args.save}")
- if args.packing and args.wbits == 3:
- temp = args.save.split('/')
- temp[-1] = 'pack3_' + f"{'faster_' if args.faster_kernel else ''}" + temp[-1]
- ckpt_path = '/'.join(temp)
- n_out_dict = {n: n_out_saver(quantizers[n].n_out) for n in quantizers}
- lm_pack3(model, quantizers, faster=args.faster_kernel)
- torch.save({
- 'model_state_dict' : model.state_dict(),
- 'n_out_dict' : n_out_dict}, ckpt_path)
- print(f"3bit quantized model is saved to {ckpt_path}")
- else:
- print("Only 3bits quantized model is supported")
diff --git a/owq/kernel/setup_cuda.py b/owq/kernel/setup_cuda.py
index 98d3263..eaef4fd 100644
--- a/owq/kernel/setup_cuda.py
+++ b/owq/kernel/setup_cuda.py
@@ -1,10 +1,31 @@
from setuptools import setup, Extension
from torch.utils import cpp_extension
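+# Compiler flags: OpenMP-enabled C++17 for the host code; fast math and
+# multi-threaded compilation (--threads=8) for the CUDA kernels.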
+extra_compile_args = {
+ "cxx": [
+ "-g",
+ "-O3",
+ "-fopenmp",
+ "-lgomp",
+ "-std=c++17",
+ ],
+ "nvcc": [
+ "-O3",
+ "-std=c++17",
+ "--expt-relaxed-constexpr",
+ "--expt-extended-lambda",
+ "--use_fast_math",
+ "--threads=8"
+ ],
+}
+
setup(
name='owq_cuda',
ext_modules=[cpp_extension.CUDAExtension(
- 'owq_cuda', ['owq_cuda.cpp', 'gemv.cu', 'dequant.cu']
+        name='owq_cuda',
+        sources=['owq_cuda.cpp', 'gemv.cu', 'dequant.cu'],
+ extra_compile_args=extra_compile_args,
)],
- cmdclass={'build_ext': cpp_extension.BuildExtension}
-)
+ cmdclass={'build_ext': cpp_extension.BuildExtension},
+ install_requires = ["torch"],
+)
\ No newline at end of file
diff --git a/owq/kernel/test_kernel.py b/owq/kernel/test_kernel.py
index f2c1805..0ca6b5d 100644
--- a/owq/kernel/test_kernel.py
+++ b/owq/kernel/test_kernel.py
@@ -132,8 +132,8 @@ def correctness(M=4*12288, N=12288, bits=3, outlieridx=[], faster=False):
if __name__=="__main__":
bits=3
- for d, model in zip([4096],['opt-6.7b']): # opt
- # for d, model in zip([12288],['opt-175b']): # opt
+ for d, model in zip([4096],['opt-6.7b']): # opt-6.7b
+ # for d, model in zip([12288],['opt-175b']): # opt-175b
n = 6
print(f'Benchmarking {model.upper()} matvec with outlier ...')
for M,N in [[d,d],[d,d*4],[d*4,d]]:
diff --git a/owq/quant.py b/owq/quant.py
index 1ee2e2f..83e226e 100644
--- a/owq/quant.py
+++ b/owq/quant.py
@@ -1,6 +1,7 @@
import numpy as np
import torch
import torch.nn as nn
+from transformers.models.falcon.modeling_falcon import FalconLinear
try:
import owq_cuda
@@ -200,7 +201,7 @@ def make_quant(module, n_out_infos, wbits, name=''):
for name1, child in module.named_children():
make_quant(child, n_out_infos, wbits, name + '.' + name1 if name != '' else name1)
-def lm_pack(model, quantinfos, wbits, linears=[nn.Linear]):
+def lm_pack(model, quantinfos, wbits, linears=[nn.Linear, FalconLinear]):
from owq.utils.misc import find_layers
layers = find_layers(model, linears)
layers = {n: layers[n] for n in quantinfos}
diff --git a/owq/utils/misc.py b/owq/utils/misc.py
index 3e2f6b5..287e96c 100644
--- a/owq/utils/misc.py
+++ b/owq/utils/misc.py
@@ -1,10 +1,11 @@
import torch
import torch.nn as nn
import math
+from transformers.models.falcon.modeling_falcon import FalconLinear
layer_list = ['q','k','v','qkv','o','out','dense','fc1','fc2','up','gate','down']
-def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
+def find_layers(module, layers=[nn.Linear, FalconLinear], name=''):
if type(module) in layers:
return {name: module}
res = {}
diff --git a/requirements.txt b/requirements.txt
index 2dad1e3..d16d45a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,8 @@
# torch==2.0.0
transformers
datasets
+accelerate
+peft
sacrebleu
sqlitedict
scikit-learn