diff --git a/.gitignore b/.gitignore index dec8915..e6214f3 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,9 @@ __pycache__ *.pth test.py test.ipynb -experiment \ No newline at end of file +experiment +analysis +output +rebuttal/ +*quant_cuda_kernel_* +demo* \ No newline at end of file diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 261eeb9..0000000 --- a/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." 
- - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. 
- - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. 
We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/README.md b/README.md index efe2188..fbb5885 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,23 @@ -# OWQ: Lessons learned from activation outliers for weight quantization in large language models +# [AAAI 2024 (Oral)]   OWQ: Outlier-Aware Weight Quantization for Efficient Fine-Tuning and Inference of Large Language Models -This is the code for the paper [OWQ: Lessons learned from activation outliers for weight quantization in large language models](https://arxiv.org/abs/2306.02272). OWQ preserves few weak columns as FP16, while quantizing other weights to 3/4-bits. OWQ achieves substantial quality improvements with only negligible storage and computation overhead, effectively preserving the benefits of low-precision acceleration. +

+<img src="./images/owq_figure.png">
+
+This is the code for the paper [OWQ: Outlier-Aware Weight Quantization for Efficient Fine-Tuning and Inference of Large Language Models](https://arxiv.org/abs/2306.02272). OWQ preserves a few weak columns as FP16 while quantizing the other weights to 3/4 bits. OWQ achieves substantial quality improvements with only negligible storage and computation overhead, effectively preserving the benefits of low-precision acceleration.
+
+<img src="./images/owq_llama.png">
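To make the mechanism concrete, here is a minimal sketch of the outlier-aware idea: keep a few sensitive ("weak") columns of each weight matrix in FP16 and fake-quantize the rest to low-bit values. This is an illustration under simplified assumptions, not the repository's implementation; the actual reconstruction lives in `owq/recon.py` and `main.py`, and the weak columns are selected by Hessian-based sensitivity rather than the arbitrary indices used here.

```python
# Minimal sketch of outlier-aware weight quantization (illustration only).
# A few "weak" columns stay in FP16; the rest are fake-quantized to wbits
# with per-row asymmetric min-max parameters.
import torch

def owq_fake_quant(W: torch.Tensor, weak_cols: torch.Tensor, wbits: int = 3) -> torch.Tensor:
    """W: (out_features, in_features) weight; weak_cols: column indices kept in FP16."""
    maxq = 2 ** wbits - 1
    mask = torch.ones(W.shape[1], dtype=torch.bool, device=W.device)
    mask[weak_cols] = False                      # columns not in weak_cols get quantized
    Wq = W.clone().float()
    sub = Wq[:, mask]
    wmin = sub.min(dim=1, keepdim=True).values.clamp(max=0)
    wmax = sub.max(dim=1, keepdim=True).values.clamp(min=0)
    scale = (wmax - wmin).clamp(min=1e-8) / maxq
    zero = torch.round(-wmin / scale)
    q = torch.clamp(torch.round(sub / scale) + zero, 0, maxq)
    Wq[:, mask] = (q - zero) * scale             # dequantized low-bit weights
    return Wq.to(W.dtype)                        # weak columns untouched (full precision)

# Example: keep 4 columns of a 4096x4096 layer in FP16, quantize the rest to 3 bits.
W = torch.randn(4096, 4096, dtype=torch.float16)
Wq = owq_fake_quant(W, torch.tensor([7, 123, 2048, 4000]), wbits=3)
```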

-## Updates (2024-01-22)
+## Updates (2024-01-29)
* Integrated all models (OPT, LLaMA, BLOOM, Falcon) into `main.py` file. You can easily add custom or open-accessible huggingface models to `model_config.json` if you want.
* Support 4bit matrix - FP16 vector product CUDA kernel.
* Support BFloat16.

## Features
* Implementation of the OWQ algorithm: `owq/recon.py`, `main.py`
-* 3/4-bit weight quantization of LLMs (OPT, LLaMA1,2 families and etc..): `main.py`
+* 3/4-bit weight quantization of LLMs (OPT, LLaMA-1/2 families, etc.): `main.py`
* Evaluating the perplexity of quantized models: `main.py`
* Evaluating the zero-shot accuracy of quantized models: `zeroshot.py`
* Supports 3/4-bit packed weight save / load (~1/5, ~1/4 file size of FP16 checkpoint, respectively.)
@@ -21,7 +28,7 @@ This is the code for the paper [OWQ: Lessons learned from activation outliers fo
* [Install](#install)
* [Usage](#usage)
* [Zero-shot](#zero-shot)
-* [3-bit CUDA kernel](#3-bit-cuda-kernels)
+* [3/4-bit CUDA kernels](#34-bit-cuda-kernels)

## Install
We highly recommend to use docker image that supports CUDA. If you use anaconda instead, you need to setup CUDA for kernel use.
@@ -67,43 +74,43 @@ We have tested 3/4-bit CUDA kernel on the NVIDIA A100, A6000 and RTX3090 GPU.

### Running OWQ & measuring the perplexity (PPL)
-Here we use OPT-1.3b model as an example. You can replace the model argument `opt-1.3b` among `opt-125m`, `opt-350m`, `opt-2.7b`, `opt-6.7b`, `opt-13b`, `opt-66b` or other models (e.g. `meta-llama/Llama-2-7b-hf`).
+Here we use the llama-7b model (huggyllama/llama-7b) as an example. You can replace the model argument `llama-7b` with `llama-13b`, `llama-30b`, or `llama-65b`, or with other model families (e.g. `meta-llama/Llama-2-7b-hf`, `facebook/opt-6.7b`, `lmsys/vicuna-33b-v1.3`, etc.).
* OWQ using 3.01-bit (3-bit quantization + few FP16 weight columns)
```
-python main.py facebook/opt-1.3b c4 --wbits 3 --target_bit 3.01
+python main.py huggyllama/llama-7b c4 --wbits 3 --target_bit 3.01
```
* OWQ using 4.01-bit (4-bit quantization + few FP16 weight columns)
```
-python main.py facebook/opt-1.3b c4 --wbits 4 --target_bit 4.01
+python main.py huggyllama/llama-7b c4 --wbits 4 --target_bit 4.01
```
Below are the example for the other options (FP16, RTN, GPTQ).
```
# Measuring the ppl of the full precision (FP16) model
-python main.py facebook/opt-1.3b c4 --wbits 16
+python main.py huggyllama/llama-7b c4 --wbits 16
# 4-bit Round-to-Nearest (RTN) quantization
-python main.py facebook/opt-1.3b c4 --wbits 4 --nearest
+python main.py huggyllama/llama-7b c4 --wbits 4 --nearest
# GPTQ with 3-bit quantization
-python main.py facebook/opt-1.3b c4 --wbits 3 --tuning minmax
+python main.py huggyllama/llama-7b c4 --wbits 3 --tuning minmax
```

### Zero-shot
-Here we give an example of measuring zero-shot accuracy on `lambada_openai` and `piqa` tasks using opt-125m model.
+Here we give an example of measuring zero-shot accuracy on the `hellaswag` task using the llama-7b model.
You need to generate quantized model checkpoint before measuring the zero-shot accuracy. 
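Before the checkpoint commands below, a rough note on what `--target_bit` and `--packing` imply for storage. This is back-of-the-envelope arithmetic under a simple averaging assumption, not the per-layer allocation the code actually performs:

```python
# Back-of-the-envelope numbers for OWQ checkpoints (illustrative assumptions only;
# the code distributes the FP16 "weak" columns per layer with its own bookkeeping).
wbits, target_bit, fp16_bits = 3, 3.01, 16

# If a fraction f of weight columns stays in FP16 and the rest uses wbits,
# the average bits per weight is roughly wbits + f * (fp16_bits - wbits).
f = (target_bit - wbits) / (fp16_bits - wbits)
print(f"approx. fraction of FP16 columns: {f:.4%}")        # ~0.08%

# Packed low-bit weights relative to an FP16 checkpoint (weights only),
# consistent with the README's "~1/5, ~1/4 file size" figures.
print(f"3-bit packed vs FP16: ~1/{fp16_bits / 3:.1f}")     # ~1/5.3
print(f"4-bit packed vs FP16:  1/{fp16_bits / 4:.0f}")     # 1/4
```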
``` # making checkpoint file of OWQ reconstruction -python main.py facebook/opt-125m c4 --wbits 3 --target_bit 3.05 --no-eval --save opt-125m_3_05.pth --packing +python main.py huggyllama/llama-7b c4 --wbits 3 --target_bit 3.01 --no-eval --save llama-7b_3_01.pth --packing -# measuring zero-shot accuracy (single-gpu) -CUDA_VISIBLE_DEVICES=0 python zeroshot.py --model hf-causal-owq --model_args pretrained=facebook/opt-125m,load=opt-125m_3_05.pth --batch_size 4 --tasks lambada_openai --no_cache +# measuring zero-shot accuracy (using single-gpu) +CUDA_VISIBLE_DEVICES=0 python zeroshot.py --model hf-causal-owq --model_args pretrained=huggyllama/llama-7b,load=llama-7b_3_01.pth --batch_size 4 --tasks hellaswag --no_cache # multi-gpu -CUDA_VISIBLE_DEVICES=0,1 python zeroshot.py --model hf-causal-owq --model_args pretrained=facebook/opt-125m,load=opt-125m_3_05.pth,use_accelerate=True --batch_size 4 --tasks lambada_openai --no_cache +CUDA_VISIBLE_DEVICES=0,1 python zeroshot.py --model hf-causal-owq --model_args pretrained=huggyllama/llama-7b,load=llama-7b_3_01.pth,use_accelerate=True --batch_size 4 --tasks hellaswag --no_cache ``` -### Easy OWQ + Measuring PPL, Zeroshot sample +### Easy OPT OWQ + Measuring PPL, Zeroshot sample ``` bash scripts/opt_end_to_end_evaluation.sh 0 opt-1.3b ``` @@ -111,7 +118,7 @@ bash scripts/opt_end_to_end_evaluation.sh 0 opt-1.3b ## Demo Please refer to the README in the `demo` directory. -## 3-bit CUDA Kernels +## 3/4-bit CUDA Kernels ### Benchmark kernel performance ``` @@ -120,9 +127,9 @@ cd owq/kernel/ python test_kernel.py ``` -### Benchmark language generation with 3/4-bit packed model (opt, llama) +### Benchmark language generation with 3/4-bit packed model (opt, llama, etc...) ``` -# Example of OPT-65b language generation (single token) +# Example of OPT-66b language generation (single token) # Save compressed model python main.py facebook/opt-66b c4 --wbits 3 --target_bit 3.01 --no-eval --save opt-66b_3_01.pth --packing @@ -157,4 +164,4 @@ If you find our code or OWQ useful for your research, please consider citing: journal={arXiv preprint arXiv:2306.02272}, year={2023} } -``` \ No newline at end of file +``` diff --git a/bloom.py b/bloom.py deleted file mode 100644 index c0b3d42..0000000 --- a/bloom.py +++ /dev/null @@ -1,445 +0,0 @@ -import time - -import torch -import torch.nn as nn - -import transformers - -from owq.recon import GPTQ_OWQ -from owq.quant import * -from owq.utils.misc import find_layers, check_arguments -from owq.utils.datautils import * - -import argparse -import random -import os -import numpy as np -from tqdm import tqdm - -layer_list = ['qkv','dense','fc1','fc2'] -n_out_dict = {'self_attention.query_key_value':0, - 'self_attention.dense':0, - 'mlp.dense_h_to_4h':0, - 'mlp.dense_4h_to_h':0} - -def get_bloom(model): - import torch - def skip(*args, **kwargs): - pass - torch.nn.init.kaiming_uniform_ = skip - torch.nn.init.uniform_ = skip - torch.nn.init.normal_ = skip - from transformers import BloomForCausalLM - model = BloomForCausalLM.from_pretrained(model, torch_dtype='auto') - model.seqlen = 2048 - return model - -@torch.no_grad() -def bloom_sequential(model, dataloader, dev, means=None, stds=None): - print('Starting ...') - - use_cache = model.config.use_cache - model.config.use_cache = False - layers = model.transformer.h - - model.transformer.word_embeddings = model.transformer.word_embeddings.to(dev) - model.transformer.word_embeddings_layernorm = model.transformer.word_embeddings_layernorm.to(dev) - layers[0] = layers[0].to(dev) - - 
dtype = next(iter(model.parameters())).dtype - inps = torch.zeros( - (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev - ) - cache = {'i': 0, 'attention_mask': None, 'alibi': None} - - class Catcher(nn.Module): - def __init__(self, module): - super().__init__() - self.module = module - def forward(self, inp, **kwargs): - inps[cache['i']] = inp - cache['i'] += 1 - cache['attention_mask'] = kwargs['attention_mask'] - cache['alibi'] = kwargs['alibi'] - raise ValueError - layers[0] = Catcher(layers[0]) - for batch in dataloader: - try: - model(batch[0].to(dev)) - except ValueError: - pass - layers[0] = layers[0].module - - layers[0] = layers[0].cpu() - model.transformer.word_embeddings = model.transformer.word_embeddings.cpu() - model.transformer.word_embeddings_layernorm = model.transformer.word_embeddings_layernorm.cpu() - torch.cuda.empty_cache() - - outs = torch.zeros_like(inps) - attention_mask = cache['attention_mask'] - alibi = cache['alibi'] - - print('Ready.') - - if args.target_bit is not None: - args.layers = layer_list if args.layers is None else args.layers - n_mp_layers = len(args.layers) - if 'qkv' in args.layers: - n_mp_layers += 2 # q k v - - r = (12 / (16 - args.wbits)) * (args.target_bit - args.wbits) - # r = (args.target_bit - args.wbits) * 16 / 12 - r /= n_mp_layers - - layer = find_layers(layers[0]) - - for i in range(len(args.layers)): - if args.layers[i] == 'qkv': - name = 'self_attention.query_key_value' - n_out_dict[name] = round(layer[name].weight.data.shape[1] * r) * 3 - elif args.layers[i] == 'dense': - name = 'self_attention.dense' - n_out_dict[name] = round(layer[name].weight.data.shape[1] * r) - elif args.layers[i] == 'fc1': - name = 'mlp.dense_h_to_4h' - n_out_dict[name] = round(layer[name].weight.data.shape[1] * r / 4) - elif args.layers[i] == 'fc2': - name = 'mlp.dense_4h_to_h' - n_out_dict[name] = round(layer[name].weight.data.shape[1] * r / 4) - - quantizers = {} - for i in range(len(layers)): - layer = layers[i].to(dev) - block_layers = find_layers(layer) - - if args.true_sequential: - sequential = [ - ['self_attention.query_key_value'], ['self_attention.dense'], - ['mlp.dense_h_to_4h'], ['mlp.dense_4h_to_h'] - ] - else: - sequential = [list(block_layers.keys())] - - for names in sequential: - subset = {n: block_layers[n] for n in names} - - gptq = {} - for name in subset: - gptq[name] = GPTQ_OWQ(subset[name], n_out=n_out_dict[name]) - gptq[name].quantizer = Quantizer() - gptq[name].quantizer.configure( - args.wbits, perchannel=True, sym=False, mse=(args.tuning == 'mse') - ) - gptq[name].quantizer.n_out = n_out_dict[name] - - def add_batch(name): - def tmp(_, inp, out): - gptq[name].add_batch(inp[0].data, out.data) - return tmp - handles = [] - for name in subset: - handles.append(subset[name].register_forward_hook(add_batch(name))) - for j in range(args.nsamples): - layer(inps[j].unsqueeze(0), attention_mask=attention_mask, alibi=alibi) - for h in handles: - h.remove() - - for name in names: - if name.endswith('query_key_value') and args.target_bit is not None: - name = 'self_attention.query_key_value' - layer_qkv = subset[name] - W_q, W_k, W_v = torch.chunk(layer_qkv.weight.data, 3, dim=0) - W_attn_dict = {'self_attention.query':W_q, 'self_attention.key':W_k, 'self_attention.value':W_v} - for name1 in W_attn_dict: - W = W_attn_dict[name1] - subset[name1] = nn.Linear(W.shape[1], W.shape[0], device=W.device, dtype=W.dtype) - subset[name1].weight.data = W.clone() - gptq[name1] = GPTQ_OWQ(subset[name], n_out=n_out_dict[name]) - 
gptq[name1].quantizer = Quantizer() - gptq[name1].quantizer.configure( - args.wbits, perchannel=True, sym=False, mse=(args.tuning == 'mse') - ) - gptq[name1].quantizer.n_out = n_out_dict[name] // 3 - gptq[name1].H = gptq[name].H.clone() - - del subset[name] - del W_q, W_k, W_v - del gptq[name] - torch.cuda.empty_cache() - break - - for name in subset: - if not args.no_frob_norm: - W = subset[name].weight.data.clone().to(torch.float) - temp_quantizer = Quantizer() - temp_quantizer.configure(args.wbits, perchannel=True, sym=False, mse=(args.tuning == 'mse')) - temp_quantizer.find_params(W, weight=True, num=40) - W_quant = temp_quantizer.quantize(W) - frob_norm_error = (W - W_quant).pow(2).sum(dim=0) - else: - frob_norm_error = None - out_ids = gptq[name].hessian_sorting(actorder=args.act_order, frob_norm=frob_norm_error) - gptq[name].quantizer.out_ids = out_ids.cpu() - - if not args.no_frob_norm: - del W - del W_quant - del temp_quantizer - torch.cuda.empty_cache() - - for name in subset: - print(f"Quantizing model.decoder.layers.{i}.{name}") - gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) - gptq[name].free() - - for name in names: - if name.endswith('query_key_value') and args.target_bit is not None: - W_qkv = [subset[n].weight.data.clone() for n in W_attn_dict] - layer_qkv.weight.data = torch.concat(W_qkv,dim=0) - del W_qkv - break - - for j in range(args.nsamples): - outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, alibi=alibi)[0] - - layers[i] = layer.cpu() - del layer - del gptq - torch.cuda.empty_cache() - - inps, outs = outs, inps - - model.config.use_cache = use_cache - -@torch.no_grad() -def bloom_eval(model, testenc, dev): - print('Evaluation...') - - testenc = testenc.input_ids - nsamples = testenc.numel() // model.seqlen - - use_cache = model.config.use_cache - model.config.use_cache = False - layers = model.transformer.h - - model.transformer.word_embeddings = model.transformer.word_embeddings.to(dev) - model.transformer.word_embeddings_layernorm = model.transformer.word_embeddings_layernorm.to(dev) - layers[0] = layers[0].to(dev) - - dtype = next(iter(model.parameters())).dtype - inps = torch.zeros( - (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev - ) - cache = {'i': 0, 'attention_mask': None, 'alibi': None} - - class Catcher(nn.Module): - def __init__(self, module): - super().__init__() - self.module = module - def forward(self, inp, **kwargs): - inps[cache['i']] = inp - cache['i'] += 1 - cache['attention_mask'] = kwargs['attention_mask'] - cache['alibi'] = kwargs['alibi'] - raise ValueError - layers[0] = Catcher(layers[0]) - for i in range(nsamples): - batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev) - try: - model(batch) - except ValueError: - pass - layers[0] = layers[0].module - - layers[0] = layers[0].cpu() - model.transformer.word_embeddings = model.transformer.word_embeddings.cpu() - model.transformer.word_embeddings_layernorm = model.transformer.word_embeddings_layernorm.cpu() - torch.cuda.empty_cache() - - outs = torch.zeros_like(inps) - attention_mask = cache['attention_mask'] - alibi = cache['alibi'] - - for i in tqdm(range(len(layers))): - layer = layers[i].to(dev) - - if args.nearest: - subset = find_layers(layer) - for name in subset: - quantizer = Quantizer() - quantizer.configure( - args.wbits, perchannel=True, sym=False, mse=False - ) - W = subset[name].weight.data - quantizer.find_params(W, weight=True) - subset[name].weight.data = 
quantize( - W, quantizer.scale, quantizer.zero, quantizer.maxq - ).to(next(iter(layer.parameters())).dtype) - - for j in range(nsamples): - outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, alibi=alibi)[0] - layers[i] = layer.cpu() - del layer - torch.cuda.empty_cache() - inps, outs = outs, inps - - model.transformer.ln_f = model.transformer.ln_f.to(dev) - model.lm_head = model.lm_head.to(dev) - - testenc = testenc.to(dev) - nlls = [] - for i in range(nsamples): - hidden_states = inps[i].unsqueeze(0) - hidden_states = model.transformer.ln_f(hidden_states) - lm_logits = model.lm_head(hidden_states) - shift_logits = lm_logits[:, :-1, :].contiguous() - shift_labels = testenc[ - :, (i * model.seqlen):((i + 1) * model.seqlen) - ][:, 1:] - loss_fct = nn.CrossEntropyLoss() - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - neg_log_likelihood = loss.float() * model.seqlen - nlls.append(neg_log_likelihood) - ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) - print(ppl.item()) - - model.config.use_cache = use_cache - return ppl.item() - -if __name__ == '__main__': - - parser = argparse.ArgumentParser() - - parser.add_argument( - 'model', type=str, - help='BLOOM model to load; pass `bigscience/bloom-X`.' - ) - parser.add_argument( - 'dataset', type=str, choices=['wikitext2', 'ptb', 'c4'], - help='Where to extract calibration data from.' - ) - parser.add_argument( - '--nsamples', type=int, default=128, - help='Number of calibration data samples.' - ) - parser.add_argument( - '--wbits', type=int, default=16, choices=[2, 3, 4, 16], - help='The number of bits to use for weight quantization; use 16 for evaluating base model.' - ) - parser.add_argument( - '--target_bit', type=float, default=None, - help='Effctive target bits for OWQ.' - ) - parser.add_argument( - '--tuning', type=str, default='mse', choices=['mse', 'minmax'], - help='Method for quantization parameter tuning.' - ) - parser.add_argument( - '--no_frob_norm', action='store_true', - help='Whether to use Frobenius norm for OWQ.' - ) - parser.add_argument( - '--percdamp', type=float, default=.01, - help='Percent of the average Hessian diagonal to use for dampening.' - ) - parser.add_argument( - '--layers', nargs='+', type=str, default=None, choices=layer_list, - help='Layers to apply OWQ.' - ) - parser.add_argument( - '--seed', type=int, default=0, - help='Seed for sampling the calibration data.' - ) - parser.add_argument( - '--nearest', action='store_true', - help='Whether to run the round-to-nearest quantization.' - ) - parser.add_argument( - '--groupsize', type=int, default=-1, - help='Groupsize for fine-grained quantization; default uses full row.' - ) - - parser.add_argument( - '--no-eval', action='store_true', - help='Whether to evaluate model on WikiText-2, PTB and C4' - ) - parser.add_argument( - '--save', type=str, default='', - help='Save quantized checkpoint under this name.' - ) - parser.add_argument( - '--load', type=str, default='', - help='Load fake or 3bit quantized checkpoint.' - ) - parser.add_argument( - '--logfile', type=str, default='', - help='Logging file name' - ) - - parser.add_argument( - '--old-eval', action='store_true', - help='Whether to use the old version of PTB and C4 evaluation.' - ) - parser.add_argument( - '--act-order', action='store_true', - help='Whether to apply the activation order GPTQ heuristic' - ) - parser.add_argument( - '--true-sequential', action='store_true', - help='Whether to run in true sequential model.' 
- ) - - args = parser.parse_args() - check_arguments(args) - device = torch.device('cuda:0') - - def seed_all(seed): - random.seed(seed) - os.environ['PYTHONHASHSEED'] = str(seed) - np.random.seed(seed) - torch.manual_seed(seed) - seed_all(args.seed) - - model = get_bloom(args.model) - model.eval() - t = 0 - if args.load: - print(f"Loading {args.load} ....") - model.load_state_dict(torch.load(args.load)) - print("Done.") - else: - dataloader = get_loaders( - args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen, train=True - ) - if args.wbits < 16 and not args.nearest: - tick = time.time() - quantizers = bloom_sequential(model, dataloader, device) - t = round((time.time() - tick),1) - print(f"Running Time : {t}") - - t1 = time.time() - ppl_scores = [] - if not args.no_eval: - if args.old_eval: - ppl_tasks = ['wikitext2', 'ptb', 'c4'] - else: - ppl_tasks = ['wikitext2','ptb-new', 'c4-new'] - for dataset in ppl_tasks: - testloader = get_loaders( - dataset, seed=args.seed, model=args.model, seqlen=model.seqlen, train=False - ) - print(dataset) - ppl_score = bloom_eval(model, testloader, device) - ppl_scores.append((dataset,ppl_score)) - t2 = time.time() - t1 - - if args.logfile: - with open(f'{args.logfile}','a') as fp: - add_str = f"\nlayers : {args.layers}" + f"| target_bit : {args.target_bit}\n" if args.target_bit is not None else '\n' - fp.write(f"model : {args.model} | owq time : {round(t/60,1)}m / eval time : {round(t2/60,1)}m | seed : {args.seed} {add_str}") - for i in range(len(ppl_scores)): - fp.write(f"{ppl_scores[i][1]} ") - fp.write(f"\n\n") - - if args.save: - torch.save(model.state_dict(), args.save) diff --git a/demo/README.md b/demo/README.md index c80cb45..f91c21a 100644 --- a/demo/README.md +++ b/demo/README.md @@ -26,7 +26,7 @@ python demo_2model.py lmsys/vicuna-7b-v1.3 lmsys/vicuna-33b-v1.3 --load2 {quanti ``` Then you can get accessible Link to the demo page. Please enjoy! -Note that **Quantized Vicuna-33B model using our OWQ method gives comparable or better chat quality, with similar memory usage comparing to FP vicuna-7B model.** +Note that **Quantized Vicuna-33B model using our OWQ method gives comparable or better chat quality, with similar memory usage compared to the FP vicuna-7B model.** ### LLaMA-2 70B + OWQ 3.01 bit @@ -40,7 +40,7 @@ python demo_llama2_70b.py meta-llama/Llama-2-70b-chat-hf --load {quantized-llama python demo_llama2_70b.py meta-llama/Llama-2-70b-chat-hf --load {quantized-llama-2-70b-weight-location} --gpus 0,1 ``` -Please Note that we can run powerful chatbot model based on **LLaMA-2 70B** model just using **2x consumer GPUs (RTX 3090)**. +Please note that we can run powerful chatbot model based on **LLaMA-2 70B** model just using **2x consumer GPUs (RTX 3090)**. diff --git a/demo/demo_2model.py b/demo/demo_2model.py index d4f1619..ace16bf 100644 --- a/demo/demo_2model.py +++ b/demo/demo_2model.py @@ -17,9 +17,6 @@ def main(args): assert len(args.gpus.split(',')) == 2, "Two GPU devices are required. 
Please enter them separated by commas" - os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" - os.environ["CUDA_VISIBLE_DEVICES"]=args.gpus - global id1, id2 id1, id2 = args.gpus.split(',') fmodel_name = args.fmodel.split('/')[-1].upper() diff --git a/demo/demo_llama2_70b.py b/demo/demo_llama2_70b.py index 63f56dc..dfc0df6 100644 --- a/demo/demo_llama2_70b.py +++ b/demo/demo_llama2_70b.py @@ -85,8 +85,6 @@ def main(args): multigpu = True if len(gpus_list) > 1 else False if multigpu: - os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" - os.environ["CUDA_VISIBLE_DEVICES"]=args.gpus id1, id2 = gpus_list dev1 = torch.device(f'cuda:{id1}') diff --git a/images/owq_figure.png b/images/owq_figure.png new file mode 100644 index 0000000..6ced283 Binary files /dev/null and b/images/owq_figure.png differ diff --git a/images/owq_llama.png b/images/owq_llama.png new file mode 100644 index 0000000..d989829 Binary files /dev/null and b/images/owq_llama.png differ diff --git a/llama.py b/llama.py deleted file mode 100644 index 3fd5bf0..0000000 --- a/llama.py +++ /dev/null @@ -1,573 +0,0 @@ -import time - -import torch -import torch.nn as nn - -import transformers - -from owq.recon import GPTQ_OWQ -from owq.quant import * -from owq.utils.misc import find_layers, check_arguments -from owq.utils.datautils import * - -import argparse -import random -import os -import numpy as np -from tqdm import tqdm - -layer_list = ['k','v','q','o','up','gate','down'] -n_out_dict = {'self_attn.k_proj':0, - 'self_attn.v_proj':0, - 'self_attn.q_proj':0, - 'self_attn.o_proj':0, - 'mlp.up_proj':0, - 'mlp.gate_proj':0, - 'mlp.down_proj':0 } - -def get_llama(model): - import torch - def skip(*args, **kwargs): - pass - torch.nn.init.kaiming_uniform_ = skip - torch.nn.init.uniform_ = skip - torch.nn.init.normal_ = skip - from transformers import LlamaForCausalLM - model = LlamaForCausalLM.from_pretrained(model, torch_dtype='auto') - model.seqlen = 2048 - return model - -@torch.no_grad() -def llama_sequential(model, dataloader, dev): - print('Starting ...') - - use_cache = model.config.use_cache - model.config.use_cache = False - layers = model.model.layers - - model.model.embed_tokens = model.model.embed_tokens.to(dev) - model.model.norm = model.model.norm.to(dev) - layers[0] = layers[0].to(dev) - - dtype = next(iter(model.parameters())).dtype - inps = torch.zeros( - (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev - ) - - cache = {'i': 0, 'attention_mask': None} - - class Catcher(nn.Module): - def __init__(self, module): - super().__init__() - self.module = module - def forward(self, inp, **kwargs): - inps[cache['i']] = inp - cache['i'] += 1 - cache['attention_mask'] = kwargs['attention_mask'] - cache['position_ids'] = kwargs['position_ids'] - raise ValueError - - layers[0] = Catcher(layers[0]) - for batch in dataloader: - try: - model(batch[0].to(dev)) - except ValueError: - pass - layers[0] = layers[0].module - layers[0] = layers[0].cpu() - model.model.embed_tokens = model.model.embed_tokens.cpu() - model.model.norm = model.model.norm.cpu() - torch.cuda.empty_cache() - - outs = torch.zeros_like(inps) - attention_mask = cache['attention_mask'] - position_ids = cache['position_ids'] - - print('Ready.') - - if args.target_bit is not None: - args.layers = layer_list if args.layers is None else args.layers - n_mp_layers = len(args.layers) - - r = (12 / (16 - args.wbits)) * (args.target_bit - args.wbits) - # r = (args.target_bit - args.wbits) * 16 / 12 - r /= n_mp_layers - - layer = find_layers(layers[0]) - - for i 
in range(len(args.layers)): - if args.layers[i] in ('k','v','q','o'): - name = 'self_attn.' + args.layers[i] + '_proj' - n_out_dict[name] = round(layer[name].weight.data.shape[1] * r) - else: - name = 'mlp.' + args.layers[i] + '_proj' - n_out_dict[name] = round(layer[name].weight.data.shape[1] * r * 3 / 8) - - quantizers = {} - for i in range(len(layers)): - layer = layers[i].to(dev) - block_layers = find_layers(layer) - - if args.true_sequential: - sequential = [ - ['self_attn.k_proj', 'self_attn.v_proj', 'self_attn.q_proj'], - ['self_attn.o_proj'], - ['mlp.up_proj', 'mlp.gate_proj'], - ['mlp.down_proj'] - ] - else: - sequential = [list(block_layers.keys())] - - for names in sequential: - subset = {n: block_layers[n] for n in names} - - gptq = {} - for name in subset: - gptq[name] = GPTQ_OWQ(subset[name], n_out=n_out_dict[name]) - gptq[name].quantizer = Quantizer() - gptq[name].quantizer.configure( - args.wbits, perchannel=True, sym=False, mse=(args.tuning == 'mse') - ) - gptq[name].quantizer.n_out = n_out_dict[name] - - def add_batch(name): - def tmp(_, inp, out): - gptq[name].add_batch(inp[0].data, out.data) - return tmp - handles = [] - for name in subset: - handles.append(subset[name].register_forward_hook(add_batch(name))) - for j in range(args.nsamples): - layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] - for h in handles: - h.remove() - - for name in subset: - if not args.no_frob_norm: - W = subset[name].weight.data.clone().to(torch.float) - temp_quantizer = Quantizer() - temp_quantizer.configure(args.wbits, perchannel=True, sym=False, mse=(args.tuning == 'mse')) - temp_quantizer.find_params(W, weight=True, num=40) - W_quant = temp_quantizer.quantize(W) - frob_norm_error = (W - W_quant).pow(2).sum(dim=0) - else: - frob_norm_error = None - out_ids = gptq[name].hessian_sorting(actorder=args.act_order, frob_norm=frob_norm_error) - gptq[name].quantizer.out_ids = out_ids.cpu() - - if not args.no_frob_norm: - del W - del W_quant - del temp_quantizer - torch.cuda.empty_cache() - - for name in subset: - print(f"Quantizing model.decoder.layers.{i}.{name}") - gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) - quantizers['model.layers.%d.%s' % (i, name)] = gptq[name].quantizer.cpu() - gptq[name].free() - - for j in range(args.nsamples): - outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] - outs = torch.nan_to_num(outs) - - layers[i] = layer.cpu() - del layer - del gptq - torch.cuda.empty_cache() - - inps, outs = outs, inps - - model.config.use_cache = use_cache - - return quantizers - -@torch.no_grad() -def llama_eval(model, testenc, dev): - print('Evaluating ...') - - testenc = testenc.input_ids - nsamples = testenc.numel() // model.seqlen - - use_cache = model.config.use_cache - model.config.use_cache = False - layers = model.model.layers - - model.model.embed_tokens = model.model.embed_tokens.to(dev) - layers[0] = layers[0].to(dev) - - dtype = next(iter(model.parameters())).dtype - inps = torch.zeros( - (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev - ) - cache = {'i': 0, 'attention_mask': None} - - class Catcher(nn.Module): - def __init__(self, module): - super().__init__() - self.module = module - def forward(self, inp, **kwargs): - inps[cache['i']] = inp - cache['i'] += 1 - cache['attention_mask'] = kwargs['attention_mask'] - cache['position_ids'] = kwargs['position_ids'] - raise ValueError - layers[0] = 
Catcher(layers[0]) - for i in range(nsamples): - batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev) - try: - model(batch) - except ValueError: - pass - layers[0] = layers[0].module - - layers[0] = layers[0].cpu() - model.model.embed_tokens = model.model.embed_tokens.cpu() - torch.cuda.empty_cache() - - outs = torch.zeros_like(inps) - attention_mask = cache['attention_mask'] - position_ids = cache['position_ids'] - - for i in tqdm(range(len(layers))): - layer = layers[i].to(dev) - - if args.nearest: - subset = find_layers(layer) - for name in subset: - quantizer = Quantizer() - quantizer.configure( - args.wbits, perchannel=True, sym=False, mse=False - ) - W = subset[name].weight.data - quantizer.find_params(W, weight=True) - subset[name].weight.data = quantize( - W, quantizer.scale, quantizer.zero, quantizer.maxq - ).to(next(iter(layer.parameters())).dtype) - - for j in range(nsamples): - outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] - outs = torch.nan_to_num(outs) - - layers[i] = layer.cpu() - del layer - torch.cuda.empty_cache() - inps, outs = outs, inps - - if model.model.norm is not None: - model.model.norm = model.model.norm.to(dev) - model.lm_head = model.lm_head.to(dev) - - testenc = testenc.to(dev) - nlls = [] - for i in range(nsamples): - hidden_states = inps[i].unsqueeze(0) - if model.model.norm is not None: - hidden_states = model.model.norm(hidden_states) - lm_logits = model.lm_head(hidden_states) - shift_logits = lm_logits[:, :-1, :].contiguous() - shift_labels = testenc[ - :, (i * model.seqlen):((i + 1) * model.seqlen) - ][:, 1:] - loss_fct = nn.CrossEntropyLoss() - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - neg_log_likelihood = loss.float() * model.seqlen - nlls.append(neg_log_likelihood) - ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) - print(ppl.item()) - - model.config.use_cache = use_cache - return ppl.item() - -def load_quant3(model, checkpoint, faster=False): - from transformers import LlamaConfig, LlamaForCausalLM - config = LlamaConfig.from_pretrained(model) - def noop(*args, **kwargs): - pass - torch.nn.init.kaiming_uniform_ = noop - torch.nn.init.uniform_ = noop - torch.nn.init.normal_ = noop - - torch.set_default_dtype(torch.half) - transformers.modeling_utils._init_weights = False - torch.set_default_dtype(torch.half) - model = LlamaForCausalLM(config) - torch.set_default_dtype(torch.float) - model = model.eval() - layers = find_layers(model) - for name in ['lm_head']: - if name in layers: - del layers[name] - - ckpt = torch.load(checkpoint) - n_out_dict = ckpt['n_out_dict'] - - make_quant3(model, n_out_dict, faster=faster) - - model.load_state_dict(ckpt['model_state_dict']) - model.seqlen = model.config.max_position_embeddings - - return model - -def llama_multigpu(model, gpus): - model.model.embed_tokens = model.model.embed_tokens.to(gpus[0]) - model.model.norm = model.model.norm.to(gpus[-1]) - import copy - import math - model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1]) - - cache = {'mask': None, 'pos_ids': None} - - class MoveModule(nn.Module): - def __init__(self, module): - super().__init__() - self.module = module - self.dev = next(iter(self.module.parameters())).device - def forward(self, *inp, **kwargs): - inp = list(inp) - if inp[0].device != self.dev: - inp[0] = inp[0].to(self.dev) - if cache['mask'] is None or cache['mask'].device != self.dev: - cache['mask'] = kwargs['attention_mask'].to(self.dev) - if 
cache['pos_ids'] is None or cache['pos_ids'].device != self.dev: - cache['pos_ids'] = kwargs['position_ids'].to(self.dev) - kwargs['attention_mask'] = cache['mask'] - kwargs['position_ids'] = cache['pos_ids'] - tmp = self.module(*inp, **kwargs) - return tmp - - layers = model.model.layers - pergpu = math.ceil(len(layers) / len(gpus)) - for i in range(len(layers)): - layers[i] = MoveModule(layers[i].to(gpus[i // pergpu])) - - model.gpus = gpus - -def benchmark(model, input_ids): - dev = torch.device('cuda:0') - input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else dev) - torch.cuda.synchronize() - - cache = {'past': None} - def clear_past(i): - def tmp(layer, inp, out): - if cache['past']: - cache['past'][i] = None - return tmp - for i, layer in enumerate(model.model.layers): - layer.register_forward_hook(clear_past(i)) - - print('Benchmarking ...') - - loss = nn.CrossEntropyLoss() - tot = 0. - - def sync(): - if hasattr(model, 'gpus'): - for gpu in model.gpus: - torch.cuda.synchronize(gpu) - else: - torch.cuda.synchronize() - with torch.no_grad(): - attention_mask = torch.ones((1, input_ids.numel()), device=dev) - position_ids = torch.arange(0,input_ids.numel(), device=dev) - times = [] - for i in range(input_ids.numel()): - print(i) - tick = time.time() - out = model(input_ids[:, i].reshape(1,-1),past_key_values=cache['past'], - attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1)), - position_ids=position_ids[i]) - sync() - times.append(time.time() - tick) - if i != input_ids.numel() - 1: - tot += loss(out.logits[0].to(dev), input_ids[:, (i + 1)].to(dev)).float() - cache['past'] = list(out.past_key_values) - del out - sync() - - print('Median:', np.median(times)) - print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item()) - -if __name__ == '__main__': - - parser = argparse.ArgumentParser() - - parser.add_argument( - 'model', type=str, - help='LlaMa model to load; /path/to/llama_hf' - ) - parser.add_argument( - 'dataset', type=str, choices=['wikitext2', 'ptb', 'c4'], - help='Where to extract calibration data from.' - ) - parser.add_argument( - '--nsamples', type=int, default=128, - help='Number of calibration data samples.' - ) - parser.add_argument( - '--wbits', type=int, default=16, choices=[2, 3, 4, 16], - help='The number of bits to use for weight quantization; use 16 for evaluating base model.' - ) - parser.add_argument( - '--target_bit', type=float, default=None, - help='Effctive target bits for OWQ.' - ) - parser.add_argument( - '--tuning', type=str, default='mse', choices=['mse', 'minmax'], - help='Method for quantization parameter tuning.' - ) - parser.add_argument( - '--no_frob_norm', action='store_true', - help='Whether to use Frobenius norm for OWQ.' - ) - parser.add_argument( - '--percdamp', type=float, default=.01, - help='Percent of the average Hessian diagonal to use for dampening.' - ) - parser.add_argument( - '--layers', nargs='+', type=str, default=None, choices=layer_list, - help='Layers to apply OWQ.' - ) - parser.add_argument( - '--seed', type=int, default=0, - help='Seed for sampling the calibration data.' - ) - parser.add_argument( - '--nearest', action='store_true', - help='Whether to run the round-to-nearest quantization.' - ) - parser.add_argument( - '--groupsize', type=int, default=-1, - help='Groupsize for fine-grained quantization; default uses full row.' 
- ) - - parser.add_argument( - '--no-eval', action='store_true', - help='Whether to evaluate model on WikiText-2, PTB and C4' - ) - parser.add_argument( - '--save', type=str, default='', - help='Save quantized checkpoint under this name.' - ) - parser.add_argument( - '--load', type=str, default='', - help='Load fake or 3bit quantized checkpoint.' - ) - parser.add_argument( - '--logfile', type=str, default='', - help='Logging file name' - ) - parser.add_argument( - '--packing', action='store_true', - help='Whether to save 3bit quantized model.' - ) - parser.add_argument( - '--faster-kernel', action='store_true', - help='Whether to save and load 3bit quantized model using the faster kernel for benchmarking.' - ) - parser.add_argument( - '--benchmark', type=int, default=0, - help='Number of tokens to use for benchmarking.' - ) - - parser.add_argument( - '--old-eval', action='store_true', - help='Whether to use the old version of PTB and C4 evaluation.' - ) - parser.add_argument( - '--act-order', action='store_true', - help='Whether to apply the activation order GPTQ heuristic' - ) - parser.add_argument( - '--true-sequential', action='store_true', - help='Whether to run in true sequential model.' - ) - - args = parser.parse_args() - check_arguments(args) - device = torch.device('cuda:0') - - def seed_all(seed): - random.seed(seed) - os.environ['PYTHONHASHSEED'] = str(seed) - np.random.seed(seed) - torch.manual_seed(seed) - seed_all(args.seed) - - t = 0 - if args.load: - print(f"Loading {args.load} ....") - if args.packing: - model = load_quant3(args.model, args.load, args.faster_kernel) - else: - model = get_llama(args.model) - model.load_state_dict(torch.load(args.load)) - model.eval() - print("Done.") - else: - model = get_llama(args.model) - model.eval() - - if args.wbits < 16 and not args.nearest: - dataloader = get_loaders( - args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen, train=True - ) - tick = time.time() - quantizers = llama_sequential(model, dataloader, device) - t = round((time.time() - tick),1) - print(f"Running Time : {t}") - - if args.benchmark: - dataloader = get_loaders( - args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen, train=False - ) - gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] - if len(gpus) > 1: - llama_multigpu(model, gpus) - else: - model = model.to(device) - if args.benchmark: - input_ids = dataloader.input_ids[:, :args.benchmark] - benchmark(model, input_ids) - exit() - - t1 = time.time() - ppl_scores = [] - if not args.no_eval: - if args.old_eval: - ppl_tasks = ['wikitext2', 'ptb', 'c4'] - else: - ppl_tasks = ['wikitext2','ptb-new', 'c4-new'] - for dataset in ppl_tasks: - testloader = get_loaders( - dataset, seed=args.seed, model=args.model, seqlen=model.seqlen, train=False - ) - print(dataset) - ppl_score = llama_eval(model, testloader, device) - ppl_scores.append((dataset,ppl_score)) - t2 = time.time() - t1 - - if args.logfile: - with open(f'{args.logfile}','a') as fp: - add_str = f"| layers : {args.layers}" + f"| target_bit : {args.target_bit}\n" if args.target_bit is not None else '\n' - fp.write(f"model : {args.model} | owq time : {round(t/60,1)}m / eval time : {round(t2/60,1)}m | seed : {args.seed} {add_str}") - for i in range(len(ppl_scores)): - fp.write(f"{ppl_scores[i][1]} ") - fp.write(f"\n") - - if args.save: - torch.save(model.state_dict(), args.save) - print(f"fake quantized model is saved to {args.save}") - if args.packing and 
args.wbits == 3: - temp = args.save.split('/') - temp[-1] = 'pack3_' + f"{'faster_' if args.faster_kernel else ''}" + temp[-1] - ckpt_path = '/'.join(temp) - n_out_dict = {n: n_out_saver(quantizers[n].n_out) for n in quantizers} - lm_pack3(model, quantizers, faster=args.faster_kernel) - torch.save({ - 'model_state_dict' : model.state_dict(), - 'n_out_dict' : n_out_dict}, ckpt_path) - print(f"3bit quantized model is saved to {ckpt_path}") - else: - print("Only 3bits quantized model is supported") \ No newline at end of file diff --git a/main.py b/main.py index cb4bd84..034bf4e 100644 --- a/main.py +++ b/main.py @@ -15,7 +15,6 @@ @torch.no_grad() def layerwise_quantize(model, dataloader, dev, args): - # assert args.no_frob_norm == True meta = args.meta print('Starting ...') @@ -78,7 +77,7 @@ def forward(self, inp, **kwargs): # r = (args.target_bit - args.wbits) * 16 / 12 r /= n_owq_layers - layer = find_layers(layers[0], layers=[nn.Linear]) + layer = find_layers(layers[0]) for l in owq_layers: # for even number of n_out @@ -92,7 +91,7 @@ def forward(self, inp, **kwargs): quantizers = {} for i in range(len(layers)): layer = layers[i].to(dev) - block_layers = find_layers(layer, layers=[nn.Linear]) + block_layers = find_layers(layer) if args.true_sequential: sequential = meta['sequential'] @@ -226,7 +225,7 @@ def forward(self, inp, **kwargs): layer = layers[i].to(dev) if args.nearest: - subset = find_layers(layer, layers=args.meta['linears']) + subset = find_layers(layer) for name in subset: quantizer = Quantizer(args.wbits, perchannel=True, sym=args.sym, mse=False) W = subset[name].weight.data @@ -290,7 +289,7 @@ def forward(self, *inp, **kwargs): if inp[0].device != self.dev: inp[0] = inp[0].to(self.dev) for key in meta['inp_kwargs']: - if kwargs[key].device != self.dev: + if kwargs[key] != None and kwargs[key].device != self.dev: kwargs[key] = kwargs[key].to(self.dev) tmp = self.module(*inp, **kwargs) return tmp @@ -495,7 +494,7 @@ def sync(): # benchmark if args.benchmark: dataloader = get_loaders( - args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=args.seqlen, train=True + args.dataset, nsamples=1, seed=args.seed, model=args.model, seqlen=args.seqlen, train=True ) gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] if len(gpus) > 1: @@ -514,7 +513,7 @@ def sync(): t1 = time.time() ppl_scores = [] if not args.no_eval: - ppl_tasks = ['wikitext2','ptb-new', 'c4-new'] + ppl_tasks = ['wikitext2','ptb', 'c4'] for dataset in ppl_tasks: testloader = get_loaders( dataset, seed=args.seed, model=args.model, seqlen=args.seqlen, train=False diff --git a/model_config.json b/model_config.json index 35ec950..260854b 100644 --- a/model_config.json +++ b/model_config.json @@ -74,7 +74,7 @@ }, "falcon":{ "map_layer":{"qkv":"self_attention.query_key_value","dense":"self_attention.dense","fc1":"mlp.dense_h_to_4h","fc2":"mlp.dense_4h_to_h"}, - "ratios":{"self_attention.query_key_value":3,"self_attention.dense":1,"mlp.dense_h_to_4h":0.25,"mlp.dense_4h_to_h":0.25}, + "ratios":{"self_attention.query_key_value":1,"self_attention.dense":1,"mlp.dense_h_to_4h":0.25,"mlp.dense_4h_to_h":0.25}, "sequential":[ ["self_attention.query_key_value"], ["self_attention.dense"], diff --git a/opt.py b/opt.py deleted file mode 100644 index 396e1e9..0000000 --- a/opt.py +++ /dev/null @@ -1,586 +0,0 @@ -import time - -import torch -import torch.nn as nn - -import transformers - -from owq.recon import GPTQ_OWQ -from owq.quant import * -from owq.utils.misc import find_layers, 
check_arguments -from owq.utils.datautils import * - -import argparse -import random -import os -import numpy as np -from tqdm import tqdm - -layer_list = ['k','v','q','out','fc1','fc2'] -n_out_dict = {'self_attn.k_proj':0, - 'self_attn.v_proj':0, - 'self_attn.q_proj':0, - 'self_attn.out_proj':0, - 'fc1':0, 'fc2':0 } - -def get_opt(model): - import torch - def skip(*args, **kwargs): - pass - torch.nn.init.kaiming_uniform_ = skip - torch.nn.init.uniform_ = skip - torch.nn.init.normal_ = skip - from transformers import OPTForCausalLM - model = OPTForCausalLM.from_pretrained(model, torch_dtype='auto') - model.seqlen = model.config.max_position_embeddings - return model - -@torch.no_grad() -def opt_sequential(model, dataloader, dev): - print('Starting ...') - - use_cache = model.config.use_cache - model.config.use_cache = False - layers = model.model.decoder.layers - - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) - model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev) - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.to(dev) - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.to(dev) - layers[0] = layers[0].to(dev) - - dtype = next(iter(model.parameters())).dtype - inps = torch.zeros( - (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev - ) - - cache = {'i': 0, 'attention_mask': None} - - class Catcher(nn.Module): - def __init__(self, module): - super().__init__() - self.module = module - def forward(self, inp, **kwargs): - inps[cache['i']] = inp - cache['i'] += 1 - cache['attention_mask'] = kwargs['attention_mask'] - raise ValueError - - layers[0] = Catcher(layers[0]) - for batch in dataloader: - try: - model(batch[0].to(dev)) - except ValueError: - pass - layers[0] = layers[0].module - layers[0] = layers[0].cpu() - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu() - model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu() - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.cpu() - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.cpu() - torch.cuda.empty_cache() - - outs = torch.zeros_like(inps) - attention_mask = cache['attention_mask'] - - print('Ready.') - - if args.target_bit is not None: - args.layers = layer_list if args.layers is None else args.layers - n_mp_layers = len(args.layers) - - r = (12 / (16 - args.wbits)) * (args.target_bit - args.wbits) - # r = (args.target_bit - args.wbits) * 16 / 12 - r /= n_mp_layers - - layer = find_layers(layers[0]) - - for i in range(len(args.layers)): - if args.layers[i] in ('k','v','q','out'): - name = 'self_attn.' 
+ args.layers[i] + '_proj' - n_out_dict[name] = round(layer[name].weight.data.shape[1] * r) - else: - name = args.layers[i] - n_out_dict[name] = round(layer[name].weight.data.shape[1] * r / 4) - - quantizers = {} - for i in range(len(layers)): - layer = layers[i].to(dev) - block_layers = find_layers(layer) - - if args.true_sequential: - sequential = [ - ['self_attn.k_proj', 'self_attn.v_proj', 'self_attn.q_proj'], - ['self_attn.out_proj'], - ['fc1'], - ['fc2'] - ] - else: - sequential = [list(block_layers.keys())] - - for names in sequential: - subset = {n: block_layers[n] for n in names} - - gptq = {} - for name in subset: - gptq[name] = GPTQ_OWQ(subset[name], n_out=n_out_dict[name]) - gptq[name].quantizer = Quantizer() - gptq[name].quantizer.configure( - args.wbits, perchannel=True, sym=False, mse=(args.tuning == 'mse') - ) - gptq[name].quantizer.n_out = n_out_dict[name] - - def add_batch(name): - def tmp(_, inp, out): - gptq[name].add_batch(inp[0].data, out.data) - return tmp - handles = [] - for name in subset: - handles.append(subset[name].register_forward_hook(add_batch(name))) - for j in range(args.nsamples): - layer(inps[j].unsqueeze(0), attention_mask=attention_mask) - for h in handles: - h.remove() - - for name in subset: - if not args.no_frob_norm: - W = subset[name].weight.data.clone().to(torch.float) - temp_quantizer = Quantizer() - temp_quantizer.configure(args.wbits, perchannel=True, sym=False, mse=(args.tuning == 'mse')) - temp_quantizer.find_params(W, weight=True, num=40) - W_quant = temp_quantizer.quantize(W) - frob_norm_error = (W - W_quant).pow(2).sum(dim=0) - else: - frob_norm_error = None - out_ids = gptq[name].hessian_sorting(actorder=args.act_order, frob_norm=frob_norm_error) - gptq[name].quantizer.out_ids = out_ids.cpu() - - if not args.no_frob_norm: - del W - del W_quant - del temp_quantizer - torch.cuda.empty_cache() - - for name in subset: - print(f"Quantizing model.decoder.layers.{i}.{name}") - gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) - quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer.cpu() - gptq[name].free() - - for j in range(args.nsamples): - outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] - - layers[i] = layer.cpu() - del layer - del gptq - torch.cuda.empty_cache() - - inps, outs = outs, inps - - model.config.use_cache = use_cache - - return quantizers - -@torch.no_grad() -def opt_eval(model, testenc, dev): - print('Evaluating ...') - - testenc = testenc.input_ids - nsamples = testenc.numel() // model.seqlen - - use_cache = model.config.use_cache - model.config.use_cache = False - layers = model.model.decoder.layers - - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) - model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev) - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.to(dev) - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.to(dev) - layers[0] = layers[0].to(dev) - - dtype = next(iter(model.parameters())).dtype - inps = torch.zeros( - (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev - ) - cache = {'i': 0, 'attention_mask': None} - - class Catcher(nn.Module): - def __init__(self, module): - super().__init__() - self.module = module - def forward(self, inp, 
-            inps[cache['i']] = inp
-            cache['i'] += 1
-            cache['attention_mask'] = kwargs['attention_mask']
-            raise ValueError
-    layers[0] = Catcher(layers[0])
-    for i in range(nsamples):
-        batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev)
-        try:
-            model(batch)
-        except ValueError:
-            pass
-    layers[0] = layers[0].module
-
-    layers[0] = layers[0].cpu()
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
-    model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
-    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
-        model.model.decoder.project_out = model.model.decoder.project_out.cpu()
-    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
-        model.model.decoder.project_in = model.model.decoder.project_in.cpu()
-    torch.cuda.empty_cache()
-
-    outs = torch.zeros_like(inps)
-    attention_mask = cache['attention_mask']
-
-    for i in tqdm(range(len(layers))):
-        layer = layers[i].to(dev)
-
-        if args.nearest:
-            subset = find_layers(layer)
-            for name in subset:
-                quantizer = Quantizer()
-                quantizer.configure(
-                    args.wbits, perchannel=True, sym=False, mse=False
-                )
-                W = subset[name].weight.data
-                quantizer.find_params(W, weight=True)
-                subset[name].weight.data = quantize(
-                    W, quantizer.scale, quantizer.zero, quantizer.maxq
-                ).to(next(iter(layer.parameters())).dtype)
-
-        for j in range(nsamples):
-            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
-        layers[i] = layer.cpu()
-        del layer
-        torch.cuda.empty_cache()
-        inps, outs = outs, inps
-
-    if model.model.decoder.final_layer_norm is not None:
-        model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(dev)
-    if model.model.decoder.project_out is not None:
-        model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
-    model.lm_head = model.lm_head.to(dev)
-
-    testenc = testenc.to(dev)
-    nlls = []
-    for i in range(nsamples):
-        hidden_states = inps[i].unsqueeze(0)
-        if model.model.decoder.final_layer_norm is not None:
-            hidden_states = model.model.decoder.final_layer_norm(hidden_states)
-        if model.model.decoder.project_out is not None:
-            hidden_states = model.model.decoder.project_out(hidden_states)
-        lm_logits = model.lm_head(hidden_states)
-        shift_logits = lm_logits[:, :-1, :].contiguous()
-        shift_labels = testenc[
-            :, (i * model.seqlen):((i + 1) * model.seqlen)
-        ][:, 1:]
-        loss_fct = nn.CrossEntropyLoss()
-        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
-        neg_log_likelihood = loss.float() * model.seqlen
-        nlls.append(neg_log_likelihood)
-    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
-    print(ppl.item())
-
-    model.config.use_cache = use_cache
-    return ppl.item()
-
-def load_quant3(model, checkpoint, faster=False):
-    from transformers import OPTConfig, OPTForCausalLM
-    config = OPTConfig.from_pretrained(model)
-    def noop(*args, **kwargs):
-        pass
-    torch.nn.init.kaiming_uniform_ = noop
-    torch.nn.init.uniform_ = noop
-    torch.nn.init.normal_ = noop
-
-    torch.set_default_dtype(torch.half)
-    transformers.modeling_utils._init_weights = False
-    torch.set_default_dtype(torch.half)
-    model = OPTForCausalLM(config)
-    torch.set_default_dtype(torch.float)
-    model = model.eval()
-    layers = find_layers(model)
-    for name in ['model.decoder.project_out', 'model.decoder.project_in', 'lm_head']:
-        if name in layers:
-            del layers[name]
-
-    ckpt = torch.load(checkpoint)
-    n_out_dict = ckpt['n_out_dict']
-
-    make_quant3(model, n_out_dict, faster=faster)
-
-    model.load_state_dict(ckpt['model_state_dict'])
-    model.seqlen = model.config.max_position_embeddings
-
-    return model
-
-def opt_multigpu(model, gpus):
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(gpus[0])
-    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(gpus[0])
-    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
-        model.model.decoder.project_in = model.model.decoder.project_in.to(gpus[0])
-    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
-        model.model.decoder.project_out = model.model.decoder.project_out.to(gpus[-1])
-    if hasattr(model.model.decoder, 'final_layer_norm') and model.model.decoder.final_layer_norm:
-        model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(gpus[-1])
-    import copy
-    import math
-    model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1])
-
-    cache = {'mask': None}
-
-    class MoveModule(nn.Module):
-        def __init__(self, module):
-            super().__init__()
-            self.module = module
-            self.dev = next(iter(self.module.parameters())).device
-        def forward(self, *inp, **kwargs):
-            inp = list(inp)
-            if inp[0].device != self.dev:
-                inp[0] = inp[0].to(self.dev)
-            if cache['mask'] is None or cache['mask'].device != self.dev:
-                cache['mask'] = kwargs['attention_mask'].to(self.dev)
-            kwargs['attention_mask'] = cache['mask']
-            tmp = self.module(*inp, **kwargs)
-            return tmp
-
-    layers = model.model.decoder.layers
-    pergpu = math.ceil(len(layers) / len(gpus))
-    for i in range(len(layers)):
-        layers[i] = MoveModule(layers[i].to(gpus[i // pergpu]))
-
-    model.gpus = gpus
-
-def benchmark(model, input_ids):
-    dev = torch.device('cuda:0')
-    input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else dev)
-    torch.cuda.synchronize()
-
-    cache = {'past': None}
-    def clear_past(i):
-        def tmp(layer, inp, out):
-            if cache['past']:
-                cache['past'][i] = None
-        return tmp
-    for i, layer in enumerate(model.model.decoder.layers):
-        layer.register_forward_hook(clear_past(i))
-
-    print('Benchmarking ...')
-
-    loss = nn.CrossEntropyLoss()
-    tot = 0.
-
-    def sync():
-        if hasattr(model, 'gpus'):
-            for gpu in model.gpus:
-                torch.cuda.synchronize(gpu)
-        else:
-            torch.cuda.synchronize()
-    with torch.no_grad():
-        attention_mask = torch.ones((1, input_ids.numel()), device=dev)
-        times = []
-        for i in range(input_ids.numel()):
-            tick = time.time()
-            out = model(input_ids[:, i].reshape(1,-1),past_key_values=cache['past'],attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1)))
-            sync()
-            times.append(time.time() - tick)
-            if i != input_ids.numel() - 1:
-                tot += loss(out.logits[0].to(dev), input_ids[:, (i + 1)].to(dev)).float()
-            cache['past'] = list(out.past_key_values)
-            del out
-        sync()
-
-        print('Median:', np.median(times))
-        print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item())
-
-if __name__ == '__main__':
-
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        'model', type=str,
-        help='OPT model to load; pass `facebook/opt-X`.'
-    )
-    parser.add_argument(
-        'dataset', type=str, choices=['wikitext2', 'ptb', 'c4'],
-        help='Where to extract calibration data from.'
-    )
-    parser.add_argument(
-        '--nsamples', type=int, default=128,
-        help='Number of calibration data samples.'
-    )
-    parser.add_argument(
-        '--wbits', type=int, default=16, choices=[2, 3, 4, 16],
-        help='The number of bits to use for weight quantization; use 16 for evaluating base model.'
-    )
-    parser.add_argument(
-        '--target_bit', type=float, default=None,
-        help='Effctive target bits for OWQ.'
-    )
-    parser.add_argument(
-        '--tuning', type=str, default='mse', choices=['mse', 'minmax'],
-        help='Method for quantization parameter tuning.'
-    )
-    parser.add_argument(
-        '--no_frob_norm', action='store_true',
-        help='Whether to use Frobenius norm for OWQ.'
-    )
-    parser.add_argument(
-        '--percdamp', type=float, default=.01,
-        help='Percent of the average Hessian diagonal to use for dampening.'
-    )
-    parser.add_argument(
-        '--layers', nargs='+', type=str, default=None, choices=layer_list,
-        help='Layers to apply OWQ.'
-    )
-    parser.add_argument(
-        '--seed', type=int, default=0,
-        help='Seed for sampling the calibration data.'
-    )
-    parser.add_argument(
-        '--nearest', action='store_true',
-        help='Whether to run the round-to-nearest quantization.'
-    )
-    parser.add_argument(
-        '--groupsize', type=int, default=-1,
-        help='Groupsize for fine-grained quantization; default uses full row.'
-    )
-
-    parser.add_argument(
-        '--no-eval', action='store_true',
-        help='Whether to evaluate model on WikiText-2, PTB and C4'
-    )
-    parser.add_argument(
-        '--save', type=str, default='',
-        help='Save quantized checkpoint under this name.'
-    )
-    parser.add_argument(
-        '--load', type=str, default='',
-        help='Load fake or 3bit quantized checkpoint.'
-    )
-    parser.add_argument(
-        '--logfile', type=str, default='',
-        help='Logging file name'
-    )
-    parser.add_argument(
-        '--packing', action='store_true',
-        help='Whether to save 3bit quantized model.'
-    )
-    parser.add_argument(
-        '--faster-kernel', action='store_true',
-        help='Whether to save and load 3bit quantized model using the faster kernel for benchmarking.'
-    )
-    parser.add_argument(
-        '--benchmark', type=int, default=0,
-        help='Number of tokens to use for benchmarking.'
-    )
-
-    parser.add_argument(
-        '--old-eval', action='store_true',
-        help='Whether to use the old version of PTB and C4 evaluation.'
-    )
-    parser.add_argument(
-        '--act-order', action='store_true',
-        help='Whether to apply the activation order GPTQ heuristic'
-    )
-    parser.add_argument(
-        '--true-sequential', action='store_true',
-        help='Whether to run in true sequential model.'
-    )
-
-    args = parser.parse_args()
-    check_arguments(args)
-
-    device = torch.device('cuda:0')
-
-    def seed_all(seed):
-        random.seed(seed)
-        os.environ['PYTHONHASHSEED'] = str(seed)
-        np.random.seed(seed)
-        torch.manual_seed(seed)
-    seed_all(args.seed)
-
-    t = 0
-    if args.load:
-        print(f"Loading {args.load} ....")
-        if args.packing:
-            model = load_quant3(args.model, args.load, args.faster_kernel)
-        else:
-            model = get_opt(args.model)
-            model.load_state_dict(torch.load(args.load))
-            model.eval()
-        print("Done.")
-    else:
-        model = get_opt(args.model)
-        model.eval()
-
-        if args.wbits < 16 and not args.nearest:
-            dataloader = get_loaders(
-                args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen, train=True
-            )
-            tick = time.time()
-            quantizers = opt_sequential(model, dataloader, device)
-            t = round((time.time() - tick),1)
-            print(f"Running Time : {t}")
-
-    if args.benchmark:
-        dataloader = get_loaders(
-            args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen, train=False
-        )
-        gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
-        if len(gpus) > 1:
-            opt_multigpu(model, gpus)
-        else:
-            model = model.to(device)
-        if args.benchmark:
-            input_ids = dataloader.input_ids[:, :args.benchmark]
-            benchmark(model, input_ids)
-            exit()
-
-    t1 = time.time()
-    ppl_scores = []
-    if not args.no_eval:
-        if args.old_eval:
-            ppl_tasks = ['wikitext2', 'ptb', 'c4']
-        else:
-            ppl_tasks = ['wikitext2','ptb-new', 'c4-new']
-        for dataset in ppl_tasks:
-            testloader = get_loaders(
-                dataset, seed=args.seed, model=args.model, seqlen=model.seqlen, train=False
-            )
-            print(dataset)
-            ppl_score = opt_eval(model, testloader, device)
-            ppl_scores.append((dataset,ppl_score))
-    t2 = time.time() - t1
-
-    if args.logfile:
-        with open(f'{args.logfile}','a') as fp:
-            add_str = f"\nlayers : {args.layers}" + f"| target_bit : {args.target_bit}\n" if args.target_bit is not None else '\n'
-            fp.write(f"model : {args.model} | owq time : {round(t/60,1)}m / eval time : {round(t2/60,1)}m | seed : {args.seed} {add_str}")
-            for i in range(len(ppl_scores)):
-                fp.write(f"{ppl_scores[i][1]} ")
-            fp.write(f"\n\n")
-
-    if args.save:
-        torch.save(model.state_dict(), args.save)
-        print(f"fake quantized model is saved to {args.save}")
-        if args.packing and args.wbits == 3:
-            temp = args.save.split('/')
-            temp[-1] = 'pack3_' + f"{'faster_' if args.faster_kernel else ''}" + temp[-1]
-            ckpt_path = '/'.join(temp)
-            n_out_dict = {n: n_out_saver(quantizers[n].n_out) for n in quantizers}
-            lm_pack3(model, quantizers, faster=args.faster_kernel)
-            torch.save({
-                'model_state_dict' : model.state_dict(),
-                'n_out_dict' : n_out_dict}, ckpt_path)
-            print(f"3bit quantized model is saved to {ckpt_path}")
-        else:
-            print("Only 3bits quantized model is supported")
diff --git a/owq/kernel/setup_cuda.py b/owq/kernel/setup_cuda.py
index 98d3263..eaef4fd 100644
--- a/owq/kernel/setup_cuda.py
+++ b/owq/kernel/setup_cuda.py
@@ -1,10 +1,31 @@
 from setuptools import setup, Extension
 from torch.utils import cpp_extension
 
+extra_compile_args = {
+    "cxx": [
+        "-g",
+        "-O3",
+        "-fopenmp",
+        "-lgomp",
+        "-std=c++17",
+    ],
+    "nvcc": [
+        "-O3",
+        "-std=c++17",
+        "--expt-relaxed-constexpr",
+        "--expt-extended-lambda",
+        "--use_fast_math",
+        "--threads=8"
+    ],
+}
+
 setup(
     name='owq_cuda',
     ext_modules=[cpp_extension.CUDAExtension(
-        'owq_cuda', ['owq_cuda.cpp', 'gemv.cu', 'dequant.cu']
+        name = 'owq_cuda',
+        sources = ['owq_cuda.cpp', 'gemv.cu', 'dequant.cu'],
+        extra_compile_args=extra_compile_args,
     )],
-    cmdclass={'build_ext': cpp_extension.BuildExtension}
-)
+    cmdclass={'build_ext': cpp_extension.BuildExtension},
+    install_requires = ["torch"],
+)
\ No newline at end of file
diff --git a/owq/kernel/test_kernel.py b/owq/kernel/test_kernel.py
index f2c1805..0ca6b5d 100644
--- a/owq/kernel/test_kernel.py
+++ b/owq/kernel/test_kernel.py
@@ -132,8 +132,8 @@ def correctness(M=4*12288, N=12288, bits=3, outlieridx=[], faster=False):
 
 if __name__=="__main__":
     bits=3
-    for d, model in zip([4096],['opt-6.7b']): # opt
-    # for d, model in zip([12288],['opt-175b']): # opt
+    for d, model in zip([4096],['opt-6.7b']): # opt-6.7b
+    # for d, model in zip([12288],['opt-175b']): # opt-175b
         n = 6
         print(f'Benchmarking {model.upper()} matvec with outlier ...')
         for M,N in [[d,d],[d,d*4],[d*4,d]]:
diff --git a/owq/quant.py b/owq/quant.py
index 1ee2e2f..83e226e 100644
--- a/owq/quant.py
+++ b/owq/quant.py
@@ -1,6 +1,7 @@
 import numpy as np
 import torch
 import torch.nn as nn
+from transformers.models.falcon.modeling_falcon import FalconLinear
 
 try:
     import owq_cuda
@@ -200,7 +201,7 @@ def make_quant(module, n_out_infos, wbits, name=''):
     for name1, child in module.named_children():
         make_quant(child, n_out_infos, wbits, name + '.' + name1 if name != '' else name1)
 
-def lm_pack(model, quantinfos, wbits, linears=[nn.Linear]):
+def lm_pack(model, quantinfos, wbits, linears=[nn.Linear, FalconLinear]):
     from owq.utils.misc import find_layers
     layers = find_layers(model, linears)
     layers = {n: layers[n] for n in quantinfos}
diff --git a/owq/utils/misc.py b/owq/utils/misc.py
index 3e2f6b5..287e96c 100644
--- a/owq/utils/misc.py
+++ b/owq/utils/misc.py
@@ -1,10 +1,11 @@
 import torch
 import torch.nn as nn
 import math
+from transformers.models.falcon.modeling_falcon import FalconLinear
 
 layer_list = ['q','k','v','qkv','o','out','dense','fc1','fc2','up','gate','down']
 
-def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
+def find_layers(module, layers=[nn.Linear, FalconLinear], name=''):
     if type(module) in layers:
         return {name: module}
     res = {}
diff --git a/requirements.txt b/requirements.txt
index 2dad1e3..d16d45a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,8 @@
 # torch==2.0.0
 transformers
 datasets
+accelerate
+peft
 sacrebleu
 sqlitedict
 scikit-learn